0% found this document useful (0 votes)
32 views7 pages

Big Data Merged

Download as pdf or txt
Download as pdf or txt
Download as pdf or txt
You are on page 1/ 7

9/23/22, 9:17 PM Untitled13 - Jupyter Notebook

In [ ]:

# Experiment 1: data preprocessing on the housing-price dataset.
import numpy as np
import pandas as pd

# Load the raw CSV into `df`, the frame the following cells inspect and clean.
df = pd.read_csv('housing_price.csv')

In [ ]:

df.describe() #dataset description

In [ ]:

print(df.shape,df.size,df.ndim)

In [ ]:

df.isnull().sum() #null values count

In [ ]:

df.columns

In [ ]:

df.columns.size #number of columns

In [ ]:

df.head(10)

In [ ]:

# Treat every literal 0 in the frame as a missing value.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace(0, np.nan)

In [ ]:

df.isnull().sum()

In [ ]:

df.head(10)

In [ ]:

# Keep only the rows that have at least 20 non-NaN values: `thresh` counts
# non-missing cells, not missing ones.
# NOTE(review): this means "at most one NaN per row" only if the frame has
# 21 columns — confirm against df.shape.
df=df.dropna(thresh=20)

In [ ]:

df.isnull().sum()

localhost:8888/notebooks/Untitled13.ipynb?kernel_name=python3 1/7
9/23/22, 9:17 PM Untitled13 - Jupyter Notebook

In [ ]:

# Impute missing numeric values with the mean of their column.
# numeric_only=True is required: since pandas 2.0, DataFrame.mean() raises a
# TypeError on object (string) columns instead of silently skipping them.
# Non-numeric columns have no mean, so their NaNs are left untouched here.
df = df.fillna(value=df.mean(numeric_only=True))

In [ ]:

df.isnull().sum() #council dtype is object

In [ ]:

df=df.dropna()

In [ ]:

df.isnull().sum()

In [ ]:

df.dtypes.value_counts()

In [ ]:

print(df.shape,df.size,df.ndim)

In [ ]:

# The last column is the target; every column before it is a feature.
y = df.iloc[:, -1]
x = df.iloc[:, :-1]

In [ ]:

In [ ]:

In [ ]:

from sklearn.model_selection import train_test_split


# Split features and target into training and test parts; train_size=0.8 requests
# an 80% training share.
# NOTE(review): the call below is cut off after `test_size=` in this export, so the
# remaining arguments and the closing parenthesis are not visible — restore them
# from the original notebook before running.
x_train, x_test, y_train, y_test = train_test_split(x, y,train_size=0.8,test_size=
# Report the realised train/test proportions as percentages of all rows.
print("Train size :",(len(x_train)/len(x))*100)
print("Test size :",(len(x_test)/len(x))*100)

In [ ]:

#exp-2 twitterdataset
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
df=pd.read_csv("tweet.csv")
df=df.drop(labels=["id","label"],axis=1)
df.head()

localhost:8888/notebooks/Untitled13.ipynb?kernel_name=python3 2/7
9/23/22, 9:17 PM Untitled13 - Jupyter Notebook

In [ ]:

def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression describing the spans to delete.

    Returns:
        The text with all non-overlapping matches of ``pattern`` replaced by ''.
    """
    # Substitute with the pattern itself. The previous version re-used each
    # matched string as a new regex, so matches containing metacharacters
    # (e.g. "+" or "(") were misinterpreted, and a short match such as a bare
    # "@" was stripped out of longer matches, leaving their remainder behind.
    return re.sub(pattern, '', input_txt)

In [ ]:

# Strip @handles from every tweet and store the result in a new 'new' column.
# The pattern is a raw string so that `\w` reaches the regex engine unchanged; in a
# plain literal it is an invalid escape sequence that newer Python versions warn about.
df['new'] = np.vectorize(remove_pattern)(df['tweet'], r"@[\w]*")
df.head()

In [ ]:

def _drop_dotcom_words(text):
    """Rejoin the whitespace-separated words of ``text`` that do not contain '.com'."""
    return ' '.join(word for word in text.split() if '.com' not in word)


df['new'] = df['new'].apply(_drop_dotcom_words)
df.head()

In [ ]:

# Replace every character that is not an ASCII letter with a space.
# regex=True must be passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would look for the
# literal text "[^a-zA-Z]" and leave the tweets unchanged.
df['new'] = df['new'].str.replace("[^a-zA-Z]", " ", regex=True)
df.head()

In [ ]:

import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt' resource before tokenizing.
nltk.download('punkt')

# Lower-case each cleaned tweet and split it into a list of word tokens.
tokenized = df['new'].apply(lambda text: word_tokenize(text.lower()))
tf = pd.DataFrame({'tokens': tokenized})
tf.head()

In [ ]:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_all(words):
    """Return the Porter stem of every token in ``words``."""
    return [stemmer.stem(word) for word in words]


tf['tokens'] = tf['tokens'].apply(_stem_all)
tf.head()

In [ ]:

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Assign the result back: Series.apply returns a new Series, so without the
# assignment the lemmatized tokens were computed and then thrown away.
tf['tokens'] = tf['tokens'].apply(lambda x: [lemmatizer.lemmatize(i) for i in x])
tf.head()

In [ ]:

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words as a set for fast membership checks.
# NOTE(review): this cell runs after the stemming cell, so stemmed forms may no
# longer match entries of the stop-word list — confirm the intended order.
stop_words = set(stopwords.words('english'))


def _without_stop_words(words):
    """Keep only the tokens of ``words`` that are not in ``stop_words``."""
    return [word for word in words if word not in stop_words]


tf['tokens'] = tf['tokens'].apply(_without_stop_words)
tf.head()

localhost:8888/notebooks/Untitled13.ipynb?kernel_name=python3 3/7
9/23/22, 9:17 PM Untitled13 - Jupyter Notebook

In [ ]:

def _join_long_words(words):
    """Join the tokens longer than three characters into one space-separated string."""
    return ' '.join(filter(lambda word: len(word) > 3, words))


tf['tokens'] = tf['tokens'].apply(_join_long_words)
tf.head()

In [ ]:

# Rows whose token string is empty are marked as missing and dropped.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0; the frame
# is rebound instead of mutated in place so the cell reads as one expression.
tf = tf.replace('', np.nan).dropna(axis=0)
tf.head()

In [ ]:

from nltk.tokenize import word_tokenize

# Flatten every row's token string into one list of words.
tokens = []
for row_text in tf['tokens'].tolist():
    tokens.extend(word_tokenize(row_text))

In [ ]:

print(tokens)

In [ ]:

from collections import Counter

# Count every token once up front; calling tokens.count() inside the loop
# rescanned the whole list for each distinct word (quadratic time).
token_counts = Counter(tokens)

# Most popular words: those occurring more than 500 times. Words are visited in
# first-appearance order (the previous set iteration had no defined order).
mpw = []
for word, count in token_counts.items():
    if count > 500:
        mpw.append(word)
        print(word, count)

In [ ]:

# Bar chart of the most popular words against their occurrence counts.
word_frequencies = list(map(tokens.count, mpw))
plt.bar(mpw, word_frequencies)

In [ ]:

#exp-3 ML algorithms
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

localhost:8888/notebooks/Untitled13.ipynb?kernel_name=python3 4/7
9/23/22, 9:17 PM Untitled13 - Jupyter Notebook

In [ ]:

social = pd.read_csv("Social_Network_Ads.csv")
# Features are the columns at positions 2 and 3; the target is the last column.
X = social.iloc[:, [2, 3]].values
y = social.iloc[:, -1].values

# Seed the split so the reported metrics are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Linear-kernel SVM. The stray `SVC(C=10.0, ...)` line that followed fit() was the
# estimator's printed repr pasted in as code and has been removed.
svc = SVC(kernel='linear', C=10.0, random_state=1)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw and annotate the matrix in the same cell that creates the figure, so the
# counts and labels appear on the rendered image rather than on a later, separate
# figure. The former `plt.plot(y_test, y_pred)` call was dropped: a line through the
# label pairs carries no information on a confusion-matrix plot.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [ ]:

# Report the classification metrics to three decimal places.
print('Precision: %.3f' % precision_score(y_test, y_pred))
print('Recall: %.3f' % recall_score(y_test, y_pred))
# '%.3f' (three decimals) was intended; '%3f' only sets a minimum field width of 3.
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
# Label corrected from 'fl_score' (letter l) to 'f1_score'.
print('f1_score: %.3f' % f1_score(y_test, y_pred))

In [ ]:

#exp-4 correlation techniques


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [ ]:

# Load the salary data: every column but the last is a feature, the last is the target.
dataset = pd.read_csv('Salary_Data.csv')
features, target = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X = features.values
y = target.values

In [ ]:

from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state =1)

localhost:8888/notebooks/Untitled13.ipynb?kernel_name=python3 5/7
9/23/22, 9:17 PM Untitled13 - Jupyter Notebook

In [ ]:

from sklearn.linear_model import LinearRegression


regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [ ]:

y_pred = regressor.predict(X_test)

In [ ]:

# Training points with the fitted regression line drawn over them.
plt.scatter(X_train, y_train, color='black')
fitted_train = regressor.predict(X_train)
plt.plot(X_train, fitted_train, color='red')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

In [ ]:

from numpy import cov
from scipy.stats import pearsonr

# Compare the held-out targets with the model's predictions.
covariance = cov(y_test, y_pred)
# pearsonr returns a pair; only its first element (the coefficient) is printed.
corr, _ = pearsonr(y_test, y_pred)
print('covariance:', covariance)
print('Pearsons correlation: %.3f' % corr)

In [7]:

#exp-5 classification algo


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:

# Load the ads dataset; every column except the last becomes a feature and the
# last column is the target.
# NOTE(review): these features are later passed to MinMaxScaler, so this assumes all
# feature columns are numeric. The exp-3 cell selects only columns [2, 3] from a file
# of the same name — confirm which columns are intended here.
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [9]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Seeded split so the scores below are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the scaler on the training split, then apply the same scaling to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

localhost:8888/notebooks/Untitled13.ipynb?kernel_name=python3 6/7
9/23/22, 9:17 PM Untitled13 - Jupyter Notebook

In [10]:

from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Score on both splits, then report each to two decimals.
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of Logistic regression classifier on training set: 0.80

Accuracy of Logistic regression classifier on test set: 0.89

In [11]:

from sklearn.ensemble import RandomForestClassifier

# Ten entropy-criterion trees with a fixed seed.
randclas = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
randclas.fit(X_train, y_train)
# Both print calls were cut off mid-argument in this export; the arguments are
# restored to match the training/testing wording of each message.
print('Accuracy of Random Forest classifier on training set:{:.2f}'.format(randclas.score(X_train, y_train)))
print('Accuracy of Random forest classifier on testing set:{:.2f}'.format(randclas.score(X_test, y_test)))

Accuracy of Random Forest classifier on training set:0.98

Accuracy of Random forest classifier on testing set:0.92

In [13]:

from sklearn.svm import SVC

# Linear-kernel support vector classifier with a fixed seed.
svcclas = SVC(kernel = 'linear', random_state = 0)
svcclas.fit(X_train, y_train)
# Both print calls were cut off mid-argument in this export; the arguments are
# restored to match the training/testing wording of each message.
print('Accuracy of support vector machine on training set: {:.2f}'.format(svcclas.score(X_train, y_train)))
print('Accuracy of support vector machine on testing set: {:.2f}'.format(svcclas.score(X_test, y_test)))

Accuracy of support vector machine on training set: 0.81

Accuracy of support vector machine on testing set: 0.89

In [21]:

from sklearn.ensemble import GradientBoostingRegressor


# Fit a gradient-boosting regressor on the training split and print its scores.
# NOTE(review): the constructor call below is cut off after `min_samples_split = 5,l`
# in this export; the remaining arguments and the closing parenthesis are not visible.
# The FutureWarning captured with this cell's output says the deprecated loss 'ls'
# was used and that 'squared_error' is the equivalent replacement.
gbr = GradientBoostingRegressor(n_estimators = 1000, max_depth = 3, min_samples_split = 5,l
gbr.fit(X_train, y_train)
# NOTE(review): this is a regressor, so .score() is presumably R^2 rather than the
# classification accuracy the messages claim — confirm against the scikit-learn docs.
print(" Accuracy of gradient boosting on training set: %.3f" % gbr.score(X_train, y_train))
print(" Accuracy of gradient boosting on testing set: %.3f" % gbr.score(X_test, y_test))

Accuracy of gradient boosting on training set: 0.860

Accuracy of gradient boosting on testing set: 0.717

C:\Users\hi\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py:286: FutureW
arning: The loss 'ls' was deprecated in v1.0 and will be removed in version
1.2. Use 'squared_error' which is equivalent.

warnings.warn(

localhost:8888/notebooks/Untitled13.ipynb?kernel_name=python3 7/7

You might also like