Big Data Merged
In [ ]:
# exp-1: data preprocessing
import pandas as pd
import numpy as np
df=pd.read_csv('housing_price.csv')
In [ ]:
print(df.shape,df.size,df.ndim)
In [ ]:
df.columns
In [ ]:
df.head(10)
In [ ]:
df.isnull().sum()
In [ ]:
df=df.dropna()
In [ ]:
df.isnull().sum()
In [ ]:
df.dtypes.value_counts()
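A housing-price table usually mixes numeric and object (string) columns; if the dtype summary above shows object dtypes, a common next step, not shown in the original cells, is to one-hot encode them so every feature is numeric. A minimal sketch:

In [ ]:
# One-hot encode any object (string) columns; this is a no-op if there are none
obj_cols = df.select_dtypes(include='object').columns
df = pd.get_dummies(df, columns=list(obj_cols), drop_first=True)
df.dtypes.value_counts()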
In [ ]:
print(df.shape,df.size,df.ndim)
In [ ]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]
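The printout of exp-1 stops at the feature/target split; a typical follow-up is a train/test split plus feature scaling. A sketch of that step, assuming scikit-learn is available and all features are numeric by this point:

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows, then scale features using statistics from the training split only
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)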
In [ ]:
# exp-2: Twitter dataset
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
df=pd.read_csv("tweet.csv")
df=df.drop(labels=["id","label"],axis=1)
df.head()
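The next cell calls remove_pattern, but its definition does not appear anywhere in this printout. A minimal sketch of what it presumably does, deleting every match of the given regex (here, @-handles) from a tweet:

In [ ]:
def remove_pattern(input_txt, pattern):
    # Hypothetical reconstruction: strip every substring matching the pattern
    return re.sub(pattern, '', input_txt)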
In [ ]:
df['new'] = np.vectorize(remove_pattern)(df['tweet'], r"@[\w]*")
df.head()
In [ ]:
import nltk
nltk.download('punkt')
tf=pd.DataFrame()
from nltk.tokenize import word_tokenize
tf['tokens']=df['new'].apply(lambda x: word_tokenize(x.lower()))
tf.head()
In [ ]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
tf['tokens'] = tf['tokens'].apply(lambda x: [lemmatizer.lemmatize(i) for i in x])
tf.head()
In [ ]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tf['tokens']=tf['tokens'].apply(lambda x: [ i for i in x if(i not in stop_words)])
tf.head()
In [ ]:
# Mark cells equal to the empty string as NaN so they can be dropped below
tf = tf.replace('', np.nan)
tf.dropna(axis=0,inplace=True)
tf.head()
In [ ]:
# 'tokens' is used below but never built in the cells shown above; flatten all
# per-tweet token lists into a single list of words
tokens = [w for row in tf['tokens'] for w in row]
print(tokens)
In [ ]:
# Collect the "most popular words": every token that occurs more than 500 times
mpw = []
for i in set(tokens):
    if tokens.count(i) > 500:
        mpw.append(i)
        print(i, tokens.count(i))
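matplotlib is imported for exp-2 but never used above; a natural follow-up, not part of the original printout, is a quick bar chart of the frequent words collected in mpw:

In [ ]:
# Plot how often each of the frequent words occurs
counts = [tokens.count(w) for w in mpw]
plt.figure(figsize=(10, 4))
plt.bar(mpw, counts)
plt.xlabel('word')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()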
In [ ]:
# exp-3: ML algorithms
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
In [ ]:
social = pd.read_csv("Social_Network_Ads.csv")
X=social.iloc[:,[2,3]].values
y=social.iloc[:,-1].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)
sc=StandardScaler()
sc.fit(X_train)
X_train=sc.transform(X_train)
X_test=sc.transform(X_test)
svc=SVC(kernel='linear',C=10.0,random_state=1)
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
conf_matrix=confusion_matrix(y_true=y_test,y_pred=y_pred)
fig,ax=plt.subplots(figsize=(5,5))
ax.matshow(conf_matrix,cmap=plt.cm.Oranges,alpha=0.3)
In [ ]:
# Write each count into its cell of the confusion-matrix plot
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
In [ ]:
print('Precision: %.3f' % precision_score(y_test, y_pred))
print('Recall: %.3f' % recall_score(y_test, y_pred))
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('F1 score: %.3f' % f1_score(y_test, y_pred))
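As a cross-check, scikit-learn's classification_report prints precision, recall and F1 for both classes in a single call:

In [ ]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))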
In [ ]:
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
In [ ]:
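The cells that split Salary_Data and create regressor are not visible in this printout; a minimal sketch of those missing steps, assuming a plain simple linear regression, so that the prediction cell below runs:

In [ ]:
from sklearn.linear_model import LinearRegression

# Hypothetical reconstruction of the missing cells: hold out a test set and fit
# years of experience against salary
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)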
In [ ]:
y_pred = regressor.predict(X_test)
In [ ]:
In [7]:
In [8]:
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
In [9]:
In [10]:
In [11]:
In [13]:
In [21]:
C:\Users\hi\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py:286: FutureWarning: The loss 'ls' was deprecated in v1.0 and will be removed in version 1.2. Use 'squared_error' which is equivalent.
  warnings.warn(
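The code for cells In [9] through In [21] was lost in this printout; only the FutureWarning above survives, and it points at a gradient-boosting model fitted with loss='ls'. A hypothetical sketch of such a cell, reusing the Age and EstimatedSalary columns as in the SVC example and the equivalent 'squared_error' loss:

In [ ]:
from sklearn.ensemble import GradientBoostingRegressor

# Assumed reconstruction: the original run evidently used loss='ls' (hence the
# FutureWarning); 'squared_error' is the equivalent name in scikit-learn >= 1.0
X_gb = dataset.iloc[:, [2, 3]].values
y_gb = dataset.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X_gb, y_gb, test_size=0.20)
gbr = GradientBoostingRegressor(loss='squared_error')
gbr.fit(X_train, y_train)
print(gbr.score(X_test, y_test))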