Scaling in One Range: 5172 Rows × 3002 Columns
Scaling in One Range: 5172 Rows × 3002 Columns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
In [2]:
# Load the e-mail dataset from the CSV file in the working directory.
df = pd.read_csv("emails.csv")
In [3]:
# Show the loaded frame as the cell output.
df
Out[3]:
Email
the to ect and for of a you hou
No.
Email
0 0 0 1 0 0 0 2 0 0
1
Email
1 8 13 24 6 6 2 102 1 27
2
Email
2 0 0 1 0 0 0 8 0 0
3
Email
3 0 5 22 0 5 1 51 2 10
4
Email
4 7 6 17 1 5 2 57 0 9
5
... ... ... ... ... ... ... ... ... ... ...
Email
5167 2 2 2 3 0 0 32 0 0
5168
Email
5168 35 27 11 2 6 5 151 4 3
5169
Email
5169 0 0 1 1 0 0 11 0 0
5170
Email
5170 2 7 1 0 2 1 28 2 0
5171
Email
5171 22 24 5 1 6 5 148 8 2
5172
In [5]:
# Count missing values per column.
df.isnull().sum()
Out[5]:
Email No. 0
the 0
to 0
ect 0
and 0
..
military 0
allowing 0
ff 0
dry 0
Prediction 0
Length: 3002, dtype: int64
In [6]:
# (rows, columns) of the raw frame.
df.shape
Out[6]:
(5172, 3002)
In [7]:
# Features: every column except the row identifier and the label.
x = df.drop(columns=['Email No.', 'Prediction'])
# Target: the 'Prediction' column.
y = df['Prediction']
x.shape
Out[7]:
(5172, 3000)
In [8]:
# Shape of the target series (one label per row).
y.shape
Out[8]:
(5172,)
In [12]:
# Min-max scale every feature column (MinMaxScaler's default output range
# is [0, 1]).
# NOTE(review): the scaler is fitted on the full dataset; if the train/test
# split is taken from x_scale, test rows influence the scaling parameters.
# Consider fitting on the training rows only.
scaler = MinMaxScaler()
x_scale = scaler.fit_transform(x)
x_scale.shape
Out[12]:
(5172, 3000)
In [14]:
# Hold out a test set for evaluating the classifiers in the later cells.
# NOTE(review): the call's arguments were cut off in this copy of the
# notebook. test_size=0.25 matches the 1293-row test support shown in the
# classification report (5172 * 0.25 = 1293); splitting x_scale (rather than
# x) and the random_state value are assumptions -- confirm against the
# original notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x_scale, y, test_size=0.25, random_state=42
)
In [16]:
# Distinct dtypes across the feature columns.
set(x.dtypes)
Out[16]:
{dtype('int64')}
In [18]:
import seaborn as sns
# Bar chart of how many rows carry each value of the target y.
sns.countplot(x=y)
Out[18]:
<Axes: xlabel='Prediction', ylabel='count'>
In [21]:
# Candidate neighbour counts for the KNN sweep: the odd values 1, 3, ..., 29.
# NOTE(review): the list literal was cut off after 19 in this copy of the
# notebook. The remaining values are inferred from the 15 recorded accuracy
# values and the plot's x-axis staying below 30 -- confirm.
k_values = list(range(1, 30, 2))
# Test accuracy recorded for each entry of k_values, in the same order.
accuracy_values = []
In [24]:
from tqdm.notebook import tqdm
from sklearn import metrics

# Fit one KNN classifier per candidate k and record its accuracy on the
# held-out test set.
# The result list is rebuilt here so that re-running this cell cannot append
# a second set of values to a previous run's results (which would make it
# longer than k_values).
# NOTE(review): the n_neighbors argument and the accuracy_score call were cut
# off in this copy of the notebook; k_values[i] and y_pred are the evident
# completions.
accuracy_values = []
for k in tqdm(k_values):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    accuracy_values.append(accuracy)
In [25]:
# Show the accuracies collected by the KNN sweep, in k_values order.
accuracy_values
Out[25]:
[0.9033255993812839,
0.8631090487238979,
0.8460943542150039,
0.8368136117556071,
0.8213457076566125,
0.8012374323279196,
0.7880897138437741,
0.7710750193348801,
0.7610208816705336,
0.7470997679814385,
0.7393658159319412,
0.7293116782675947,
0.7138437741686001,
0.7068832173240526,
0.6960556844547564]
In [27]:
import plotly.express as px

# Accuracy as a function of k. Axis labels and a title are set explicitly so
# the figure can be read on its own (bare arrays otherwise get the default
# axis names "x" and "y").
px.line(
    x=k_values,
    y=accuracy_values,
    labels={"x": "k (number of neighbours)", "y": "Test accuracy"},
    title="KNN test accuracy by k",
)
0.9
0.85
0.8
y
0.75
0.7
10 20
In [28]:
# Select the k with the highest recorded accuracy. max() returns the first
# maximal pair, so ties resolve to the earliest k, exactly like a strict
# ">" comparison in a loop. With no results both values fall back to -1.
optimal_k, optimal_accuracy = max(
    zip(k_values, accuracy_values),
    key=lambda pair: pair[1],
    default=(-1, -1),
)
In [29]:
# Final KNN classifier configured with the best k from the sweep.
# NOTE(review): the argument was cut off after "n_neighbors=o" in this copy
# of the notebook; optimal_k is the evident completion (the fitted model's
# repr shows n_neighbors=1, the k with the highest recorded accuracy).
knn_model = KNeighborsClassifier(n_neighbors=optimal_k)
In [30]:
# Train the final KNN model on the training split.
knn_model.fit(x_train, y_train)
Out[30]:
▾ KNeighborsClassifier
KNeighborsClassifier(n_neighbors=1)
In [31]:
# Predict labels for the held-out test rows with the final KNN model.
y_pred = knn_model.predict(x_test)
In [32]:
# Per-class precision/recall/F1 for the KNN predictions.
# NOTE(review): the second argument was cut off after "y_" in this copy of
# the notebook; y_pred (the KNN predictions) is the evident completion.
print(metrics.classification_report(y_test, y_pred))
accuracy 0.90
1293
macro avg 0.88 0.91 0.89
1293
weighted avg 0.91 0.90 0.91
1293
In [47]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the KNN predictions on the test split.
# NOTE(review): both lines were cut off in this copy of the notebook; the
# class name is confirmed by the recorded output repr, and y_pred (the KNN
# predictions) is the assumed second argument.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
Out[47]:
<sklearn.metrics._plot.confusion_matrix.Confu
sionMatrixDisplay at 0x265260408d0>
SVM
In [34]:
from sklearn.svm import SVC
# Support vector classifier with a sigmoid kernel.
svm_model = SVC(kernel='sigmoid')
In [35]:
# Train the current SVM on the training split.
svm_model.fit(x_train,y_train)
Out[35]:
▾ SVC
SVC(kernel='sigmoid')
In [40]:
from sklearn.metrics import accuracy_score

# Score the current SVM on the held-out test split.
# NOTE(review): the print call was cut off after "y_pred" in this copy of the
# notebook; y_predict is restored because the recorded accuracy differs per
# kernel, so it cannot be the KNN y_pred.
y_predict = svm_model.predict(x_test)
print("accuracy", accuracy_score(y_test, y_predict))
accuracy 0.8561484918793504
In [41]:
# Replace the previous model with a linear-kernel SVM.
svm_model = SVC(kernel='linear')
In [42]:
# Train the linear-kernel SVM on the training split.
svm_model.fit(x_train,y_train)
Out[42]:
▾ SVC
SVC(kernel='linear')
In [43]:
# Score the linear-kernel SVM on the held-out test split.
# NOTE(review): the print call was cut off after "y_pred" in this copy of the
# notebook; y_predict is restored because the recorded accuracy differs per
# kernel, so it cannot be the KNN y_pred.
y_predict = svm_model.predict(x_test)
print("accuracy", accuracy_score(y_test, y_predict))
accuracy 0.9659706109822119
In [44]:
# Train and score an RBF-kernel SVM on the same split.
# NOTE(review): the print call was cut off after "y_pred" in this copy of the
# notebook; y_predict is restored because the recorded accuracy differs per
# kernel, so it cannot be the KNN y_pred.
svm_model = SVC(kernel='rbf')
svm_model.fit(x_train, y_train)
y_predict = svm_model.predict(x_test)
print("accuracy", accuracy_score(y_test, y_predict))
accuracy 0.9505027068832174
In [45]:
# Train and score a polynomial-kernel SVM on the same split.
# NOTE(review): the print call was cut off after "y_pred" in this copy of the
# notebook; y_predict is restored because the recorded accuracy differs per
# kernel, so it cannot be the KNN y_pred.
svm_model = SVC(kernel='poly')
svm_model.fit(x_train, y_train)
y_predict = svm_model.predict(x_test)
print("accuracy", accuracy_score(y_test, y_predict))
accuracy 0.7548337200309359
In [ ]: