P2) Code Email Spam Detection
P2) Code Email Spam Detection
In [14]: import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
In [15]: df = pd.read_csv("./emails.csv")  # load the dataset from emails.csv, resolved relative to the current working directory
In [16]: df.head()  # preview the first rows of the loaded frame
Out[16]: Email No. the to ect and for of a you hou ... connevey jay valued lay infrastructure
0 Email 1 0 0 1 0 0 0 2 0 0 ... 0 0 0 0 0
1 Email 2 8 13 24 6 6 2 102 1 27 ... 0 0 0 0 0
2 Email 3 0 0 1 0 0 0 8 0 0 ... 0 0 0 0 0
3 Email 4 0 5 22 0 5 1 51 2 10 ... 0 0 0 0 0
4 Email 5 7 6 17 1 5 2 57 0 9 ... 0 0 0 0 0
In [17]: df.isnull().sum()  # number of missing values in each column
Out[17]: Email No. 0
the 0
to 0
ect 0
and 0
..
military 0
allowing 0
ff 0
dry 0
Prediction 0
Length: 3002, dtype: int64
In [18]: X = df.iloc[:,1:3001]
X
Out[18]: the to ect and for of a you hou in ... enhancements connevey jay valued lay
0 0 0 1 0 0 0 2 0 0 0 ... 0 0 0 0 0
1 8 13 24 6 6 2 102 1 27 18 ... 0 0 0 0 0
2 0 0 1 0 0 0 8 0 0 4 ... 0 0 0 0 0
3 0 5 22 0 5 1 51 2 10 1 ... 0 0 0 0 0
4 7 6 17 1 5 2 57 0 9 3 ... 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5167 2 2 2 3 0 0 32 0 0 5 ... 0 0 0 0 0
5169 0 0 1 1 0 0 11 0 0 1 ... 0 0 0 0 0
5170 2 7 1 0 2 1 28 2 0 8 ... 0 0 0 0 0
In [19]: Y = df.iloc[:,-1].values  # target: last column ("Prediction" per the column listing) as a NumPy array; presumably the spam label, confirm
Y
Out[24]: ▾ KNeighborsClassifier
KNeighborsClassifier(n_neighbors=7)
In [25]: print(knn.predict(X_test))  # print the labels knn predicts for X_test; neither name is defined in the cells visible here
# NOTE(review): the second output line below (0.8685...) is not printed by this statement;
# presumably an accuracy print is missing from this export. Confirm against the notebook.
[0 0 1 ... 0 1 0]
0.8685990338164251
In [ ]: