Email Spam Classifier
Email Spam Classifier
Email Spam Classifier
May 4, 2024
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle: Support for setting the
'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed
two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will
be removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed
two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
return
2
wordcloud = WordCloud(width = 1200, height = 800, stopwords=stopwords,␣
↪max_font_size = 50, margin=0, background_color = "white").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
wordcloud.to_file(output_image_file)
return
Test_punc_removed_join = ''.join(Test_punc_removed)
Test_punc_removed_join_clean = [word for word in␣
↪Test_punc_removed_join.split() if word.lower() not in stopwords.
↪words('english')]
return vectorizer.fit_transform(v_data_column)
3
print(classification_report(y_test, y_predict_test))
print("test set")
4
pyplot.show()
return
def apply_svm(self, X, y):
# Fit an SVM classifier on a train/test split of (X, y) and print a
# classification report for the held-out test portion.
# NOTE(review): this export shows only part of the method; the code that
# follows the report is not fully visible here.
#DIVIDE THE DATA INTO TRAINING AND TESTING PRIOR TO TRAINING
# 20% of the rows are held out for testing. No random_state is passed, so
# presumably the split (and the printed scores) vary between runs - confirm
# if reproducibility matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#Training model
#'linear', 'poly', 'rbf'
# NOTE(review): with kernel='linear', gamma is presumably ignored by SVC
# (it is documented as a coefficient for 'rbf'/'poly'/'sigmoid') - confirm.
params = {'kernel': 'linear', 'C': 2, 'gamma': 1}
# probability=True requests probability estimates from the fitted model;
# presumably they feed the ROC/AUC step later in the method - TODO confirm.
svm_cv = svm.SVC(C=params['C'], kernel=params['kernel'],␣
↪gamma=params['gamma'], probability=True)
svm_cv.fit(X_train, y_train)
# Predicting the Test set results
y_predict_test = svm_cv.predict(X_test)
# cm is not used in the lines visible here (the heatmap call is commented out).
cm = confusion_matrix(y_test, y_predict_test)
#sns.heatmap(cm, annot=True)
#Evaluating Model
print(classification_report(y_test, y_predict_test))
# The "test set" label is printed after the report, not before it.
print("test set")
5
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('SVM: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='SVM')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
return
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
text 5728 non-null object
spam 5728 non-null int64
dtypes: int64(1), object(1)
memory usage: 89.6+ KB
[8]: data_frame.head()
6
data_frame.groupby('spam').describe()
[9]: text
count unique top freq
spam
0 4360 4327 Subject: tiger evals - attachment tiger hosts… 2
1 1368 1368 Subject: localized software , all languages av… 1
[10]: 43952
#plt.legend()
#plt.title('Distribution of Number of Words')
#plt.xlabel('Number of Words')
#plt.show()
7
[12]: #data_frame['spam']==0
data_frame[data_frame['spam']==0].text.values
print(max(ham_words_length))
print(max(spam_words_length))
8479
6131
8
plt.title('Distribution of Number of Words')
plt.xlabel('Number of Words')
plt.legend()
plt.show()
ham_meanword_length = data_frame[data_frame['spam']==0].text.
↪apply(mean_word_length)
spam_meanword_length = data_frame[data_frame['spam']==1].text.
↪apply(mean_word_length)
9
sns.distplot(spam_meanword_length , norm_hist = True, bins = 30, label = 'Spam')
plt.title('Distribution of Mean Word Length')
plt.xlabel('Mean Word Length')
plt.legend()
plt.show()
# There is no significant difference in the length of words used by ham and spam emails.
def stop_words_ratio(x):
    """Return the fraction of tokens in ``x`` that are stop words.

    ``x`` is split with ``word_tokenize`` and every token is looked up in the
    ``stop_words`` collection, both of which are defined outside this
    function. The lookup compares tokens exactly as tokenized (no case
    folding is applied here).

    Returns:
        ``num_stop_words / num_total_words`` as a float, or ``0.0`` when
        ``x`` produces no tokens (e.g. an empty or whitespace-only string),
        which would otherwise raise ``ZeroDivisionError``.
    """
    tokens = list(word_tokenize(x))
    if not tokens:
        # No tokens means no stop words; avoid dividing by zero.
        return 0.0
    num_stop_words = sum(1 for word in tokens if word in stop_words)
    return num_stop_words / len(tokens)
ham_stopwords = data_frame[data_frame['spam']==0].text.apply(stop_words_ratio)
spam_stopwords = data_frame[data_frame['spam']==1].text.apply(stop_words_ratio)
11
[16]: spam_stopwords
[16]: 0 0.230769
1 0.277778
2 0.397727
3 0.191919
4 0.396226
…
1363 0.342105
1364 0.365854
1365 0.437500
1366 0.446809
1367 0.320024
Name: text, Length: 1368, dtype: float64
12
print( 'Spam percentage =', (len(spam) / len(data_frame) )*100,"%")
print( 'Ham percentage =', (len(ham) / len(data_frame) )*100,"%")
sns.countplot(data_frame['Ham(0) and Spam(1)'], label = "Count")
#word_cloud_obj = generate_word_cloud()
#word_cloud_obj.word_cloud(ham["clean_text"], "ham_word_cloud.png")
#word_cloud_obj.word_cloud(spam["clean_text"], "spam_word_cloud.png")
#text_spam = " ".join(review for review in spam["clean_text"])
13
14
[19]: data_clean_obj = data_cleaning()
# Let's test the newly added function
#data_frame['clean_text'] = data_frame['text'].apply(message_cleaning)
#data_frame['clean_text'] = data_frame['text'].apply(data_clean_obj.
↪message_cleaning)
data_frame['clean_text'] = data_clean_obj.apply_to_column(data_frame['text'])
[20]: data_frame.head()
clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…
[21]: data_obj.data_frame.head()
clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…
[22]: data_obj.write_to_csvfile("processed_file.csv")
15
[24]: #Separating Descriptive and Target Feature
X = spamham_countvectorizer
label = data_frame['spam'].values
y = label
[25]: cv_object.apply_naive_bayes(X,y)
test set
16
17
No Skill: ROC AUC=0.500
Naive Bayes: ROC AUC=0.998
18
[26]: cv_object.apply_svm(X,y)
test set
19
[0.0244898 0.9755102 ]]
20
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.998
21
[ ]:
[ ]:
22