Email Spam Classifier

email-spam-classifier

May 4, 2024

[1]: import os
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, model_selection, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_curve, auc, roc_auc_score,
                             ConfusionMatrixDisplay)

(Output: a series of Matplotlib 3.3 deprecation warnings from
_classic_test.mplstyle — the text.latex.preview, mathtext.fallback_to_cm,
savefig.jpeg_quality, keymap.all_axes, animation.avconv_path, and
animation.avconv_args rcParams were all deprecated and slated for removal
two minor releases later; warnings condensed here.)
[2]: # Parent class for data access
class data_read_write(object):
    # Note: the original defined __init__ twice; in Python the second
    # definition silently replaces the first, so the two are merged here.
    def __init__(self, file_link=None):
        if file_link is not None:
            self.data_frame = pd.read_csv(file_link)
    def read_csv_file(self, file_link=None):
        # Re-read from disk if a path is given; otherwise return the
        # frame loaded at construction time.
        if file_link is not None:
            self.data_frame = pd.read_csv(file_link)
        return self.data_frame
    def write_to_csvfile(self, file_link):
        self.data_frame.to_csv(file_link, encoding='utf-8', index=False,
                               header=True)
        return
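A quick smoke test of this class (a hypothetical snippet; it assumes emails.csv is in the working directory, as in cell [6] below, and emails_copy.csv is a made-up output path):

demo_obj = data_read_write("emails.csv")       # loads emails.csv into .data_frame
print(demo_obj.read_csv_file().shape)          # (rows, columns) of the loaded frame
demo_obj.write_to_csvfile("emails_copy.csv")   # writes UTF-8, no index column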

[3]: # Child class of data_read_write
class generate_word_cloud(data_read_write):
    def __init__(self):
        pass
    # Child's own function
    def variance_column(self, data):
        # The original called an undefined variance(); np.var is used here.
        return np.var(data)
    # Polymorphism
    def word_cloud(self, data_frame_column, output_image_file):
        text = " ".join(review for review in data_frame_column)
        stopwords = set(STOPWORDS)
        stopwords.update(["subject"])
        wordcloud = WordCloud(width=1200, height=800, stopwords=stopwords,
                              max_font_size=50, margin=0,
                              background_color="white").generate(text)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        wordcloud.to_file(output_image_file)
        return

[4]: # Child class of data_read_write
class data_cleaning(data_read_write):
    def __init__(self):
        pass
    def message_cleaning(self, message):
        # Strip punctuation character by character.
        Test_punc_removed = [char for char in message
                             if char not in string.punctuation]
        Test_punc_removed_join = ''.join(Test_punc_removed)
        # Drop English stop words (hoisted into a set for speed).
        stop_set = set(stopwords.words('english'))
        Test_punc_removed_join_clean = [word for word
                                        in Test_punc_removed_join.split()
                                        if word.lower() not in stop_set]
        final_join = ' '.join(Test_punc_removed_join_clean)
        return final_join

    def apply_to_column(self, data_column_text):
        data_processed = data_column_text.apply(self.message_cleaning)
        return data_processed
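To see what the cleaner produces, a minimal sketch (the sample message is made up):

cleaner = data_cleaning()
sample = "Subject: Win a FREE prize, click now!"
print(cleaner.message_cleaning(sample))
# Expected output (roughly): 'Subject Win FREE prize click'
# -- punctuation stripped, stop words ('a', 'now') dropped.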

[5]: # Child class of data_read_write
class apply_embedding_and_model(data_read_write):
    def __init__(self):
        pass
    def apply_count_vector(self, v_data_column):
        vectorizer = CountVectorizer(min_df=2, analyzer="word",
                                     tokenizer=None, preprocessor=None,
                                     stop_words=None)
        return vectorizer.fit_transform(v_data_column)

    def apply_naive_bayes(self, X, y):
        # Divide the data into training and testing prior to training.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        # Train the model.
        NB_classifier = MultinomialNB()
        NB_classifier.fit(X_train, y_train)
        # Predict on the test set.
        y_predict_test = NB_classifier.predict(X_test)
        cm = confusion_matrix(y_test, y_predict_test)
        # Evaluate the model.
        print(classification_report(y_test, y_predict_test))
        print("test set")
        print("\nAccuracy Score: " + str(metrics.accuracy_score(y_test, y_predict_test)))
        print("F1 Score: " + str(metrics.f1_score(y_test, y_predict_test)))
        print("Recall: " + str(metrics.recall_score(y_test, y_predict_test)))
        print("Precision: " + str(metrics.precision_score(y_test, y_predict_test)))

        class_names = ['ham', 'spam']
        titles_options = [("Confusion matrix, without normalization", None),
                          ("Normalized confusion matrix", 'true')]
        for title, normalize in titles_options:
            # plot_confusion_matrix was removed in scikit-learn 1.2;
            # ConfusionMatrixDisplay.from_estimator is the replacement.
            disp = ConfusionMatrixDisplay.from_estimator(
                NB_classifier, X_test, y_test,
                display_labels=class_names,
                cmap=plt.cm.Blues,
                normalize=normalize)
            disp.ax_.set_title(title)
            print(title)
            print(disp.confusion_matrix)
        plt.show()

        # Generate a no-skill prediction (majority class).
        ns_probs = [0 for _ in range(len(y_test))]
        # Predict probabilities; keep only the positive (spam) column.
        nb_probs = NB_classifier.predict_proba(X_test)[:, 1]
        # Calculate and summarize AUC scores.
        ns_auc = roc_auc_score(y_test, ns_probs)
        nb_auc = roc_auc_score(y_test, nb_probs)
        print('No Skill: ROC AUC=%.3f' % ns_auc)
        print('Naive Bayes: ROC AUC=%.3f' % nb_auc)
        # Calculate and plot the ROC curves.
        ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
        nb_fpr, nb_tpr, _ = roc_curve(y_test, nb_probs)
        pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
        pyplot.plot(nb_fpr, nb_tpr, marker='.', label='Naive Bayes')
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.legend()
        pyplot.show()
        return

    def apply_svm(self, X, y):
        # Divide the data into training and testing prior to training.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        # Train the model; the kernel can be 'linear', 'poly', or 'rbf'.
        params = {'kernel': 'linear', 'C': 2, 'gamma': 1}
        svm_cv = svm.SVC(C=params['C'], kernel=params['kernel'],
                         gamma=params['gamma'], probability=True)
        svm_cv.fit(X_train, y_train)
        # Predict on the test set.
        y_predict_test = svm_cv.predict(X_test)
        cm = confusion_matrix(y_test, y_predict_test)
        # Evaluate the model.
        print(classification_report(y_test, y_predict_test))
        print("test set")
        print("\nAccuracy Score: " + str(metrics.accuracy_score(y_test, y_predict_test)))
        print("F1 Score: " + str(metrics.f1_score(y_test, y_predict_test)))
        print("Recall: " + str(metrics.recall_score(y_test, y_predict_test)))
        print("Precision: " + str(metrics.precision_score(y_test, y_predict_test)))

        class_names = ['ham', 'spam']
        titles_options = [("Confusion matrix, without normalization", None),
                          ("Normalized confusion matrix", 'true')]
        for title, normalize in titles_options:
            disp = ConfusionMatrixDisplay.from_estimator(
                svm_cv, X_test, y_test,
                display_labels=class_names,
                cmap=plt.cm.Blues,
                normalize=normalize)
            disp.ax_.set_title(title)
            print(title)
            print(disp.confusion_matrix)
        plt.show()

        # Generate a no-skill prediction (majority class).
        ns_probs = [0 for _ in range(len(y_test))]
        svm_probs = svm_cv.predict_proba(X_test)[:, 1]
        ns_auc = roc_auc_score(y_test, ns_probs)
        svm_auc = roc_auc_score(y_test, svm_probs)
        print('No Skill: ROC AUC=%.3f' % ns_auc)
        print('SVM: ROC AUC=%.3f' % svm_auc)
        ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
        svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_probs)
        pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
        pyplot.plot(svm_fpr, svm_tpr, marker='.', label='SVM')
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.legend()
        pyplot.show()
        return

[6]: data_obj = data_read_write("emails.csv")

[7]: # With no argument, read_csv_file() returns the frame loaded in cell [6].
data_frame = data_obj.read_csv_file()
data_frame.head()
data_frame.tail()
data_frame.describe()
data_frame.info()  # only info() produces visible output in this cell

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
text 5728 non-null object
spam 5728 non-null int64
dtypes: int64(1), object(1)
memory usage: 89.6+ KB

[8]: data_frame.head()

[8]: text spam
0 Subject: naturally irresistible your corporate… 1
1 Subject: the stock trading gunslinger fanny i… 1
2 Subject: unbelievable new homes made easy im … 1
3 Subject: 4 color printing special request add… 1
4 Subject: do not have money , get software cds … 1

[9]: # Visualize the dataset
# Let's see which ham/spam messages are the most frequent
data_frame.groupby('spam').describe()

[9]: text
count unique top freq
spam
0 4360 4327 Subject: tiger evals - attachment tiger hosts… 2
1 1368 1368 Subject: localized software , all languages av… 1

[10]: # Let's get the length of each message
data_frame['length'] = data_frame['text'].apply(len)
data_frame['length'].max()

[10]: 43952

[11]: # Ham emails have more characters than spam emails
sns.set(rc={'figure.figsize': (11.7, 8.27)})
ham_messages_length = data_frame[data_frame['spam'] == 0]
spam_messages_length = data_frame[data_frame['spam'] == 1]

ham_messages_length['length'].plot(bins=100, kind='hist', label='Ham')
spam_messages_length['length'].plot(bins=100, kind='hist', label='Spam')

plt.title('Distribution of Length of Email Text')
plt.xlabel('Length of Email Text')
plt.legend()

[11]: <matplotlib.legend.Legend at 0x2e158719f88>

[12]: # Word counts per message, split by class
ham_words_length = [len(word_tokenize(title)) for title
                    in data_frame[data_frame['spam'] == 0].text.values]
spam_words_length = [len(word_tokenize(title)) for title
                     in data_frame[data_frame['spam'] == 1].text.values]
print(max(ham_words_length))
print(max(spam_words_length))

8479
6131
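As an aside, a vectorized pandas approximation of these counts (splitting on whitespace rather than NLTK tokenization, so the numbers differ somewhat) runs much faster on a frame this size:

# Hypothetical faster variant: whitespace-based word counts.
word_counts = data_frame['text'].str.split().str.len()
print(word_counts[data_frame['spam'] == 0].max())  # ham, approximate
print(word_counts[data_frame['spam'] == 1].max())  # spam, approximate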

[13]: # There is a spike of spam emails with low word counts:
# even though spam makes up only about 24% of the dataset,
# spam emails tend to use fewer words than ham emails.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# Note: sns.distplot is deprecated in recent seaborn releases;
# sns.histplot(..., stat='density', kde=True) is the modern equivalent.
ax = sns.distplot(ham_words_length, norm_hist=True, bins=30, label='Ham')
ax = sns.distplot(spam_words_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Number of Words')
plt.xlabel('Number of Words')
plt.legend()
plt.show()

[14]: def mean_word_length(x):
    # np.append copies the array on each call; fine at this scale.
    word_lengths = np.array([])
    for word in word_tokenize(x):
        word_lengths = np.append(word_lengths, len(word))
    return word_lengths.mean()

ham_meanword_length = data_frame[data_frame['spam'] == 0].text.apply(mean_word_length)
spam_meanword_length = data_frame[data_frame['spam'] == 1].text.apply(mean_word_length)

sns.distplot(ham_meanword_length, norm_hist=True, bins=30, label='Ham')
sns.distplot(spam_meanword_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Mean Word Length')
plt.xlabel('Mean Word Length')
plt.legend()
plt.show()

# There is no significant difference in mean word length between ham and spam emails.

[15]: # Checking the ratio of stop words
# Both spam and ham emails contain stop words:
# spam emails have a mean stop-word ratio of 0.281 and ham emails 0.278,
# and the plot shows spam skewing toward slightly higher ratios than ham.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def stop_words_ratio(x):
    num_total_words = 0
    num_stop_words = 0
    for word in word_tokenize(x):
        if word in stop_words:
            num_stop_words += 1
        num_total_words += 1
    return num_stop_words / num_total_words

ham_stopwords = data_frame[data_frame['spam'] == 0].text.apply(stop_words_ratio)
spam_stopwords = data_frame[data_frame['spam'] == 1].text.apply(stop_words_ratio)

sns.distplot(ham_stopwords, norm_hist=True, label='Ham')
sns.distplot(spam_stopwords, label='Spam')
print('Ham Mean: {:.3f}'.format(ham_stopwords.values.mean()))
print('Spam Mean: {:.3f}'.format(spam_stopwords.values.mean()))
plt.title('Distribution of Stop-word Ratio')
plt.xlabel('Stop Word Ratio')
plt.legend()

Ham Mean: 0.278
Spam Mean: 0.281

[15]: <matplotlib.legend.Legend at 0x2e15f4d2848>

[16]: spam_stopwords

[16]: 0       0.230769
1       0.277778
2       0.397727
3       0.191919
4       0.396226
          ...
1363    0.342105
1364    0.365854
1365    0.437500
1366    0.446809
1367    0.320024
Name: text, Length: 1368, dtype: float64

[33]: # Let's split the messages into spam and ham
ham = data_frame[data_frame['spam'] == 0]
spam = data_frame[data_frame['spam'] == 1]
spam['length'].plot(bins=60, kind='hist')
ham['length'].plot(bins=60, kind='hist')
data_frame['Ham(0) and Spam(1)'] = data_frame['spam']
print('Spam percentage =', (len(spam) / len(data_frame)) * 100, "%")
print('Ham percentage =', (len(ham) / len(data_frame)) * 100, "%")
sns.countplot(data_frame['Ham(0) and Spam(1)'], label="Count")
#text_spam = " ".join(review for review in spam["clean_text"])

Spam percentage = 23.88268156424581 %
Ham percentage = 76.11731843575419 %

[33]: <AxesSubplot:xlabel='Ham(0) and Spam(1)', ylabel='count'>

[18]: word_cloud_obj = generate_word_cloud()
word_cloud_obj.word_cloud(ham["text"], "ham_word_cloud.png")
word_cloud_obj.word_cloud(spam["text"], "spam_word_cloud.png")

[Output: word cloud images for the ham and spam emails.]
[19]: data_clean_obj = data_cleaning()
# Apply the cleaning function to the text column
data_frame['clean_text'] = data_clean_obj.apply_to_column(data_frame['text'])

[20]: data_frame.head()

[20]: text spam length \
0 Subject: naturally irresistible your corporate… 1 1484
1 Subject: the stock trading gunslinger fanny i… 1 598
2 Subject: unbelievable new homes made easy im … 1 448
3 Subject: 4 color printing special request add… 1 500
4 Subject: do not have money , get software cds … 1 235

clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…

[21]: data_obj.data_frame.head()

[21]: text spam length \
0 Subject: naturally irresistible your corporate… 1 1484
1 Subject: the stock trading gunslinger fanny i… 1 598
2 Subject: unbelievable new homes made easy im … 1 448
3 Subject: 4 color printing special request add… 1 500
4 Subject: do not have money , get software cds … 1 235

clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…

[22]: data_obj.write_to_csvfile("processed_file.csv")

[23]: # Apply the count vectorizer to the cleaned message list
cv_object = apply_embedding_and_model()
spamham_countvectorizer = cv_object.apply_count_vector(data_frame['clean_text'])
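One caveat worth flagging: apply_count_vector returns only the document-term matrix, so the fitted CountVectorizer (and its vocabulary) is discarded, and a new email could not be transformed consistently later. A minimal sketch of keeping the vectorizer around (hypothetical, not part of the class above; the sample email is made up):

vec = CountVectorizer(min_df=2, analyzer="word")
X_demo = vec.fit_transform(data_frame['clean_text'])
new_email = ["subject free offer click now"]   # made-up example
print(vec.transform(new_email).shape)          # (1, vocabulary size)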

[24]: # Separate the descriptive and target features
X = spamham_countvectorizer
y = data_frame['spam'].values
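Since spam is only about 24% of the data, a stratified split would keep the class balance identical in the training and test sets; the class methods above use a plain random split. A minimal sketch, assuming the X and y just defined:

# Hypothetical stratified split (the pipeline above does not stratify).
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                          stratify=y, random_state=42)
print(y_tr.mean(), y_te.mean())  # spam share stays near 0.24 in both subsets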

[25]: cv_object.apply_naive_bayes(X,y)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       901
           1       0.98      0.99      0.98       245

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146

test set

Accuracy Score: 0.9921465968586387
F1 Score: 0.9817444219066936
Recall: 0.9877551020408163
Precision: 0.9758064516129032
Confusion matrix, without normalization
[[895   6]
 [  3 242]]
Normalized confusion matrix
[[0.99334073 0.00665927]
 [0.0122449  0.9877551 ]]

[Output: confusion matrix plots, raw and normalized.]
No Skill: ROC AUC=0.500
Naive Bayes: ROC AUC=0.998

[Output: ROC curve plot for Naive Bayes vs. no-skill baseline.]
[26]: cv_object.apply_svm(X,y)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       901
           1       0.98      0.98      0.98       245

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.99      1146
weighted avg       0.99      0.99      0.99      1146

test set

Accuracy Score: 0.9904013961605584
F1 Score: 0.9775051124744377
Recall: 0.9755102040816327
Precision: 0.9795081967213115
Confusion matrix, without normalization
[[896   5]
 [  6 239]]
Normalized confusion matrix
[[0.99445061 0.00554939]
 [0.0244898  0.9755102 ]]

[Output: confusion matrix plots, raw and normalized.]
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.998

[Output: ROC curve plot for SVM vs. no-skill baseline.]