Email Spam Classifier

email-spam-classifier

May 4, 2024

[1]: import os
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, model_selection, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_curve, auc, roc_auc_score,
                             ConfusionMatrixDisplay)

(Output: a series of Matplotlib 3.3 deprecation warnings from
_classic_test.mplstyle — the text.latex.preview, mathtext.fallback_to_cm,
savefig.jpeg_quality, keymap.all_axes, animation.avconv_path, and
animation.avconv_args rcParams were all deprecated and slated for removal
two minor releases later; warnings condensed here.)
[2]: # Parent class for data access
class data_read_write(object):
    # Note: the original defined __init__ twice; in Python the second
    # definition silently replaces the first, so the two are merged here.
    def __init__(self, file_link=None):
        if file_link is not None:
            self.data_frame = pd.read_csv(file_link)
    def read_csv_file(self, file_link=None):
        # Re-read from disk if a path is given; otherwise return the
        # frame loaded at construction time.
        if file_link is not None:
            self.data_frame = pd.read_csv(file_link)
        return self.data_frame
    def write_to_csvfile(self, file_link):
        self.data_frame.to_csv(file_link, encoding='utf-8', index=False,
                               header=True)
        return
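A quick smoke test of this class (a hypothetical snippet; it assumes emails.csv is in the working directory, as in cell [6] below, and emails_copy.csv is a made-up output path):

demo_obj = data_read_write("emails.csv")       # loads emails.csv into .data_frame
print(demo_obj.read_csv_file().shape)          # (rows, columns) of the loaded frame
demo_obj.write_to_csvfile("emails_copy.csv")   # writes UTF-8, no index column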

[3]: # Child class of data_read_write
class generate_word_cloud(data_read_write):
    def __init__(self):
        pass
    # Child's own function
    def variance_column(self, data):
        # The original called an undefined variance(); np.var is used here.
        return np.var(data)
    # Polymorphism
    def word_cloud(self, data_frame_column, output_image_file):
        text = " ".join(review for review in data_frame_column)
        stopwords = set(STOPWORDS)
        stopwords.update(["subject"])
        wordcloud = WordCloud(width=1200, height=800, stopwords=stopwords,
                              max_font_size=50, margin=0,
                              background_color="white").generate(text)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        wordcloud.to_file(output_image_file)
        return

[4]: # Child class of data_read_write
class data_cleaning(data_read_write):
    def __init__(self):
        pass
    def message_cleaning(self, message):
        # Strip punctuation character by character.
        Test_punc_removed = [char for char in message
                             if char not in string.punctuation]
        Test_punc_removed_join = ''.join(Test_punc_removed)
        # Drop English stop words (hoisted into a set for speed).
        stop_set = set(stopwords.words('english'))
        Test_punc_removed_join_clean = [word for word
                                        in Test_punc_removed_join.split()
                                        if word.lower() not in stop_set]
        final_join = ' '.join(Test_punc_removed_join_clean)
        return final_join

    def apply_to_column(self, data_column_text):
        data_processed = data_column_text.apply(self.message_cleaning)
        return data_processed
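To see what the cleaner produces, a minimal sketch (the sample message is made up):

cleaner = data_cleaning()
sample = "Subject: Win a FREE prize, click now!"
print(cleaner.message_cleaning(sample))
# Expected output (roughly): 'Subject Win FREE prize click'
# -- punctuation stripped, stop words ('a', 'now') dropped.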

[5]: # Child class of data_read_write
class apply_embedding_and_model(data_read_write):
    def __init__(self):
        pass
    def apply_count_vector(self, v_data_column):
        vectorizer = CountVectorizer(min_df=2, analyzer="word",
                                     tokenizer=None, preprocessor=None,
                                     stop_words=None)
        return vectorizer.fit_transform(v_data_column)

    def apply_naive_bayes(self, X, y):
        # Divide the data into training and testing prior to training.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        # Train the model.
        NB_classifier = MultinomialNB()
        NB_classifier.fit(X_train, y_train)
        # Predict on the test set.
        y_predict_test = NB_classifier.predict(X_test)
        cm = confusion_matrix(y_test, y_predict_test)
        # Evaluate the model.
        print(classification_report(y_test, y_predict_test))
        print("test set")
        print("\nAccuracy Score: " + str(metrics.accuracy_score(y_test, y_predict_test)))
        print("F1 Score: " + str(metrics.f1_score(y_test, y_predict_test)))
        print("Recall: " + str(metrics.recall_score(y_test, y_predict_test)))
        print("Precision: " + str(metrics.precision_score(y_test, y_predict_test)))

        class_names = ['ham', 'spam']
        titles_options = [("Confusion matrix, without normalization", None),
                          ("Normalized confusion matrix", 'true')]
        for title, normalize in titles_options:
            # plot_confusion_matrix was removed in scikit-learn 1.2;
            # ConfusionMatrixDisplay.from_estimator is the replacement.
            disp = ConfusionMatrixDisplay.from_estimator(
                NB_classifier, X_test, y_test,
                display_labels=class_names,
                cmap=plt.cm.Blues,
                normalize=normalize)
            disp.ax_.set_title(title)
            print(title)
            print(disp.confusion_matrix)
        plt.show()

        # Generate a no-skill prediction (majority class).
        ns_probs = [0 for _ in range(len(y_test))]
        # Predict probabilities; keep only the positive (spam) column.
        nb_probs = NB_classifier.predict_proba(X_test)[:, 1]
        # Calculate and summarize AUC scores.
        ns_auc = roc_auc_score(y_test, ns_probs)
        nb_auc = roc_auc_score(y_test, nb_probs)
        print('No Skill: ROC AUC=%.3f' % ns_auc)
        print('Naive Bayes: ROC AUC=%.3f' % nb_auc)
        # Calculate and plot the ROC curves.
        ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
        nb_fpr, nb_tpr, _ = roc_curve(y_test, nb_probs)
        pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
        pyplot.plot(nb_fpr, nb_tpr, marker='.', label='Naive Bayes')
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.legend()
        pyplot.show()
        return

    def apply_svm(self, X, y):
        # Divide the data into training and testing prior to training.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        # Train the model; the kernel can be 'linear', 'poly', or 'rbf'.
        params = {'kernel': 'linear', 'C': 2, 'gamma': 1}
        svm_cv = svm.SVC(C=params['C'], kernel=params['kernel'],
                         gamma=params['gamma'], probability=True)
        svm_cv.fit(X_train, y_train)
        # Predict on the test set.
        y_predict_test = svm_cv.predict(X_test)
        cm = confusion_matrix(y_test, y_predict_test)
        # Evaluate the model.
        print(classification_report(y_test, y_predict_test))
        print("test set")
        print("\nAccuracy Score: " + str(metrics.accuracy_score(y_test, y_predict_test)))
        print("F1 Score: " + str(metrics.f1_score(y_test, y_predict_test)))
        print("Recall: " + str(metrics.recall_score(y_test, y_predict_test)))
        print("Precision: " + str(metrics.precision_score(y_test, y_predict_test)))

        class_names = ['ham', 'spam']
        titles_options = [("Confusion matrix, without normalization", None),
                          ("Normalized confusion matrix", 'true')]
        for title, normalize in titles_options:
            disp = ConfusionMatrixDisplay.from_estimator(
                svm_cv, X_test, y_test,
                display_labels=class_names,
                cmap=plt.cm.Blues,
                normalize=normalize)
            disp.ax_.set_title(title)
            print(title)
            print(disp.confusion_matrix)
        plt.show()

        # Generate a no-skill prediction (majority class).
        ns_probs = [0 for _ in range(len(y_test))]
        svm_probs = svm_cv.predict_proba(X_test)[:, 1]
        ns_auc = roc_auc_score(y_test, ns_probs)
        svm_auc = roc_auc_score(y_test, svm_probs)
        print('No Skill: ROC AUC=%.3f' % ns_auc)
        print('SVM: ROC AUC=%.3f' % svm_auc)
        ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
        svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_probs)
        pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
        pyplot.plot(svm_fpr, svm_tpr, marker='.', label='SVM')
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.legend()
        pyplot.show()
        return

[6]: data_obj = data_read_write("emails.csv")

[7]: # With no argument, read_csv_file() returns the frame loaded in cell [6].
data_frame = data_obj.read_csv_file()
data_frame.head()
data_frame.tail()
data_frame.describe()
data_frame.info()  # only info() produces visible output in this cell

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
text 5728 non-null object
spam 5728 non-null int64
dtypes: int64(1), object(1)
memory usage: 89.6+ KB

[8]: data_frame.head()

[8]: text spam
0 Subject: naturally irresistible your corporate… 1
1 Subject: the stock trading gunslinger fanny i… 1
2 Subject: unbelievable new homes made easy im … 1
3 Subject: 4 color printing special request add… 1
4 Subject: do not have money , get software cds … 1

[9]: # Visualize the dataset
# Let's see which ham/spam messages are the most frequent
data_frame.groupby('spam').describe()

[9]: text
count unique top freq
spam
0 4360 4327 Subject: tiger evals - attachment tiger hosts… 2
1 1368 1368 Subject: localized software , all languages av… 1

[10]: # Let's get the length of each message
data_frame['length'] = data_frame['text'].apply(len)
data_frame['length'].max()

[10]: 43952

[11]: # Ham emails have more characters than spam emails
sns.set(rc={'figure.figsize': (11.7, 8.27)})
ham_messages_length = data_frame[data_frame['spam'] == 0]
spam_messages_length = data_frame[data_frame['spam'] == 1]

ham_messages_length['length'].plot(bins=100, kind='hist', label='Ham')
spam_messages_length['length'].plot(bins=100, kind='hist', label='Spam')

plt.title('Distribution of Length of Email Text')
plt.xlabel('Length of Email Text')
plt.legend()

[11]: <matplotlib.legend.Legend at 0x2e158719f88>

[12]: # Word counts per message, split by class
ham_words_length = [len(word_tokenize(title)) for title
                    in data_frame[data_frame['spam'] == 0].text.values]
spam_words_length = [len(word_tokenize(title)) for title
                     in data_frame[data_frame['spam'] == 1].text.values]
print(max(ham_words_length))
print(max(spam_words_length))

8479
6131
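As an aside, a vectorized pandas approximation of these counts (splitting on whitespace rather than NLTK tokenization, so the numbers differ somewhat) runs much faster on a frame this size:

# Hypothetical faster variant: whitespace-based word counts.
word_counts = data_frame['text'].str.split().str.len()
print(word_counts[data_frame['spam'] == 0].max())  # ham, approximate
print(word_counts[data_frame['spam'] == 1].max())  # spam, approximate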

[13]: # There is a spike of spam emails with low word counts:
# even though spam makes up only about 24% of the dataset,
# spam emails tend to use fewer words than ham emails.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# Note: sns.distplot is deprecated in recent seaborn releases;
# sns.histplot(..., stat='density', kde=True) is the modern equivalent.
ax = sns.distplot(ham_words_length, norm_hist=True, bins=30, label='Ham')
ax = sns.distplot(spam_words_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Number of Words')
plt.xlabel('Number of Words')
plt.legend()
plt.show()

[14]: def mean_word_length(x):
    # np.append copies the array on each call; fine at this scale.
    word_lengths = np.array([])
    for word in word_tokenize(x):
        word_lengths = np.append(word_lengths, len(word))
    return word_lengths.mean()

ham_meanword_length = data_frame[data_frame['spam'] == 0].text.apply(mean_word_length)
spam_meanword_length = data_frame[data_frame['spam'] == 1].text.apply(mean_word_length)

sns.distplot(ham_meanword_length, norm_hist=True, bins=30, label='Ham')
sns.distplot(spam_meanword_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Mean Word Length')
plt.xlabel('Mean Word Length')
plt.legend()
plt.show()

# There is no significant difference in mean word length between ham and spam emails.

[15]: # Checking the ratio of stop words
# Both spam and ham emails contain stop words:
# spam emails have a mean stop-word ratio of 0.281 and ham emails 0.278,
# and the plot shows spam skewing toward slightly higher ratios than ham.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def stop_words_ratio(x):
    num_total_words = 0
    num_stop_words = 0
    for word in word_tokenize(x):
        if word in stop_words:
            num_stop_words += 1
        num_total_words += 1
    return num_stop_words / num_total_words

ham_stopwords = data_frame[data_frame['spam'] == 0].text.apply(stop_words_ratio)
spam_stopwords = data_frame[data_frame['spam'] == 1].text.apply(stop_words_ratio)

sns.distplot(ham_stopwords, norm_hist=True, label='Ham')
sns.distplot(spam_stopwords, label='Spam')
print('Ham Mean: {:.3f}'.format(ham_stopwords.values.mean()))
print('Spam Mean: {:.3f}'.format(spam_stopwords.values.mean()))
plt.title('Distribution of Stop-word Ratio')
plt.xlabel('Stop Word Ratio')
plt.legend()

Ham Mean: 0.278
Spam Mean: 0.281

[15]: <matplotlib.legend.Legend at 0x2e15f4d2848>

[16]: spam_stopwords

[16]: 0       0.230769
1       0.277778
2       0.397727
3       0.191919
4       0.396226
          ...
1363    0.342105
1364    0.365854
1365    0.437500
1366    0.446809
1367    0.320024
Name: text, Length: 1368, dtype: float64

[33]: # Let's split the messages into spam and ham
ham = data_frame[data_frame['spam'] == 0]
spam = data_frame[data_frame['spam'] == 1]
spam['length'].plot(bins=60, kind='hist')
ham['length'].plot(bins=60, kind='hist')
data_frame['Ham(0) and Spam(1)'] = data_frame['spam']
print('Spam percentage =', (len(spam) / len(data_frame)) * 100, "%")
print('Ham percentage =', (len(ham) / len(data_frame)) * 100, "%")
sns.countplot(data_frame['Ham(0) and Spam(1)'], label="Count")
#text_spam = " ".join(review for review in spam["clean_text"])

Spam percentage = 23.88268156424581 %
Ham percentage = 76.11731843575419 %

[33]: <AxesSubplot:xlabel='Ham(0) and Spam(1)', ylabel='count'>

[18]: word_cloud_obj = generate_word_cloud()
word_cloud_obj.word_cloud(ham["text"], "ham_word_cloud.png")
word_cloud_obj.word_cloud(spam["text"], "spam_word_cloud.png")

[Output: word cloud images for the ham and spam emails.]
[19]: data_clean_obj = data_cleaning()
# Apply the cleaning function to the text column
data_frame['clean_text'] = data_clean_obj.apply_to_column(data_frame['text'])

[20]: data_frame.head()

[20]: text spam length \
0 Subject: naturally irresistible your corporate… 1 1484
1 Subject: the stock trading gunslinger fanny i… 1 598
2 Subject: unbelievable new homes made easy im … 1 448
3 Subject: 4 color printing special request add… 1 500
4 Subject: do not have money , get software cds … 1 235

clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…

[21]: data_obj.data_frame.head()

[21]: text spam length \
0 Subject: naturally irresistible your corporate… 1 1484
1 Subject: the stock trading gunslinger fanny i… 1 598
2 Subject: unbelievable new homes made easy im … 1 448
3 Subject: 4 color printing special request add… 1 500
4 Subject: do not have money , get software cds … 1 235

clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…

[22]: data_obj.write_to_csvfile("processed_file.csv")

[23]: # Apply the count vectorizer to the cleaned message list
cv_object = apply_embedding_and_model()
spamham_countvectorizer = cv_object.apply_count_vector(data_frame['clean_text'])
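One caveat worth flagging: apply_count_vector returns only the document-term matrix, so the fitted CountVectorizer (and its vocabulary) is discarded, and a new email could not be transformed consistently later. A minimal sketch of keeping the vectorizer around (hypothetical, not part of the class above; the sample email is made up):

vec = CountVectorizer(min_df=2, analyzer="word")
X_demo = vec.fit_transform(data_frame['clean_text'])
new_email = ["subject free offer click now"]   # made-up example
print(vec.transform(new_email).shape)          # (1, vocabulary size)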

[24]: # Separate the descriptive and target features
X = spamham_countvectorizer
y = data_frame['spam'].values
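Since spam is only about 24% of the data, a stratified split would keep the class balance identical in the training and test sets; the class methods above use a plain random split. A minimal sketch, assuming the X and y just defined:

# Hypothetical stratified split (the pipeline above does not stratify).
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                          stratify=y, random_state=42)
print(y_tr.mean(), y_te.mean())  # spam share stays near 0.24 in both subsets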

[25]: cv_object.apply_naive_bayes(X,y)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       901
           1       0.98      0.99      0.98       245

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146

test set

Accuracy Score: 0.9921465968586387
F1 Score: 0.9817444219066936
Recall: 0.9877551020408163
Precision: 0.9758064516129032
Confusion matrix, without normalization
[[895   6]
 [  3 242]]
Normalized confusion matrix
[[0.99334073 0.00665927]
 [0.0122449  0.9877551 ]]

[Output: confusion matrix plots, raw and normalized.]
No Skill: ROC AUC=0.500
Naive Bayes: ROC AUC=0.998

[Output: ROC curve plot for Naive Bayes vs. no-skill baseline.]
[26]: cv_object.apply_svm(X,y)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       901
           1       0.98      0.98      0.98       245

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.99      1146
weighted avg       0.99      0.99      0.99      1146

test set

Accuracy Score: 0.9904013961605584
F1 Score: 0.9775051124744377
Recall: 0.9755102040816327
Precision: 0.9795081967213115
Confusion matrix, without normalization
[[896   5]
 [  6 239]]
Normalized confusion matrix
[[0.99445061 0.00554939]
 [0.0244898  0.9755102 ]]

[Output: confusion matrix plots, raw and normalized.]
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.998

[Output: ROC curve plot for SVM vs. no-skill baseline.]