Email Spam Detection
Email Spam Detection
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Load the mail dataset. The previously printed rows contain "ã¼", which is
# what the UTF-8 bytes of "ü" become when decoded as latin-1 and lowercased,
# so the file is decoded as UTF-8 to keep non-ASCII characters intact.
df = pd.read_csv(
    "C:/Users/dhamini_eashitha/Downloads/mail_data.csv",
    encoding='utf-8',
)
# The file must have exactly two columns; name them 'label' and 'message'.
df.columns = ['label', 'message']
# Normalise every message in place. NOTE(review): clean_text is not defined
# in the visible part of this file -- confirm it is defined before this line.
df['message'] = df['message'].apply(clean_text)
print(df)
label message
0 0 go until jurong point crazy available only in ...
1 0 ok lar joking wif u oni
2 1 free entry in a wkly comp to win fa cup final...
3 0 u dun say so early hor u c already then say
4 0 nah i dont think he goes to usf he lives aroun...
... ... ...
5567 1 this is the nd time we have tried contact u u...
5568 0 will ã¼ b going to esplanade fr home
5569 0 pity was in mood for that soany other suggest...
5570 0 the guy did some bitching but i acted like id ...
5571 0 rofl its true to its name
KNeighborsClassifier()
# Predict a label for every row of the test-set TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

# Report overall accuracy first, then the per-class precision/recall table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)
Accuracy: 0.9210762331838565
Classification Report:
precision recall f1-score support
# Confusion matrix of actual vs. predicted labels, drawn as an annotated heatmap.
class_names = ['Ham', 'Spam']
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 4))
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,   # write the count inside each cell
    fmt='d',      # counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
# Columns are the predicted class, rows the actual class.
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()
# Example prediction on a single hand-written message.
# Adjacent string literals are concatenated into one message.
sample_email = [
    "Congratulations! You've won a free iPhone. Click here "
    "to claim your prize."
]
# Apply the same clean_text normalisation that was applied to df['message'],
# so the sample is preprocessed like the cleaned corpus before vectorising.
# NOTE(review): assumes the vectorizer was fitted on the cleaned messages --
# confirm against the fitting code.
cleaned_sample = [clean_text(text) for text in sample_email]
sample_email_tfidf = vectorizer.transform(cleaned_sample)
prediction = model.predict(sample_email_tfidf)
# A predicted label of 1 is reported as spam; any other value as ham.
print("Spam" if prediction[0] == 1 else "Ham")
Ham