Spam Detection 1
May 5, 2025
[208]: df = pd.read_csv('spam.csv')
[209]: df.sample(5)
[209]:        v1                                               v2  Unnamed: 2  Unnamed: 3  Unnamed: 4
4809   ham  Honey, can you pls find out how much they sell…         NaN         NaN         NaN
392    ham  Morning only i can ok.                                  NaN         NaN         NaN
4231   ham  I'm at home. Please call                                NaN         NaN         NaN
2520   ham  Misplaced your number and was sending texts to…         NaN         NaN         NaN
2270   ham  U know we watchin at lido?                              NaN         NaN         NaN
[212]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 v1 5572 non-null object
1 v2 5572 non-null object
2 Unnamed: 2 50 non-null object
3 Unnamed: 3 12 non-null object
4 Unnamed: 4 6 non-null object
dtypes: object(5)
memory usage: 217.8+ KB
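The cell that removes the three mostly-empty `Unnamed` columns is not shown in this export; between [212] and [214] something like the following presumably ran:

df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)  # assumed: drop the junk columns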
[214]: df.sample(5)
[214]: v1 v2
5080 ham Yeah, give me a call if you've got a minute
2805 ham Can a not?
3223 ham Sorry da thangam.it's my mistake.
4499 ham Nvm take ur time.
589 ham I'm in a meeting, call me later at
[215]: df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.sample(5)
[217]: df.head()
[218]: df.isnull().sum()
[218]: target 0
text 0
dtype: int64
[219]: df.duplicated().sum()
[219]: np.int64(403)
[220]: df = df.drop_duplicates(keep='first')
[221]: df.duplicated().sum()
[221]: np.int64(0)
[222]: ## EDA
[223]: df.head()
[224]: df['target'].value_counts()
[224]: target
ham 4516
spam 653
Name: count, dtype: int64
[226]: # The data is imbalanced: only ~12.6% of messages are spam (653 of 5169)
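A quick check of the class ratio (this line is not in the original cell; `value_counts(normalize=True)` is standard pandas):

df['target'].value_counts(normalize=True)  # ham ≈ 0.874, spam ≈ 0.126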
[229]: nltk.download('punkt_tab')
[229]: True
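The cells that add the length features are missing from the export; a sketch of the usual construction (column names taken from the describe() call below, tokenizers from NLTK):

df['num_characters'] = df['text'].apply(len)                                   # character count
df['num_words'] = df['text'].apply(lambda t: len(nltk.word_tokenize(t)))      # token count
df['num_sentences'] = df['text'].apply(lambda t: len(nltk.sent_tokenize(t)))  # sentence count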
[231]: df.head()
[233]: df.head()
[233]: (wide output wrapped in the export; only the new num_words column is legible)
       num_words
0             24
1              8
2             37
3             13
4             15
[235]: df[['num_characters','num_words','num_sentences']].describe()
[236]: # For spam messages
df[df['target'] == 'spam'][['num_characters','num_words','num_sentences']].describe()
[240]: sns.pairplot(df,hue='target')
[241]: # Filter to only include numeric columns
numeric_df = df.select_dtypes(include=['number'])
sns.heatmap(numeric_df.corr(),annot=True)
[242]: # 3. Data Preprocessing
# (The imports, def line, tokenization step, and return below were cut off in
# the export and are reconstructed; the three filtering loops are the original.)
import string, nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Initialize stemmer
ps = PorterStemmer()

def transform_text(text):
    # Lowercase and tokenize
    text = nltk.word_tokenize(text.lower())
    # Keep alphanumeric tokens only
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    # Drop stopwords and punctuation
    text = y[:]
    y.clear()
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
    # Stem what remains
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    return " ".join(y)
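The cell applying the function is also missing; since a `transformed_text` column is used below, presumably:

df['transformed_text'] = df['text'].apply(transform_text)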
[135]: df.head()
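The cell that builds the `wc` object was lost along with the trimmed pip output above; a typical construction, assuming wordcloud's WordCloud class (size parameters are illustrative):

from wordcloud import WordCloud
wc_gen = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
wc = wc_gen.generate(df[df['target'] == 'spam']['transformed_text'].str.cat(sep=' '))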
[144]: # Display the word cloud
plt.imshow(wc)
[147]: df.head()
[148]: spam_corpus = []
for msg in df[df['target'] == 'spam']['transformed_text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)
[149]: len(spam_corpus)
[149]: 9939
# Bar chart of the 30 most common spam words; the top of this cell
# (the Counter/DataFrame/barplot setup, mirroring the ham cell below) was cut off
plt.tight_layout()
plt.show()
[151]: ham_corpus = []
for msg in df[df['target'] == 'ham']['transformed_text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)
[152]: len(ham_corpus)
[152]: 35404
# Get the most common ham words and create a DataFrame
ham_common = Counter(ham_corpus).most_common(30)  # top 30 words
ham_df = pd.DataFrame(ham_common, columns=['Word', 'Count'])
sns.barplot(data=ham_df, x='Word', y='Count')  # barplot call reconstructed; cut off in the export
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.show()
[154]: # 4. Model Building
# (tfidf and cv are presumably TfidfVectorizer() and CountVectorizer() from
# sklearn.feature_extraction.text, created in a cell not shown here)
[156]: X = tfidf.fit_transform(df['transformed_text']).toarray()  # fixed: original had bare .toarray (missing parentheses)
[157]: X = cv.fit_transform(df['transformed_text']).toarray()  # CountVectorizer variant; overwrites the TF-IDF features
[159]: X.shape
[161]: y = df['target'].values
[162]: y
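Several setup cells are missing here. The confusion matrices below imply a test set of 1034 messages (20% of 5169), and precision_score's default pos_label=1 implies numeric labels, so the setup was presumably along these lines (the label encoding and random_state are assumptions):

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

y = LabelEncoder().fit_transform(df['target'])  # assumed: ham -> 0, spam -> 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()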
[167]: gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))
0.8800773694390716
[[792 104]
[ 20 118]]
0.5315315315315315
[168]: mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))
0.9690522243713733
[[883 13]
[ 19 119]]
0.9015151515151515
[169]: bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))
0.9700193423597679
[[893 3]
[ 28 110]]
0.9734513274336283
[174]: clfs = {
'SVC': svc,
'KNR': knc,
'NB': bnb,
'DTC': dtc,
'lr':lrc,
'RF': rfc,
'AdaBoost': abc,
'BgC': bc,
'GBDT': gbdt,
'XGB': xgb # Changed 'xgb' to 'XGB' for consistency
}
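The instantiation cell for these classifiers is not in the export. A sketch of a compatible setup; the RandomForest and LogisticRegression settings are pinned down by the VotingClassifier repr near the end, the rest are assumptions:

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              BaggingClassifier, GradientBoostingClassifier)
from xgboost import XGBClassifier

svc = SVC(kernel='sigmoid', gamma=1.0)      # assumed hyperparameters
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)   # assumed
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)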
def train_classifier(clf, X_train, y_train, X_test, y_test):
    # (def line and fit call were cut off in the export and are reconstructed)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, precision

[177]: train_classifier(svc,X_train,y_train,X_test,y_test)
C:\Users\ank94\AppData\Local\Programs\Python\Python313\Lib\site-
packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning:
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use
`zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

def evaluate_model(model, X_train, y_train, X_test, y_test):
    # (def line and fit call reconstructed; cut off in the export)
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Calculate precision; zero_division=1 silences the warning above
    precision = precision_score(y_test, y_pred, zero_division=1)
    return precision

# Usage
svc = SVC()
precision = evaluate_model(svc, X_train, y_train, X_test, y_test)
print(f"Precision: {precision:.4f}")
Precision: 1.0000
[179]: accuracy_scores = []
precision_scores = []

# (loop header reconstructed from the traceback below; it was cut off in the export)
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print(f"For {name}:")
    print(f"Accuracy - {current_accuracy}")
    print(f"Precision - {current_precision}")
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
C:\Users\ank94\AppData\Local\Programs\Python\Python313\Lib\site-
packages\sklearn\metrics\_classification.py:1565: UndefinedMetricWarning:
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use
`zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
For SVC:
Accuracy - 0.8665377176015474
Precision - 0.0
For KNR:
Accuracy - 0.9313346228239845
Precision - 0.7768595041322314
For NB:
Accuracy - 0.9700193423597679
Precision - 0.9734513274336283
For DTC:
Accuracy - 0.9516441005802708
Precision - 0.8928571428571429
For lr:
Accuracy - 0.97678916827853
Precision - 0.9523809523809523
For RF:
Accuracy - 0.971953578336557
Precision - 0.990990990990991
For AdaBoost:
Accuracy - 0.9448742746615088
Precision - 0.8932038834951457
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[179], line 5
      2 precision_scores = []
      4 for name, clf in clfs.items():
----> 5     current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
      7     print(f"For {name}:")
      8     print(f"Accuracy - {current_accuracy}")

KeyboardInterrupt:
(The run was interrupted by hand while the BaggingClassifier was fitting its
trees, so BgC, GBDT, and XGB were never scored; the intermediate sklearn and
joblib frames are trimmed here.)
# Ensure all lists have the same length (use the minimum length)
min_length = min(len(algorithms), len(accuracy_scores), len(precision_scores))
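The cell building performance_df is missing; given the melted output in [183] below (sorted by precision, with variable/value columns), it was presumably:

performance_df = pd.DataFrame({
    'Algorithm': algorithms[:min_length],
    'Accuracy': accuracy_scores[:min_length],
    'Precision': precision_scores[:min_length]
}).sort_values('Precision', ascending=False)
performance_df1 = pd.melt(performance_df, id_vars='Algorithm')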
[181]: performance_df
[183]: performance_df1
[183]: Algorithm variable value
0 RF Accuracy 0.971954
1 NB Accuracy 0.970019
2 lr Accuracy 0.976789
3 AdaBoost Accuracy 0.944874
4 DTC Accuracy 0.951644
5 KNR Accuracy 0.931335
6 SVC Accuracy 0.866538
7 RF Precision 0.990991
8 NB Precision 0.973451
9 lr Precision 0.952381
10 AdaBoost Precision 0.893204
11 DTC Precision 0.892857
12 KNR Precision 0.776860
13 SVC Precision 0.000000
[185]: # First convert clfs.keys() to a list
algorithms = list(clfs.keys())

# Calculate the minimum length (the interrupted run scored only 7 of 10 models)
min_length = min(len(algorithms), len(accuracy_scores), len(precision_scores))

# Create the DataFrame using only the first min_length elements of each list
temp_df = pd.DataFrame({
    'Algorithm': algorithms[:min_length],
    'Accuracy_scaling': accuracy_scores[:min_length],
    'Precision_scaling': precision_scores[:min_length]
}).sort_values('Precision_scaling', ascending=False)

# Same construction for the num_characters experiment (this overwrites temp_df)
temp_df = pd.DataFrame({
    'Algorithm': algorithms[:min_length],
    'Accuracy_num_chars': accuracy_scores[:min_length],
    'Precision_num_chars': precision_scores[:min_length]
}).sort_values('Precision_num_chars', ascending=False)

[191]: new_df_scaled.merge(temp_df,on='Algorithm')
[191]: (leading columns cut off in the export; rows reassembled from the wrapped
output and the scores in [183], sorted by precision)
   Accuracy_scaling  Precision_scaling  Accuracy_num_chars  Precision_num_chars
0          0.971954           0.990991            0.971954             0.990991
1          0.970019           0.973451            0.970019             0.973451
2          0.976789           0.952381            0.976789             0.952381
3          0.944874           0.893204            0.944874             0.893204
4          0.951644           0.892857            0.951644             0.892857
5          0.931335           0.776860            0.931335             0.776860
6          0.866538           0.000000            0.866538             0.000000
[192]: new_df_scaled
[192]: (leading columns cut off in the export; the legible tail)
   Accuracy_scaling_y  Precision_scaling_y
0            0.971954             0.990991
1            0.970019             0.973451
2            0.976789             0.952381
3            0.944874             0.893204
4            0.951644             0.892857
5            0.931335             0.776860
6            0.866538             0.000000
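The cell defining `voting` is not shown, but its repr below pins down the configuration, so the construction was presumably:

from sklearn.ensemble import VotingClassifier
voting = VotingClassifier(estimators=[
    ('rf', RandomForestClassifier(n_estimators=50, random_state=2)),
    ('bnb', BernoulliNB()),
    ('lr', LogisticRegression(penalty='l1', solver='liblinear'))
], voting='soft')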
[195]: voting.fit(X_train,y_train)
[195]: VotingClassifier(estimators=[('rf',
                                     RandomForestClassifier(n_estimators=50,
                                                            random_state=2)),
                                    ('bnb', BernoulliNB()),
                                    ('lr',
                                     LogisticRegression(penalty='l1',
                                                        solver='liblinear'))],
                        voting='soft')
[196]: VotingClassifier(estimators=[('rf',
RandomForestClassifier(n_estimators=100,
random_state=2)),
('bnb', BernoulliNB()),
('lr', LogisticRegression())],
voting='soft')
Accuracy 0.9758220502901354
Precision 1.0
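The `clf` fitted in [202] is defined in a missing cell; a plausible reading, assuming a stacking ensemble over the same base estimators (an assumption, not confirmed by the export):

from sklearn.ensemble import StackingClassifier
estimators = [('rf', RandomForestClassifier(n_estimators=50, random_state=2)),
              ('bnb', BernoulliNB()),
              ('lr', LogisticRegression())]
clf = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())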
[202]: clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))
Accuracy 0.97678916827853
Precision 0.9672131147540983
[246]: import pickle
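The serialization step itself is empty in the export; the usual final step, assuming `tfidf` and `mnb` are the chosen vectorizer and model (filenames are illustrative):

pickle.dump(tfidf, open('vectorizer.pkl', 'wb'))  # save the fitted vectorizer
pickle.dump(mnb, open('model.pkl', 'wb'))         # save the trained classifier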