
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from google.colab import drive


drive.mount('/content/drive')

Mounted at /content/drive

#Reading the data, adding column names, and merging it into one frame

df1 = pd.read_csv('/content/drive/MyDrive/kaggle/ test_data.txt', sep='\t')
df2 = pd.read_csv('/content/drive/MyDrive/kaggle/ test_data.txt', sep='\t')
df3 = pd.read_csv('/content/drive/MyDrive/kaggle/Validation_data.txt', sep='\t')
print(df1.shape)
print(df2.shape)
print(df3.shape)

cols = ['index', 'id', 'label', 'statement', 'subject', 'speaker', 'JobTitle',
        'State', 'Party', 'BTC', 'FC', 'HT', 'MT', 'POF', 'context',
        'justification']
df1.columns = cols
df2.columns = cols
df3.columns = cols
df = pd.concat([df1, df2, df3], axis=0)
print(df.shape)
df.head()

(1266, 16)
(1266, 16)
(1283, 16)
(3815, 16)

[df.head() was rendered as Colab's interactive dataframe widget; its payload
is summarized here. df: 3815 rows x 16 columns (index, id, label, statement,
subject, speaker, JobTitle, State, Party, BTC, FC, HT, MT, POF, context,
justification). label has 6 categories (e.g. 'false', 'half-true',
'mostly-true'); BTC, FC, HT, MT and POF are small integer counts; the
remaining columns are strings/categories.]

import seaborn as sns
from sklearn.preprocessing import LabelEncoder

df_copy = df.copy()
df_copy.drop(['statement', 'subject', 'justification'], axis=1, inplace=True)
df_copy['id'] = df_copy['id'].apply(lambda x: x[:-5])   # strip the '.json' suffix
df_copy['id'] = df_copy['id'].astype('int64')
le = LabelEncoder()
df_copy['label'] = le.fit_transform(df_copy['label'])
df_copy['speaker'] = le.fit_transform(df_copy['speaker'])
df_copy['JobTitle'] = le.fit_transform(df_copy['JobTitle'])
df_copy['State'] = le.fit_transform(df_copy['State'])
df_copy['Party'] = le.fit_transform(df_copy['Party'])
print(df_copy.head())
# numeric_only=True excludes the string 'context' column explicitly
corr_matrix = df_copy.corr(numeric_only=True)
plt.figure(figsize=(11, 11))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

   index     id  label  speaker  JobTitle  State  Party  BTC   FC  HT  MT  POF                       context
0      1  11685      1      573       342     51      5    2    1   0   0    0             a news conference
1      2  11096      1      274       267     29     15   63  114  51  37   61  comments on ABC's This Week.
2      3   5209      2      866       424     34     15    1    1   3   1    1                  a radio show
3      4   9524      4      945       485     51      5    5    7   2   2    7                   a web video
4      5   5962      5      118       485     36     15    1    2   1   1    0            a campaign website

sns.scatterplot(x='BTC',y='FC',data=df_copy)
plt.show()
sns.scatterplot(x='POF',y='HT',data=df_copy)
plt.show()
sns.scatterplot(x='MT',y='FC',data=df_copy)
plt.show()
#Dropping non-required columns

df = df.drop(['index', 'id', 'JobTitle', 'State', 'BTC', 'FC', 'HT', 'MT',
              'POF', 'context', 'justification'], axis=1)
df.head()

[df.head() rendered as Colab's interactive dataframe widget. df now has 3815
rows x 5 columns: label (6 categories), statement, subject, speaker, Party.]

#Converting data into a binary classification target

df['label'] = df['label'].map({'true': 1, 'half-true': 1, 'mostly-true': 1,
                               'false': 0, 'pants-fire': 0, 'barely-true': 0})
df.head()

[df.head() rendered as Colab's interactive dataframe widget. df: 3815 rows x
5 columns; label is now numeric with the two values 0 and 1.]
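# Quick sanity check on the new binary target (a minimal sketch; assumes the
# mapping above covered every label category):
print(df['label'].value_counts())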
#Merge the statement and subject columns into one column for analysis

df['text'] = df['subject'] + ' ' + df['statement']
df = df.drop(['subject', 'statement'], axis=1)
df.head()

[df.head() rendered as Colab's interactive dataframe widget. df: 3815 rows x
4 columns -- label (0/1), speaker, Party, and the merged text column, e.g.
"abortion Studies have shown that in the absence of federal reproductive
health funds, ...".]

#Dropping rows with missing values

print("Number of missing values in each column:")
print(df.isnull().sum())
print("We drop the missing values")
df = df.dropna()
print("The shape of the dataset is now: ", df.shape)

Number of missing values in each column:


label 0
speaker 0
Party 0
text 0
dtype: int64
We drop the missing values
The shape of the dataset is now: (3815, 4)

#Converting data into lowercase

# Note: astype(str) also casts the numeric label column to the strings
# '0'/'1', which is why model predictions print as strings further below.
df = df.apply(lambda x: x.astype(str).str.lower())
print(df.head())

label speaker Party \


0 0 katrina-shankland democrat
1 0 donald-trump republican
2 1 rob-cornilles republican
3 0 state-democratic-party-wisconsin democrat
4 1 brendan-doherty republican

text
0 jobs wisconsin is on pace to double the number...
1 military,veterans,voting-record says john mcca...
2 medicare,message-machine-2012,campaign-adverti...
3 campaign-finance,legal-issues,campaign-adverti...
4 federal-budget,pensions,retirement over the pa...

#Removing punctuation except commas, plus links and extra whitespace

df['text'] = df['text'].str.replace(r'[^\w\s,]', '', regex=True)
df['text'] = df['text'].str.replace(r'http\S+|www\.\S+', '', case=False,
                                    regex=True)
df['text'] = df['text'].str.replace(' ,', ',', regex=False)
df['text'] = df['text'].str.replace(', ', ',', regex=False)


print(df.head())

label speaker Party \


0 0 katrina-shankland democrat
1 0 donald-trump republican
2 1 rob-cornilles republican
3 0 state-democratic-party-wisconsin democrat
4 1 brendan-doherty republican

text
0 jobs wisconsin is on pace to double the number...
1 military,veterans,votingrecord says john mccai...
2 medicare,messagemachine2012,campaignadvertisin...
3 campaignfinance,legalissues,campaignadvertisin...
4 federalbudget,pensions,retirement over the pas...

#Tokenization of text column


import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.

True

from nltk.tokenize import RegexpTokenizer

# Assuming df is your DataFrame
tokenizer = RegexpTokenizer(r'\w+')

# Check the data types in the 'text' column
print(df['text'].apply(type).value_counts())

# Convert non-string values to strings
df['text'] = df['text'].astype(str)

# Apply the tokenizer
df['text'] = df['text'].apply(lambda x: tokenizer.tokenize(x))

# Print the DataFrame
print(df.head())

<class 'str'> 3815


Name: text, dtype: int64
label speaker Party \
0 0 katrina-shankland democrat
1 0 donald-trump republican
2 1 rob-cornilles republican
3 0 state-democratic-party-wisconsin democrat
4 1 brendan-doherty republican

text
0 [jobs, wisconsin, is, on, pace, to, double, th...
1 [military, veterans, votingrecord, says, john,...
2 [medicare, messagemachine2012, campaignadverti...
3 [campaignfinance, legalissues, campaignadverti...
4 [federalbudget, pensions, retirement, over, th...

#Lemmatization of text column

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])
print(df.head())

label speaker Party \


0 0 katrina-shankland democrat
1 0 donald-trump republican
2 1 rob-cornilles republican
3 0 state-democratic-party-wisconsin democrat
4 1 brendan-doherty republican

text
0 [job, wisconsin, is, on, pace, to, double, the...
1 [military, veteran, votingrecord, say, john, m...
2 [medicare, messagemachine2012, campaignadverti...
3 [campaignfinance, legalissues, campaignadverti...
4 [federalbudget, pension, retirement, over, the...

#Stop word removal from text column

from nltk.corpus import stopwords

stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: [item for item in x if item not in stop])
print(df.head())

label speaker Party \


0 0 katrina-shankland democrat
1 0 donald-trump republican
2 1 rob-cornilles republican
3 0 state-democratic-party-wisconsin democrat
4 1 brendan-doherty republican

text
0 [job, wisconsin, pace, double, number, layoff,...
1 [military, veteran, votingrecord, say, john, m...
2 [medicare, messagemachine2012, campaignadverti...
3 [campaignfinance, legalissues, campaignadverti...
4 [federalbudget, pension, retirement, past, fiv...
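# The cleaning steps above (lowercasing, punctuation/link removal,
# tokenization, lemmatization, stop-word removal) can be collected into one
# reusable function; a minimal sketch, assuming the nltk corpora downloaded
# earlier are available:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

_tokenizer = RegexpTokenizer(r'\w+')
_lemmatizer = WordNetLemmatizer()
_stop = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)   # drop links
    text = re.sub(r'[^\w\s,]', '', text)           # drop punctuation except commas
    tokens = _tokenizer.tokenize(text)
    return [_lemmatizer.lemmatize(t) for t in tokens if t not in _stop]

# e.g. df['text'].apply(preprocess_text) would closely reproduce the token
# lists built step by step above.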

#Joining text column into a string for vectorization

df['text'] = df['text'].apply(lambda x: ' '.join(x))

#Visualization of data

from wordcloud import WordCloud


all_words = ' '.join([text for text in df['text']])
wordcloud = WordCloud(width=800, height=500, random_state=21,
max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

#Using TF-IDF and BOW for vectorization

def to_vector_Tfidf(df, col):
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=2000)
    vectorizer.fit(df[col])
    return vectorizer.transform(df[col])

def to_vector_bow(df, col):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(max_features=2000)
    vectorizer.fit(df[col])
    return vectorizer.transform(df[col])

text_vector_tfidf = to_vector_Tfidf(df, 'text')
print("Shape of the tfidf vector: ", text_vector_tfidf.shape)
print(text_vector_tfidf.shape)
text_vector_bow = to_vector_bow(df, 'text')
print("Shape of the text vector for bow vectorization: ",
      text_vector_bow.shape)
print(text_vector_bow.shape)

Shape of the tfidf vector: (3815, 2000)


(3815, 2000)
Shape of the text vector for bow vectorization: (3815, 2000)
(3815, 2000)
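# Caveat: the vectorizers above are fit on the full dataset, so vocabulary
# and idf statistics leak from the test rows into the features. A
# leakage-free variant fits on the training texts only; a sketch
# (train_texts/test_texts are assumed pandas Series of preprocessed text):
from sklearn.feature_extraction.text import TfidfVectorizer

def to_vector_tfidf_split(train_texts, test_texts, max_features=2000):
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train = vectorizer.fit_transform(train_texts)  # fit on training data only
    X_test = vectorizer.transform(test_texts)        # reuse the fitted vocabulary
    return X_train, X_test, vectorizer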

#Running PCA

from sklearn.decomposition import PCA

text_vector_tfidf_copy = text_vector_tfidf.copy()
pca = PCA(n_components=2)
# fit_transform returns the samples projected onto the first two principal
# components (pca.components_ holds per-feature loadings, not sample
# coordinates), so the projected data is what gets scattered.
tfidf_pca = pca.fit_transform(text_vector_tfidf_copy.toarray())
plt.figure(figsize=(10, 7))
plt.scatter(tfidf_pca[:, 0], tfidf_pca[:, 1])
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA on tfidf vector')
plt.show()

text_vector_bow_copy = text_vector_bow.copy()
pca = PCA(n_components=2)
bow_pca = pca.fit_transform(text_vector_bow_copy.toarray())
plt.figure(figsize=(10, 7))
plt.scatter(bow_pca[:, 0], bow_pca[:, 1])
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA on bow vector')
plt.show()
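# Before reading the scatter plots, it is worth checking how much variance
# two components actually capture; for the most recently fitted pca above:
print("Explained variance ratio:", pca.explained_variance_ratio_)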
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_text_vector_tfidf_tsne_copy = text_vector_tfidf.copy()
tsne_text_vector_tfidf_tsne_copy = tsne.fit_transform(
    tsne_text_vector_tfidf_tsne_copy.toarray())
plt.figure(figsize=(10, 7))
import seaborn as sns
sns.scatterplot(x=tsne_text_vector_tfidf_tsne_copy[:, 0],
                y=tsne_text_vector_tfidf_tsne_copy[:, 1], hue=df['label'])
plt.title('TSNE on tfidf vector')
plt.show()

[t-SNE] Computing 121 nearest neighbors...


[t-SNE] Indexed 3815 samples in 0.005s...
[t-SNE] Computed neighbors for 3815 samples in 1.747s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3815
[t-SNE] Computed conditional probabilities for sample 2000 / 3815
[t-SNE] Computed conditional probabilities for sample 3000 / 3815
[t-SNE] Computed conditional probabilities for sample 3815 / 3815
[t-SNE] Mean sigma: 0.446278
[t-SNE] KL divergence after 50 iterations with early exaggeration:
82.590515
[t-SNE] KL divergence after 300 iterations: 2.271853

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_text_vector_bow_tsne_copy = text_vector_bow.copy()
tsne_text_vector_bow_tsne_copy = tsne.fit_transform(
    tsne_text_vector_bow_tsne_copy.toarray())
plt.figure(figsize=(10, 7))
sns.scatterplot(x=tsne_text_vector_bow_tsne_copy[:, 0],
                y=tsne_text_vector_bow_tsne_copy[:, 1], hue=df['label'])
plt.title('TSNE on bow vector')
plt.show()

[t-SNE] Computing 121 nearest neighbors...


[t-SNE] Indexed 3815 samples in 0.005s...
[t-SNE] Computed neighbors for 3815 samples in 1.741s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3815
[t-SNE] Computed conditional probabilities for sample 2000 / 3815
[t-SNE] Computed conditional probabilities for sample 3000 / 3815
[t-SNE] Computed conditional probabilities for sample 3815 / 3815
[t-SNE] Mean sigma: 1.387043
[t-SNE] KL divergence after 250 iterations with early exaggeration:
89.803696
[t-SNE] KL divergence after 300 iterations: 3.507192

text_vector_tfidf = text_vector_tfidf.toarray()
text_vector_bow = text_vector_bow.toarray()

label_vector = df['label'].values
speaker_vector = df['speaker'].values
party_vector = df['Party'].values
label_vector = label_vector.reshape(-1, 1)
speaker_vector = speaker_vector.reshape(-1, 1)
party_vector = party_vector.reshape(-1, 1)

dataF1 = np.concatenate((text_vector_tfidf, label_vector), axis=1)
dataF2 = np.concatenate((text_vector_bow, label_vector), axis=1)
dataF3 = np.concatenate((text_vector_tfidf, label_vector, speaker_vector,
                         party_vector), axis=1)
dataF4 = np.concatenate((text_vector_bow, label_vector, speaker_vector,
                         party_vector), axis=1)
print(dataF1.shape)
print(dataF2.shape)
print(dataF3.shape)
print(dataF4.shape)

(3815, 2001)
(3815, 2001)
(3815, 2003)
(3815, 2003)
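# Caveat: np.concatenate promotes the mixed float/string inputs to a single
# string dtype, which is why the labels (and every model's predictions
# below) print as the strings '0' and '1'. A sketch that keeps the features
# numeric instead, using a hypothetical dataF1_alt:
dataF1_alt = pd.DataFrame(text_vector_tfidf)   # float tf-idf features
dataF1_alt['label'] = label_vector.ravel()     # label stays a separate column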

dataF1=pd.DataFrame(dataF1)
dataF2=pd.DataFrame(dataF2)
dataF3=pd.DataFrame(dataF3)
dataF4=pd.DataFrame(dataF4)
print(dataF1.head())

     0    1    2    3    4    5    6    7    8    9  ...     1991  1992  1993  1994  1995  1996  1997  1998  1999  2000
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.00000   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     0
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.00000   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     0
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.00000   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     1
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.32355   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     0
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.00000   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     1

[5 rows x 2001 columns]

for i in range(2000):
dataF1.rename(columns={i: 'tfidf'+str(i)}, inplace=True)
dataF2.rename(columns={i: 'bow'+str(i)}, inplace=True)
dataF3.rename(columns={i: 'tfidf'+str(i)}, inplace=True)
dataF4.rename(columns={i: 'bow'+str(i)}, inplace=True)
dataF1.rename(columns={2000: 'label'}, inplace=True)
dataF2.rename(columns={2000: 'label'}, inplace=True)
dataF3.rename(columns={2000: 'label'}, inplace=True)
dataF4.rename(columns={2000: 'label'}, inplace=True)
dataF3.rename(columns={2001: 'speaker'}, inplace=True)
dataF4.rename(columns={2001: 'speaker'}, inplace=True)
dataF3.rename(columns={2002: 'party'}, inplace=True)
dataF4.rename(columns={2002: 'party'}, inplace=True)
from sklearn.preprocessing import LabelEncoder
le3=LabelEncoder()
le4=LabelEncoder()
dataF3['speaker']=le3.fit_transform(dataF3['speaker'])
dataF4['speaker']=le4.fit_transform(dataF4['speaker'])
dataF3['party']=le3.fit_transform(dataF3['party'])
dataF4['party']=le4.fit_transform(dataF4['party'])

from sklearn.model_selection import train_test_split

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    dataF1.drop('label', axis=1), dataF1['label'], test_size=0.2,
    random_state=0)
X_val1, X_test1, y_val1, y_test1 = train_test_split(
    X_test1, y_test1, test_size=0.5, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    dataF2.drop('label', axis=1), dataF2['label'], test_size=0.2,
    random_state=0)
X_val2, X_test2, y_val2, y_test2 = train_test_split(
    X_test2, y_test2, test_size=0.5, random_state=0)
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    dataF3.drop('label', axis=1), dataF3['label'], test_size=0.2,
    random_state=0)
X_val3, X_test3, y_val3, y_test3 = train_test_split(
    X_test3, y_test3, test_size=0.5, random_state=0)
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    dataF4.drop('label', axis=1), dataF4['label'], test_size=0.2,
    random_state=0)
X_val4, X_test4, y_val4, y_test4 = train_test_split(
    X_test4, y_test4, test_size=0.5, random_state=0)
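# The four near-identical splits above can come from one helper; a minimal
# sketch (80/10/10 train/val/test, matching the proportions used above):
def split_80_10_10(data, target='label', seed=0):
    X_train, X_rest, y_train, y_rest = train_test_split(
        data.drop(target, axis=1), data[target], test_size=0.2,
        random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=seed)
    return X_train, X_val, X_test, y_train, y_val, y_test

# e.g. X_train1, X_val1, X_test1, y_train1, y_val1, y_test1 = split_80_10_10(dataF1)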

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

print(dataF1.head())
print(dataF3.head())

dataF1.head(): 5 rows x 2001 columns (tfidf0 ... tfidf1999 plus label). The
tf-idf entries are almost all 0.0 (row 3 has tfidf1991 = 0.32355) and the
label column reads 0, 0, 1, 0, 1.

dataF3.head(): 5 rows x 2003 columns (the same tf-idf and label columns plus
the encoded speaker and party):

   speaker  party
0      570      5
1      269     15
2      865     15
3      944      5
4      113     15

[5 rows x 2003 columns]

#Grid Search and Learning Curves

from sklearn.model_selection import learning_curve


from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#1.Logistic Regression Model

# Cell 1: Training and Saving the Logistic Regression Model
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assume X_train4, y_train4, X_val4, y_val4 are defined
def logreg_grid_learn(X_train, y_train, X_val, y_val):
    logreg = LogisticRegression(random_state=0, max_iter=20000)
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid_search = GridSearchCV(logreg, param_grid, cv=3, refit=True,
                               n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # Save the trained model
    Pkl_Filename = "Pickle_LogReg_Model.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(grid_search, file)

# Call the function and save the result
logreg_grid_learn(X_train4, y_train4, X_val4, y_val4)

Fitting 3 folds for each of 7 candidates, totalling 21 fits

# Loading the Logistic Regression model
Pkl_Filename = "Pickle_LogReg_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    Pickled_grid_search = pickle.load(file)

# Printing the parameters of the model
print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)
print("Accuracy on Validation Set: ",
      accuracy_score(y_val4, Pickled_grid_search.predict(X_val4)))

Best Parameters: {'C': 10}


Best Score: 0.7113317222154608
Accuracy on Validation Set: 0.8162729658792651

# Printing classification report on the validation set
print("Classification Report on Validation Set:")
print(classification_report(y_val4, Pickled_grid_search.predict(X_val4)))

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       161
           1       0.85      0.83      0.84       220

    accuracy                           0.82       381
   macro avg       0.81      0.81      0.81       381
weighted avg       0.82      0.82      0.82       381

# Plotting the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    Pickled_grid_search.best_estimator_, X_train4, y_train4, cv=3,
    scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
         label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')
plt.title("Learning Curve for Logistic Regression")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

[learning_curve] Training set sizes: [ 203 661 1118 1576 2034]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.4min finished
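# The same learning-curve plot recurs for every model below; a reusable
# sketch with the same cv/scoring settings:
def plot_learning_curve(estimator, X, y, title):
    sizes, tr_scores, te_scores = learning_curve(
        estimator, X, y, cv=3, scoring='accuracy', n_jobs=-1, shuffle=True)
    tr_mean, tr_std = tr_scores.mean(axis=1), tr_scores.std(axis=1)
    te_mean, te_std = te_scores.mean(axis=1), te_scores.std(axis=1)
    plt.plot(sizes, tr_mean, color='blue', marker='o',
             label='training accuracy')
    plt.fill_between(sizes, tr_mean + tr_std, tr_mean - tr_std,
                     alpha=0.15, color='blue')
    plt.plot(sizes, te_mean, color='green', linestyle='--', marker='s',
             label='validation accuracy')
    plt.fill_between(sizes, te_mean + te_std, te_mean - te_std,
                     alpha=0.15, color='green')
    plt.title(title)
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy Score")
    plt.legend(loc='best')
    plt.show()

# e.g. plot_learning_curve(Pickled_grid_search.best_estimator_, X_train4,
#                          y_train4, "Learning Curve for Logistic Regression")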

# Assuming you have defined X_test4 before
X_new = X_test4

# Load the pickled Logistic Regression model
Pkl_Filename = "Pickle_LogReg_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Make predictions using the loaded model
predictions = loaded_model.predict(X_new)

# Print some information for debugging
print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class
# mapping (the classes are the strings '0'/'1' because of the earlier
# astype(str) step)
if predictions[0] == '0':
    print('The news is fake.')
elif predictions[0] == '1':
    print('The news is true.')
else:
    print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training
class_labels = loaded_model.classes_
print("Class labels:", class_labels)

Predictions: ['1' '1' '1' '1' '0' '0' '1' '1' '1' '0' '1' '0' '1' '1'
'1' '0' '1' '0'
'0' '0' '1' '0' '1' '0' '1' '1' '1' '0' '1' '1' '1' '1' '1' '0' '0'
'0'
'0' '1' '1' '1' '0' '0' '1' '0' '1' '1' '1' '0' '1' '0' '1' '0' '1'
'0'
'1' '0' '0' '1' '1' '1' '1' '0' '1' '0' '1' '1' '1' '0' '1' '1' '0'
'1'
'1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '0' '0' '0' '1' '0' '0'
'0'
'1' '0' '0' '0' '0' '1' '0' '1' '0' '1' '1' '0' '1' '0' '1' '1' '1'
'1'
'0' '0' '0' '0' '0' '0' '1' '0' '1' '1' '1' '1' '0' '1' '0' '0' '0'
'0'
'1' '1' '1' '0' '1' '0' '1' '1' '1' '1' '1' '0' '1' '1' '0' '1' '0'
'0'
'0' '0' '0' '1' '0' '0' '0' '1' '0' '1' '0' '1' '1' '0' '1' '1' '1'
'0'
'1' '0' '0' '1' '1' '0' '0' '1' '0' '0' '0' '0' '1' '1' '0' '1' '0'
'1'
'1' '0' '0' '1' '1' '1' '1' '1' '0' '1' '1' '0' '0' '0' '1' '1' '0'
'0'
'1' '0' '1' '1' '1' '0' '0' '1' '0' '1' '0' '0' '0' '0' '0' '0' '0'
'1'
'0' '1' '1' '0' '0' '0' '0' '0' '1' '1' '0' '0' '1' '0' '0' '1' '0'
'1'
'1' '1' '0' '1' '1' '1' '1' '0' '1' '0' '0' '0' '0' '0' '0' '1' '1'
'1'
'1' '1' '1' '1' '0' '1' '1' '0' '1' '1' '0' '0' '1' '1' '0' '0' '1'
'1'
'1' '1' '1' '0' '1' '0' '0' '1' '0' '0' '0' '1' '0' '0' '0' '0' '1'
'1'
'1' '0' '1' '1' '0' '0' '0' '1' '0' '0' '0' '1' '0' '0' '0' '1' '1'
'1'
'1' '1' '1' '1' '0' '1' '0' '1' '0' '0' '1' '0' '1' '1' '0' '0' '1'
'1'
'1' '0' '1' '1' '1' '0' '0' '1' '0' '1' '0' '1' '1' '1' '0' '1' '0'
'1'
'1' '0' '0' '1' '0' '0' '0' '0' '1' '0' '1' '1' '1' '1' '0' '1' '0'
'0'
'1' '1' '0' '1' '1' '1' '0' '1' '0' '1' '1' '0' '0' '1' '1' '0' '1'
'0'
'0' '0' '1' '0']
First prediction: 1
The news is true.
Class labels: ['0' '1']

#2.Gaussian Naive Bayes Model

# Cell 1: Training and Saving the Gaussian Naive Bayes Model
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assume X_train4, y_train4, X_val4, y_val4 are defined
def gauss_grid_learn(X_train, y_train, X_val, y_val):
    gnb = GaussianNB()
    param_grid = {'var_smoothing': np.logspace(0, -9, num=10)}
    grid_search = GridSearchCV(gnb, param_grid, cv=3, refit=True,
                               n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # Save the trained model
    Pkl_Filename = "Pickle_GaussianNB_Model.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(grid_search, file)

# Call the function and save the result
gauss_grid_learn(X_train4, y_train4, X_val4, y_val4)

Fitting 3 folds for each of 10 candidates, totalling 30 fits

# Loading the Gaussian Naive Bayes model
Pkl_Filename = "Pickle_GaussianNB_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    Pickled_grid_search = pickle.load(file)

# Printing the parameters of the model
print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)
print("Accuracy on Validation Set: ",
      accuracy_score(y_val4, Pickled_grid_search.predict(X_val4)))

Best Parameters: {'var_smoothing': 1e-08}
Best Score: 0.628108018305699
Accuracy on Validation Set:  0.6902887139107612

# Printing classification report on the validation set
print("Classification Report on Validation Set:")
print(classification_report(y_val4, Pickled_grid_search.predict(X_val4)))

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.60      0.79      0.68       161
           1       0.80      0.62      0.70       220

    accuracy                           0.69       381
   macro avg       0.70      0.70      0.69       381
weighted avg       0.72      0.69      0.69       381

# Plotting the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    Pickled_grid_search.best_estimator_, X_train4, y_train4, cv=3,
    scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
         label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')
plt.title("Learning Curve for Gaussian Naive Bayes")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

[learning_curve] Training set sizes: [ 203  661 1118 1576 2034]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  13.8s finished

# Assuming you have defined X_test4 before
X_new = X_test4

# Load the pickled Gaussian Naive Bayes model
Pkl_Filename = "Pickle_GaussianNB_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Make predictions using the loaded model
predictions = loaded_model.predict(X_new)

# Print some information for debugging
print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class mapping
if predictions[0] == '0':
    print('The news is fake.')
elif predictions[0] == '1':
    print('The news is true.')
else:
    print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training
class_labels = loaded_model.classes_
print("Class labels:", class_labels)

Predictions: ['1' '0' '1' '1' '0' '0' '0' '0' '1' '0' '1' '1' '1' '0'
'1' '0' '1' '0'
'0' '1' '1' '0' '1' '0' '1' '1' '0' '0' '0' '1' '0' '1' '1' '0' '0'
'1'
'0' '0' '1' '0' '0' '0' '1' '1' '1' '0' '1' '0' '1' '0' '0' '0' '1'
'0'
'1' '0' '0' '0' '0' '1' '1' '0' '0' '0' '0' '1' '1' '0' '0' '1' '0'
'1'
'0' '1' '1' '0' '1' '0' '0' '1' '1' '0' '0' '0' '0' '1' '1' '0' '0'
'0'
'1' '0' '0' '1' '1' '1' '0' '1' '1' '1' '1' '0' '0' '0' '1' '1' '0'
'1'
'0' '0' '0' '0' '0' '0' '1' '0' '1' '1' '0' '1' '1' '1' '0' '0' '1'
'0'
'1' '0' '0' '0' '0' '0' '1' '1' '1' '1' '1' '1' '0' '1' '0' '1' '0'
'0'
'0' '0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '0' '1' '0' '0' '0' '1'
'0'
'1' '0' '0' '1' '1' '0' '0' '1' '0' '0' '1' '0' '1' '1' '0' '0' '0'
'1'
'1' '0' '1' '1' '1' '1' '1' '0' '0' '0' '1' '1' '1' '0' '1' '0' '0'
'1'
'0' '1' '1' '1' '0' '0' '0' '1' '0' '1' '0' '0' '0' '1' '0' '0' '0'
'1'
'0' '1' '1' '0' '0' '0' '0' '0' '0' '1' '0' '0' '1' '0' '0' '1' '0'
'1'
'0' '1' '1' '1' '1' '1' '1' '0' '1' '0' '0' '0' '0' '1' '1' '1' '1'
'1'
'0' '1' '0' '1' '0' '1' '0' '0' '0' '0' '0' '1' '1' '0' '0' '0' '0'
'0'
'0' '0' '1' '0' '1' '0' '0' '1' '1' '0' '0' '1' '0' '0' '0' '0' '1'
'1'
'1' '0' '1' '1' '0' '0' '0' '1' '0' '0' '0' '1' '0' '0' '0' '1' '0'
'1'
'1' '1' '1' '1' '0' '1' '0' '1' '0' '0' '1' '0' '0' '1' '0' '0' '0'
'1'
'1' '0' '1' '1' '1' '0' '0' '1' '0' '1' '0' '0' '0' '0' '1' '1' '1'
'1'
'1' '1' '1' '1' '0' '0' '0' '0' '1' '0' '1' '0' '0' '0' '0' '0' '1'
'1'
'1' '0' '1' '0' '1' '1' '0' '1' '0' '0' '1' '0' '0' '0' '1' '0' '1'
'0'
'0' '1' '1' '0']
First prediction: 1
The news is true.
Class labels: ['0' '1']

#3.Decision Tree Model

# Cell 1: Training and Saving the Decision Tree Model
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assume X_train4, y_train4, X_val4, y_val4 are defined
def decision_tree_grid_learn(X_train, y_train, X_val, y_val):
    dtree = DecisionTreeClassifier(random_state=0)
    param_grid = {'max_depth': [None, 10, 20, 30, 40, 50],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4]}
    grid_search = GridSearchCV(dtree, param_grid, cv=3, refit=True,
                               n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # Save the trained model
    Pkl_Filename = "Pickle_DecisionTree_Model.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(grid_search, file)

# Call the function and save the result
decision_tree_grid_learn(X_train4, y_train4, X_val4, y_val4)

Fitting 3 folds for each of 54 candidates, totalling 162 fits

# Loading the Decision Tree model
Pkl_Filename = "Pickle_DecisionTree_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    Pickled_grid_search = pickle.load(file)

# Printing the parameters of the model
print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)
print("Accuracy on Validation Set: ",
      accuracy_score(y_val4, Pickled_grid_search.predict(X_val4)))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score: 0.7224717458735227
Accuracy on Validation Set:  0.8162729658792651

# Printing classification report on the validation set
print("Classification Report on Validation Set:")
print(classification_report(y_val4, Pickled_grid_search.predict(X_val4)))

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.75      0.86      0.80       161
           1       0.88      0.79      0.83       220

    accuracy                           0.82       381
   macro avg       0.81      0.82      0.81       381
weighted avg       0.82      0.82      0.82       381

# Plotting the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    Pickled_grid_search.best_estimator_, X_train4, y_train4, cv=3,
    scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
         label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')
plt.title("Learning Curve for Decision Tree")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

# Assuming you have defined X_test4 before
X_new = X_test4

[learning_curve] Training set sizes: [ 203  661 1118 1576 2034]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  15.4s finished

# Load the pickled Decision Tree model
Pkl_Filename = "Pickle_DecisionTree_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Make predictions using the loaded model
predictions = loaded_model.predict(X_new)

# Print some information for debugging
print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class mapping
if predictions[0] == '0':
    print('The news is fake.')
elif predictions[0] == '1':
    print('The news is true.')
else:
    print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training
class_labels = loaded_model.classes_
print("Class labels:", class_labels)

Predictions: ['1' '1' '1' '0' '0' '0' '1' '1' '1' '0' '1' '0' '0' '1'
'1' '0' '1' '1'
'1' '0' '1' '1' '1' '0' '1' '1' '1' '0' '1' '1' '1' '1' '1' '0' '0'
'0'
'1' '1' '1' '1' '0' '0' '0' '0' '1' '1' '0' '0' '0' '0' '0' '0' '1'
'1'
'1' '0' '1' '1' '1' '0' '1' '0' '1' '0' '0' '1' '1' '0' '1' '0' '0'
'1'
'1' '1' '1' '1' '1' '0' '1' '1' '0' '1' '1' '1' '0' '1' '1' '1' '0'
'0'
'1' '1' '1' '0' '0' '1' '1' '1' '1' '1' '1' '0' '1' '1' '1' '1' '1'
'0'
'0' '0' '0' '1' '0' '0' '0' '0' '0' '1' '1' '1' '0' '0' '0' '0' '1'
'0'
'1' '1' '1' '0' '0' '1' '0' '1' '1' '1' '0' '1' '1' '1' '0' '1' '1'
'1'
'0' '0' '0' '1' '0' '0' '0' '1' '0' '1' '0' '1' '1' '1' '1' '1' '1'
'0'
'1' '1' '0' '1' '1' '0' '0' '1' '0' '0' '1' '0' '1' '1' '0' '0' '0'
'1'
'1' '0' '0' '1' '1' '0' '1' '0' '0' '1' '1' '0' '0' '1' '1' '1' '1'
'1'
'1' '0' '1' '1' '1' '1' '1' '1' '0' '1' '0' '1' '0' '0' '0' '0' '0'
'1'
'0' '0' '1' '0' '0' '0' '0' '0' '1' '0' '0' '0' '1' '0' '0' '1' '0'
'1'
'1' '1' '1' '0' '0' '1' '0' '0' '1' '0' '0' '1' '1' '1' '1' '0' '0'
'1'
'0' '1' '1' '1' '0' '0' '0' '1' '1' '1' '0' '0' '0' '0' '0' '1' '1'
'1'
'1' '1' '0' '0' '1' '0' '0' '1' '1' '1' '0' '1' '0' '0' '0' '0' '1'
'1'
'1' '0' '0' '1' '0' '0' '0' '1' '0' '0' '0' '1' '0' '0' '0' '1' '1'
'1'
'1' '1' '1' '0' '0' '1' '1' '1' '0' '1' '1' '0' '1' '1' '0' '0' '1'
'1'
'1' '0' '1' '1' '1' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '1' '1'
'1'
'1' '0' '0' '1' '0' '1' '0' '0' '1' '0' '0' '0' '1' '0' '0' '0' '1'
'0'
'0' '1' '1' '1' '1' '1' '0' '1' '1' '1' '1' '0' '0' '1' '1' '0' '1'
'0'
'0' '1' '0' '0']
First prediction: 1
The news is true.
Class labels: ['0' '1']
#4.Random Forest Model

# Cell 1: Training and Saving the Random Forest Model
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assume X_train4, y_train4, X_val4, y_val4 are defined
def random_forest_grid_learn(X_train, y_train, X_val, y_val):
    rf = RandomForestClassifier(random_state=0)
    param_grid = {'n_estimators': [50, 100, 200],
                  'max_depth': [None, 10, 20, 30, 40, 50],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4]}
    grid_search = GridSearchCV(rf, param_grid, cv=3, refit=True,
                               n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # Save the trained model
    Pkl_Filename = "Pickle_RandomForest_Model.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(grid_search, file)

# Call the function and save the result
random_forest_grid_learn(X_train4, y_train4, X_val4, y_val4)

Fitting 3 folds for each of 162 candidates, totalling 486 fits

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
[The run was interrupted manually during grid_search.fit(X_train, y_train):
the KeyboardInterrupt surfaced from joblib's parallel retrieval loop inside
sklearn's GridSearchCV, so the 486-fit random forest search did not finish
and no Pickle_RandomForest_Model.pkl was written by this cell.]
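# One way to avoid the timeout: RandomizedSearchCV samples a fixed number of
# candidates from the same grid instead of trying all 162 combinations; a
# sketch (20 candidates x 3 folds = 60 fits instead of 486):
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': [50, 100, 200],
              'max_depth': [None, 10, 20, 30, 40, 50],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}
rf_search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                               param_dist, n_iter=20, cv=3, n_jobs=-1,
                               scoring='accuracy', random_state=0, verbose=1)
# rf_search.fit(X_train4, y_train4)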

# Loading the Random Forest model
Pkl_Filename = "Pickle_RandomForest_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    Pickled_grid_search = pickle.load(file)

# Printing the parameters of the model
print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)
print("Accuracy on Validation Set: ",
      accuracy_score(y_val4, Pickled_grid_search.predict(X_val4)))

# Printing classification report on the validation set
print("Classification Report on Validation Set:")
print(classification_report(y_val4, Pickled_grid_search.predict(X_val4)))

# Plotting the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    Pickled_grid_search.best_estimator_, X_train4, y_train4, cv=3,
    scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
         label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')
plt.title("Learning Curve for Random Forest")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

# Assuming you have defined X_test4 before
X_new = X_test4

# Load the pickled Random Forest model
Pkl_Filename = "Pickle_RandomForest_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Make predictions using the loaded model
predictions = loaded_model.predict(X_new)

# Print some information for debugging
print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class mapping
if predictions[0] == '0':
    print('The news is fake.')
elif predictions[0] == '1':
    print('The news is true.')
else:
    print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training
class_labels = loaded_model.classes_
print("Class labels:", class_labels)

#5.ADABoost Model

# Cell 1: Training and Saving the ADABoost Model
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assume X_train4, y_train4, X_val4, y_val4 are defined
def adaboost_grid_learn(X_train, y_train, X_val, y_val):
    adaboost = AdaBoostClassifier(random_state=0)
    param_grid = {'n_estimators': [50, 100, 200],
                  'learning_rate': [0.01, 0.1, 1.0, 2.0]}
    grid_search = GridSearchCV(adaboost, param_grid, cv=3, refit=True,
                               n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # Save the trained model
    Pkl_Filename = "Pickle_AdaBoost_Model.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(grid_search, file)

# Call the function and save the result
adaboost_grid_learn(X_train4, y_train4, X_val4, y_val4)

# Loading the ADABoost model
Pkl_Filename = "Pickle_AdaBoost_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    Pickled_grid_search = pickle.load(file)

# Printing the parameters of the model
print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)
print("Accuracy on Validation Set: ",
      accuracy_score(y_val4, Pickled_grid_search.predict(X_val4)))

# Printing classification report on the validation set
print("Classification Report on Validation Set:")
print(classification_report(y_val4, Pickled_grid_search.predict(X_val4)))

# Plotting the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    Pickled_grid_search.best_estimator_, X_train4, y_train4, cv=3,
    scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
         label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')
plt.title("Learning Curve for ADABoost")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

# Assuming you have defined X_test4 before
X_new = X_test4

# Load the pickled ADABoost model
Pkl_Filename = "Pickle_AdaBoost_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Make predictions using the loaded model
predictions = loaded_model.predict(X_new)

# Print some information for debugging
print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class mapping
if predictions[0] == '0':
    print('The news is fake.')
elif predictions[0] == '1':
    print('The news is true.')
else:
    print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training
class_labels = loaded_model.classes_
print("Class labels:", class_labels)
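# With several tuned models pickled, they can be combined into a single
# voting ensemble; a minimal sketch (assumes the pickle files written above
# exist, and refits the best estimators on the training split):
from sklearn.ensemble import VotingClassifier

def load_best(path):
    with open(path, 'rb') as f:
        return pickle.load(f).best_estimator_

voting = VotingClassifier(estimators=[
    ('logreg', load_best("Pickle_LogReg_Model.pkl")),
    ('gnb', load_best("Pickle_GaussianNB_Model.pkl")),
    ('dtree', load_best("Pickle_DecisionTree_Model.pkl")),
], voting='hard')
voting.fit(X_train4, y_train4)
print("Voting accuracy on Validation Set:",
      accuracy_score(y_val4, voting.predict(X_val4)))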

#6.Support Vector Machine Model

# Cell 1: Training and Saving the SVM Model
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assume X_train4, y_train4, X_val4, y_val4 are defined
def svm_grid_learn(X_train, y_train, X_val, y_val):
    svm_model = SVC(random_state=0)
    param_grid = {'C': [0.1, 1, 10],
                  'kernel': ['linear', 'rbf']}
    grid_search = GridSearchCV(svm_model, param_grid, cv=3, refit=True,
                               n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # Save the trained model
    Pkl_Filename = "Pickle_SVM_Model.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(grid_search, file)

# Call the function and save the result
svm_grid_learn(X_train4, y_train4, X_val4, y_val4)

Fitting 3 folds for each of 6 candidates, totalling 18 fits

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
[As with the random forest above, the run was interrupted manually during
grid_search.fit(X_train, y_train); the 18-fit SVM search did not finish, so
no Pickle_SVM_Model.pkl was written by this cell.]
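# For linear kernels on high-dimensional tf-idf features, LinearSVC is
# typically much faster than SVC(kernel='linear') and may avoid the
# interrupt above; a sketch:
from sklearn.svm import LinearSVC

linear_svm = LinearSVC(C=1.0, random_state=0, max_iter=5000)
# linear_svm.fit(X_train4, y_train4)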

# Loading the SVM model
Pkl_Filename = "Pickle_SVM_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
    Pickled_grid_search = pickle.load(file)

# Printing the parameters of the model
print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)
print("Accuracy on Validation Set: ",
      accuracy_score(y_val4, Pickled_grid_search.predict(X_val4)))

# Printing classification report on the validation set
print("Classification Report on Validation Set:")
print(classification_report(y_val4, Pickled_grid_search.predict(X_val4)))

# Plotting the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    Pickled_grid_search.best_estimator_, X_train4, y_train4, cv=3,
    scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
         label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')
plt.title("Learning Curve for SVM")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

# Assuming you have defined X_test4 before


X_new = X_test4

# Load the pickled SVM model


Pkl_Filename = "Pickle_SVM_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
loaded_model = pickle.load(file)

# Make predictions using the loaded model


predictions = loaded_model.predict(X_new)

# Print some information for debugging


print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class mapping
if predictions[0] == '0':
print('The news is fake.')
elif predictions[0] == '1':
print('The news is true.')
else:
print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training


class_labels = loaded_model.classes_
print("Class labels:", class_labels)

#7.MLP Model

# Cell 1: Training and Saving the MLP Model


import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assume X_train4, y_train4, X_val4, y_val4 are defined

def mlp_grid_learn(X_train, y_train, X_val, y_val):


mlp_model = MLPClassifier(random_state=0, max_iter=200)
param_grid = {
'hidden_layer_sizes': [(100,), (50, 50), (30, 30, 30)],
'activation': ['logistic', 'tanh', 'relu'],
'alpha': [0.0001, 0.001, 0.01]
}
grid_search = GridSearchCV(mlp_model, param_grid, cv=3,
refit=True, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)
# Save the trained model
Pkl_Filename = "Pickle_MLP_Model.pkl"
with open(Pkl_Filename, 'wb') as file:
pickle.dump(grid_search, file)

# Call the function and save the result


mlp_grid_learn(X_train4, y_train4, X_val4, y_val4)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
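With max_iter=200 the MLP can stop before converging and emit
ConvergenceWarning. As a hedged aside (not part of the grid search above,
and assuming the same X_train4/y_train4/X_val4/y_val4), early stopping holds
out a slice of the training data and halts once its score plateaus:

# Sketch: early stopping for MLPClassifier
from sklearn.neural_network import MLPClassifier

mlp_es = MLPClassifier(random_state=0, max_iter=500,
                       early_stopping=True, validation_fraction=0.1,
                       n_iter_no_change=10)
mlp_es.fit(X_train4, y_train4)
print("Validation accuracy:", mlp_es.score(X_val4, y_val4))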

# Loading the MLP model


Pkl_Filename = "Pickle_MLP_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
Pickled_grid_search = pickle.load(file)

# Printing the parameters of the model


print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)
print("Accuracy on Validation Set: ", accuracy_score(y_val4,
Pickled_grid_search.predict(X_val4)))

# Printing classification report on the validation set


print("Classification Report on Validation Set:")
print(classification_report(y_val4,
Pickled_grid_search.predict(X_val4)))

# Plotting the learning curve


train_sizes, train_scores, test_scores = learning_curve(
Pickled_grid_search.best_estimator_, X_train4, y_train4, cv=3,
scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean -
train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--',
marker='s', label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean -
test_std, alpha=0.15, color='green')
plt.title("Learning Curve for MLP")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

# Assuming you have defined X_test4 before


X_new = X_test4

# Load the pickled MLP model


Pkl_Filename = "Pickle_MLP_Model.pkl"
with open(Pkl_Filename, 'rb') as file:
loaded_model = pickle.load(file)

# Make predictions using the loaded model


predictions = loaded_model.predict(X_new)

# Print some information for debugging


print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class mapping
if predictions[0] == '0':
print('The news is fake.')
elif predictions[0] == '1':
print('The news is true.')
else:
print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training


class_labels = loaded_model.classes_
print("Class labels:", class_labels)

#8.MLP Model With PCA

# Cell 1: Training and Saving the MLP Model with PCA


import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
# Assume X_train4, y_train4, X_val4, y_val4 are defined

def mlp_pca_grid_learn(X_train, y_train, X_val, y_val, n_components=50):
    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)

    # MLP model
    mlp_model = MLPClassifier(random_state=0)
    param_grid = {'hidden_layer_sizes': [(100,), (50, 50), (50, 30, 10)],
                  'alpha': [0.0001, 0.001, 0.01]}
    grid_search = GridSearchCV(mlp_model, param_grid, cv=3,
                               refit=True, n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train_pca, y_train)

    # Save the trained model together with the fitted PCA, so later cells
    # can apply the same reduction to new data
    Pkl_Filename = "Pickle_MLP_Model_PCA.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump((grid_search, pca), file)

# Call the function and save the result


mlp_pca_grid_learn(X_train4, y_train4, X_val4, y_val4)
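A hedged alternative sketch (not the cell above): wrapping PCA and the MLP in
one Pipeline keeps the transform bookkeeping out of the downstream cells, and
lets utilities such as learning_curve or VotingClassifier refit the reduction
together with the classifier on raw features.

# Sketch: PCA + MLP as a single Pipeline
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

pca_mlp_pipe = Pipeline([('pca', PCA(n_components=50)),
                         ('mlp', MLPClassifier(random_state=0))])
pipe_grid = GridSearchCV(pca_mlp_pipe,
                         {'mlp__hidden_layer_sizes': [(100,), (50, 50)],
                          'mlp__alpha': [0.0001, 0.001]},
                         cv=3, n_jobs=-1, scoring='accuracy', verbose=1)
pipe_grid.fit(X_train4, y_train4)  # PCA is re-fit inside each CV split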

# Loading the MLP model with PCA


Pkl_Filename = "Pickle_MLP_Model_PCA.pkl"
with open(Pkl_Filename, 'rb') as file:
Pickled_grid_search, pca = pickle.load(file)

# Printing the parameters of the model


print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)

# Transform validation data using PCA


X_val_pca = pca.transform(X_val4)

# Printing accuracy on the validation set


print("Accuracy on Validation Set: ", accuracy_score(y_val4,
Pickled_grid_search.predict(X_val_pca)))

# Printing classification report on the validation set


print("Classification Report on Validation Set:")
print(classification_report(y_val4,
Pickled_grid_search.predict(X_val_pca)))
# Plotting the learning curve
# Note: the best estimator was fit on PCA-reduced features, so the training
# data must go through the same fitted PCA before scoring.
X_train_pca = pca.transform(X_train4)
train_sizes, train_scores, test_scores = learning_curve(
    Pickled_grid_search.best_estimator_, X_train_pca, y_train4, cv=3,
    scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean -
train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--',
marker='s', label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean -
test_std, alpha=0.15, color='green')
plt.title("Learning Curve for MLP with PCA")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

# Assuming you have defined X_test4 before


X_test_pca = pca.transform(X_test4)

# Make predictions using the loaded model on test data


predictions = Pickled_grid_search.predict(X_test_pca)

# Print some information for debugging


print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class mapping
if predictions[0] == '0':
print('The news is fake.')
elif predictions[0] == '1':
print('The news is true.')
else:
print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training


class_labels = Pickled_grid_search.classes_
print("Class labels:", class_labels)

#9.MLP with t-SNE

# Cell 1: Training and Saving the MLP Model with t-SNE


import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assume X_train4, y_train4, X_val4, y_val4 are defined

def mlp_tsne_grid_learn(X_train, y_train, X_val, y_val,
                        n_components=2, perplexity=30, learning_rate=200):
    # Apply t-SNE for dimensionality reduction.
    # Note: scikit-learn's TSNE has no transform() for unseen data, so the
    # training and validation sets (assumed dense here) are embedded together
    # with fit_transform and then split back apart.
    tsne = TSNE(n_components=n_components, perplexity=perplexity,
                learning_rate=learning_rate, random_state=0)
    X_all_tsne = tsne.fit_transform(np.vstack([X_train, X_val]))
    X_train_tsne = X_all_tsne[:len(X_train)]
    X_val_tsne = X_all_tsne[len(X_train):]

    # MLP model
    mlp_model = MLPClassifier(random_state=0)
    param_grid = {'hidden_layer_sizes': [(100,), (50, 50), (50, 30, 10)],
                  'alpha': [0.0001, 0.001, 0.01]}
    grid_search = GridSearchCV(mlp_model, param_grid, cv=3,
                               refit=True, n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train_tsne, y_train)

    # Save the trained model together with the train/validation embeddings
    # (the fitted TSNE object itself cannot embed new rows)
    Pkl_Filename = "Pickle_MLP_Model_TSNE.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump((grid_search, X_train_tsne, X_val_tsne), file)

# Call the function and save the result


mlp_tsne_grid_learn(X_train4, y_train4, X_val4, y_val4)

# Loading the MLP model with t-SNE

Pkl_Filename = "Pickle_MLP_Model_TSNE.pkl"
with open(Pkl_Filename, 'rb') as file:
    Pickled_grid_search, X_train_tsne, X_val_tsne = pickle.load(file)

# Printing the parameters of the model

print("Best Parameters:", Pickled_grid_search.best_params_)
print("Best Score:", Pickled_grid_search.best_score_)

# Printing accuracy on the validation set: the pickled validation embedding
# is reused here, since a fitted TSNE cannot transform unseen data

print("Accuracy on Validation Set: ", accuracy_score(y_val4,
      Pickled_grid_search.predict(X_val_tsne)))

# Printing classification report on the validation set

print("Classification Report on Validation Set:")
print(classification_report(y_val4,
      Pickled_grid_search.predict(X_val_tsne)))

# Plotting the learning curve
# Note: the best estimator was fit on the 2-D t-SNE embedding, so the stored
# training embedding is used here rather than the raw features.

train_sizes, train_scores, test_scores = learning_curve(
    Pickled_grid_search.best_estimator_, X_train_tsne, y_train4, cv=3,
    scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean -
train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--',
marker='s', label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean -
test_std, alpha=0.15, color='green')
plt.title("Learning Curve for MLP with t-SNE")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()
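Because a fitted TSNE offers no transform() for new rows, scoring the test set
needs a workaround. A hedged sketch (an approximation, not part of the original
flow): fit a k-nearest-neighbours regressor from the raw training features to
the learned 2-D embedding, then use it to project unseen data into
(approximately) the same space. The tsne_mapper name below is introduced here
for illustration.

# Sketch: approximate out-of-sample t-SNE projection via k-NN regression
from sklearn.neighbors import KNeighborsRegressor

tsne_mapper = KNeighborsRegressor(n_neighbors=5)
tsne_mapper.fit(X_train4, X_train_tsne)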

# Assuming you have defined X_test4 before. A fitted TSNE cannot embed unseen
# rows directly, so X_test_tsne comes from the approximate k-NN mapper
# sketched above.

X_test_tsne = tsne_mapper.predict(X_test4)

# Make predictions using the loaded model on test data


predictions = Pickled_grid_search.predict(X_test_tsne)

# Print some information for debugging


print("Predictions:", predictions)
print("First prediction:", predictions[0])

# Check the truth value of the first element based on the model's class mapping
if predictions[0] == '0':
print('The news is fake.')
elif predictions[0] == '1':
print('The news is true.')
else:
print('Unexpected prediction value.')

# Alternatively, you can check the class labels used in training


class_labels = Pickled_grid_search.classes_
print("Class labels:", class_labels)
#Ensemble Method for the Above Classifiers

import pickle
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Assuming you have defined X_train4, y_train4, X_val4, y_val4, X_test4

# Load the trained models using pickle

# Load the Gaussian Naive Bayes model


with open("Pickle_GaussianNB_Model.pkl", 'rb') as file:
gnb_model = pickle.load(file)

# Load the Logistic Regression model


with open("Pickle_LogReg_Model.pkl", 'rb') as file:
logreg_model = pickle.load(file)

# Load the Decision Tree model


with open("Pickle_DecisionTree_Model.pkl", 'rb') as file:
dt_model = pickle.load(file)

# Load the Random Forest model


with open("Pickle_RandomForest_Model.pkl", 'rb') as file:
rf_model = pickle.load(file)

# Load the ADABoost model


with open("Pickle_AdaBoost_Model.pkl", 'rb') as file:
adaboost_model = pickle.load(file)

# Load the Support Vector Machine model


with open("Pickle_SVM_Model.pkl", 'rb') as file:
svm_model = pickle.load(file)

# Load the MLP model


with open("Pickle_MLP_Model.pkl", 'rb') as file:
mlp_model = pickle.load(file)

# Load the MLP with PCA model (pickled as a (grid_search, pca) tuple)

with open("Pickle_MLP_Model_PCA.pkl", 'rb') as file:
    mlp_pca_model, pca = pickle.load(file)

# Load the MLP with t-SNE model (pickled together with its embeddings;
# only the grid search object is needed here)

with open("Pickle_MLP_Model_TSNE.pkl", 'rb') as file:
    mlp_tsne_model, *_ = pickle.load(file)

# Create a list of models.
# Caveat: VotingClassifier refits a clone of each estimator on the raw
# X_train4, so the PCA and t-SNE reductions are NOT applied inside the
# ensemble; wrapping those models in Pipelines would preserve them.

models = [('GaussianNB', gnb_model),
          ('LogisticRegression', logreg_model),
          ('DecisionTree', dt_model),
          ('RandomForest', rf_model),
          ('ADABoost', adaboost_model),
          ('SVM', svm_model),
          ('MLP', mlp_model),
          ('MLP_PCA', mlp_pca_model),
          ('MLP_TSNE', mlp_tsne_model)]

# Create a Voting Classifier


voting_classifier = VotingClassifier(estimators=models, voting='hard')

# Fit the ensemble model on the training data


voting_classifier.fit(X_train4, y_train4)

# Evaluate the ensemble model on the validation set


y_pred_ensemble = voting_classifier.predict(X_val4)

# Print classification report and accuracy


print("Ensemble Model - Classification Report:")
print(classification_report(y_val4, y_pred_ensemble))
print("Ensemble Model - Accuracy: ", accuracy_score(y_val4,
y_pred_ensemble))
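Hard voting tallies each model's predicted label. A hedged sketch of the
soft-voting variant (an aside, not the pipeline above): it averages
predict_proba outputs, so it only admits estimators that expose probabilities
(SVC, for instance, would need probability=True), and here it is restricted to
a probability-capable subset of the models loaded earlier.

# Sketch: soft voting over probability-capable models
soft_models = [('GaussianNB', gnb_model),
               ('LogisticRegression', logreg_model),
               ('RandomForest', rf_model),
               ('MLP', mlp_model)]
soft_voting = VotingClassifier(estimators=soft_models, voting='soft')
soft_voting.fit(X_train4, y_train4)
print("Soft-voting accuracy:", accuracy_score(y_val4, soft_voting.predict(X_val4)))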

# Plotting the learning curve for the ensemble model


train_sizes, train_scores, test_scores = learning_curve(
voting_classifier, X_train4, y_train4, cv=3, scoring='accuracy',
n_jobs=-1, verbose=1, shuffle=True
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean -
train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--',
marker='s', label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean -
test_std, alpha=0.15, color='green')
plt.title("Learning Curve for Ensemble Model")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc='best')
plt.show()

# Save the ensemble model


with open("Pickle_Ensemble_Model.pkl", 'wb') as file:
pickle.dump(voting_classifier, file)
# Assuming you have defined X_test4 before
X_new = X_test4

# Load the pickled ensemble model


with open("Pickle_Ensemble_Model.pkl", 'rb') as file:
loaded_ensemble_model = pickle.load(file)

# Make predictions using the ensemble model


predictions_ensemble = loaded_ensemble_model.predict(X_new)

# Print some information for debugging


print("Predictions for the ensemble model:", predictions_ensemble)
print("First prediction for the ensemble model:",
predictions_ensemble[0])

# Check the truth value of the first element based on the ensemble model's class mapping
if predictions_ensemble[0] == '0':
print('The news is fake according to the ensemble model.')
elif predictions_ensemble[0] == '1':
print('The news is true according to the ensemble model.')
else:
print('Unexpected prediction value for the ensemble model.')
