0% found this document useful (0 votes)
15 views2 pages

Fake News Detection

FND

Uploaded by

jaayseme
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
15 views2 pages

Fake News Detection

FND

Uploaded by

jaayseme
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 2

In [1]: import pandas as pd

# Load the datasets


true_news = pd.read_csv('True.csv')
fake_news = pd.read_csv('Fake.csv')
#print (true_news.info())
#print (fake_news.info())

# Add a column to differentiate between fake and real news


true_news['label'] = 1 # 1 for real
fake_news['label'] = 0 # 0 for fake
print (true_news.info())
#print (fake_news.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 21417 non-null object
1 text 21417 non-null object
2 subject 21417 non-null object
3 date 21417 non-null object
4 label 21417 non-null int64
dtypes: int64(1), object(4)
memory usage: 836.7+ KB
None

In [2]: # Combine the datasets


data = pd.concat([true_news, fake_news])
print (data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 44898 non-null object
1 text 44898 non-null object
2 subject 44898 non-null object
3 date 44898 non-null object
4 label 44898 non-null int64
dtypes: int64(1), object(4)
memory usage: 2.1+ MB
None
title \
0 As U.S. budget fight looms, Republicans flip t...
1 U.S. military to accept transgender recruits o...
2 Senior U.S. Republican senator: 'Let Mr. Muell...
3 FBI Russia probe helped by Australian diplomat...
4 Trump wants Postal Service to charge 'much mor...

text subject \
0 WASHINGTON (Reuters) - The head of a conservat... politicsNews
1 WASHINGTON (Reuters) - Transgender people will... politicsNews
2 WASHINGTON (Reuters) - The special counsel inv... politicsNews
3 WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews
4 SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews

date label
0 December 31, 2017 1
1 December 29, 2017 1
2 December 31, 2017 1
3 December 30, 2017 1
4 December 29, 2017 1

In [3]: #Data Preprocessing


"""
Convert text to lowercase.
Remove punctuation, numbers, and stopwords.
Apply tokenization and stemming/lemmatization.
"""
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
# Convert to lowercase
text = text.lower()
# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))
# Tokenize the text
words = word_tokenize(text)
# Remove stopwords and apply stemming
words = [stemmer.stem(word) for word in words if word not in stop_words]
return ' '.join(words)
#The proprocessing is applied to the text column and the output is returned and added as a new column which is cleaned_text.
data['cleaned_text'] = data['text'].apply(preprocess)
print (data.head())

title \
0 As U.S. budget fight looms, Republicans flip t...
1 U.S. military to accept transgender recruits o...
2 Senior U.S. Republican senator: 'Let Mr. Muell...
3 FBI Russia probe helped by Australian diplomat...
4 Trump wants Postal Service to charge 'much mor...

text subject \
0 WASHINGTON (Reuters) - The head of a conservat... politicsNews
1 WASHINGTON (Reuters) - Transgender people will... politicsNews
2 WASHINGTON (Reuters) - The special counsel inv... politicsNews
3 WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews
4 SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews

date label \
0 December 31, 2017 1
1 December 29, 2017 1
2 December 31, 2017 1
3 December 30, 2017 1
4 December 29, 2017 1

cleaned_text
0 washington reuter head conserv republican fact...
1 washington reuter transgend peopl allow first ...
2 washington reuter special counsel investig lin...
3 washington reuter trump campaign advis georg p...
4 seattlewashington reuter presid donald trump c...

In [9]: #Feature Extraction


"""Bag of Words (BoW): Convert text into numerical vectors.
TF-IDF: Alternatively, you can use TF-IDF for a better representation of the text"""

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_text'])

# The labels
y = data['label']
print(X)
print (y)

(0, 4979) 0.02927448869206572


(0, 3743) 0.03180899244323748
(0, 102) 0.04958282046249698
(0, 1657) 0.05655575530504557
(0, 1738) 0.028044900055669533
(0, 752) 0.0379670205483232
(0, 3801) 0.051477707026325946
(0, 3549) 0.05094916005811628
(0, 4484) 0.037153135804899214
(0, 2222) 0.04809636412006563
(0, 599) 0.06437000260114123
(0, 1544) 0.03540936006229215
(0, 3943) 0.043467381494883016
(0, 3800) 0.04869167619779146
(0, 1996) 0.03333317064256021
(0, 1835) 0.07421442369046749
(0, 3929) 0.03805312553131785
(0, 4893) 0.021910056107812624
(0, 2417) 0.040827597090537335
(0, 4008) 0.03904774333334839
(0, 4298) 0.03944884961747107
(0, 4869) 0.039592671880409006
(0, 1843) 0.025271906782534402
(0, 1387) 0.028484754697966942
(0, 2598) 0.046385916706794345
: :
(44897, 4973) 0.012235299964176561
(44897, 3299) 0.011587446793135905
(44897, 3863) 0.03734240903154764
(44897, 0) 0.022246228042812344
(44897, 1968) 0.01788274256320097
(44897, 3017) 0.015514876847674662
(44897, 3884) 0.008284239042034843
(44897, 2197) 0.013991213735492643
(44897, 4125) 0.02449035039437561
(44897, 1977) 0.02717251437647199
(44897, 3917) 0.045594119722024776
(44897, 3546) 0.017032958724693053
(44897, 296) 0.011789978521654797
(44897, 2872) 0.09849675672796286
(44897, 3453) 0.04138412063689714
(44897, 762) 0.015253092787549404
(44897, 1522) 0.029147244242323678
(44897, 1639) 0.014448301551072098
(44897, 3389) 0.01772707489674373
(44897, 2659) 0.025107158300442352
(44897, 4599) 0.016193630289571506
(44897, 3776) 0.021673051807779578
(44897, 754) 0.013182701009688056
(44897, 4730) 0.1281617686183972
(44897, 4846) 0.01459308602078768
0 1
1 1
2 1
3 1
4 1
..
23476 0
23477 0
23478 0
23479 0
23480 0
Name: label, Length: 44898, dtype: int64

In [21]: """
Model Selection
Common algorithms for text classification:
Logistic Regression
Naive Bayes
Support Vector Machines (SVM)
Random Forest"""
#Train/Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print (X_train)
print (X_test)
print (y_train)
print (y_test)

(0, 4097) 0.6214863713867282


(0, 2167) 0.07975118299473853
(0, 4357) 0.06696758648999868
(0, 42) 0.07865252837342171
(0, 2332) 0.07098796933193335
(0, 1733) 0.15381827993404826
(0, 2045) 0.08035280572579839
(0, 4978) 0.06767848478375141
(0, 1534) 0.07378297514129759
(0, 1693) 0.06174416375535703
(0, 1226) 0.062308438131677295
(0, 28) 0.08592258686629516
(0, 2843) 0.056516788263405116
(0, 4368) 0.06340260977728616
(0, 2949) 0.12389169130416453
(0, 3729) 0.06255887587869023
(0, 1143) 0.06200400834288353
(0, 2954) 0.08254080921472215
(0, 932) 0.08405861873865984
(0, 4000) 0.059616910222330566
(0, 44) 0.07486269831818537
(0, 301) 0.05338226573022805
(0, 1246) 0.049875907271615734
(0, 3349) 0.06316835671081979
(0, 4205) 0.08212409093797246
: :
(35917, 1844) 0.10683822299478586
(35917, 985) 0.018862939074196537
(35917, 2761) 0.019112015440846546
(35917, 3144) 0.014330749215001683
(35917, 4253) 0.027479539032735662
(35917, 4697) 0.053488928545480686
(35917, 3760) 0.04612755360585819
(35917, 4973) 0.030911688264838737
(35917, 1678) 0.023789955569305063
(35917, 3863) 0.04717158183375135
(35917, 3972) 0.02085878286889635
(35917, 3884) 0.08371836098127787
(35917, 3917) 0.014398821643967536
(35917, 295) 0.024787463709587655
(35917, 3389) 0.0223931499293025
(35917, 2252) 0.027310109449703985
(35917, 554) 0.0292014727337325
(35917, 4867) 0.022318883549805803
(35917, 3776) 0.02737777672763701
(35917, 316) 0.02548385113721787
(35917, 754) 0.016652617642933156
(35917, 1202) 0.028097311227429065
(35917, 2998) 0.016619736215065274
(35917, 2934) 0.02012789820838177
(35917, 3778) 0.04110480929329898
(0, 2814) 0.10304299548516296
(0, 607) 0.10623124719522835
(0, 2301) 0.09285850620010636
(0, 3460) 0.0854198270354133
(0, 1183) 0.10509672709228313
(0, 729) 0.09858264944361735
(0, 4069) 0.1602822179823832
(0, 3078) 0.09019434699675923
(0, 2413) 0.07221090671608253
(0, 3587) 0.07694706540738962
(0, 3588) 0.06473973301522373
(0, 4382) 0.07412381323808884
(0, 4331) 0.09574507007368047
(0, 2595) 0.08335858734351426
(0, 2556) 0.08335858734351426
(0, 361) 0.05592137737006954
(0, 4423) 0.10296286717811716
(0, 4075) 0.07581650285890687
(0, 1764) 0.08277937204350767
(0, 1989) 0.061884817554261895
(0, 3064) 0.05687265260880162
(0, 2009) 0.4018084442187585
(0, 542) 0.08319429103099739
(0, 3488) 0.10737097965503539
(0, 1154) 0.08507488562730098
: :
(8979, 404) 0.11568432719236223
(8979, 4855) 0.14915479599321876
(8979, 2886) 0.09269400535457108
(8979, 3472) 0.10866568311464483
(8979, 4516) 0.17476108656976955
(8979, 1431) 0.13447721337130317
(8979, 992) 0.0862438492270973
(8979, 1079) 0.12939306237147583
(8979, 2309) 0.09836301427308335
(8979, 1862) 0.07541934767314483
(8979, 2357) 0.14320656467000112
(8979, 3040) 0.07263135827686319
(8979, 3743) 0.10820434730157925
(8979, 1141) 0.06705802252020926
(8979, 2744) 0.06437268769556824
(8979, 4951) 0.05129391811743965
(8979, 3972) 0.08049572732817088
(8979, 3884) 0.08076897871111123
(8979, 3917) 0.05556621535324998
(8979, 3546) 0.08303325589963
(8979, 296) 0.05747447166771348
(8979, 3453) 0.050435428373976805
(8979, 3389) 0.08641697370635497
(8979, 4195) 0.09709986971990049
(8979, 3778) 0.052875592686725925
14918 0
12384 1
3002 0
3323 0
5622 0
..
11284 1
23315 0
16741 0
860 1
15795 1
Name: label, Length: 35918, dtype: int64
799 0
6500 0
3590 0
1377 1
11059 0
..
20702 0
4068 1
1081 0
14658 1
15236 1
Name: label, Length: 8980, dtype: int64

In [33]: #Train the Model


from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

Out[33]: ▾ MultinomialNB i ?

MultinomialNB()

In [35]: """Model Evaluation


Accuracy: Check how many predictions the model got right.
Confusion Matrix: Evaluate precision, recall, and F1-score."""
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')


print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9276169265033407
[[4344 306]
[ 344 3986]]
precision recall f1-score support

0 0.93 0.93 0.93 4650


1 0.93 0.92 0.92 4330

accuracy 0.93 8980


macro avg 0.93 0.93 0.93 8980
weighted avg 0.93 0.93 0.93 8980

In [37]: #Deployment

#Saving the model


import joblib

joblib.dump(model, 'fake_news_detector.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

Out[37]: ['tfidf_vectorizer.pkl']

In [41]: #Loading and Predicting:


model = joblib.load('fake_news_detector.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

def predict_news(text):
cleaned_text = preprocess(text)
vectorized_text = vectorizer.transform([cleaned_text])
return model.predict(vectorized_text)

print(predict_news("Breaking News! Donald Trump is Dead"))


[0]

In [ ]:

You might also like