0% found this document useful (0 votes)

15 views2 pages

Fake News Detection

FND

Uploaded by

jaayseme

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

15 views2 pages

Fake News Detection

FND

Uploaded by

jaayseme

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 2

In [1]: import pandas as pd

# Load the datasets

true_news = pd.read_csv('True.csv')
fake_news = pd.read_csv('Fake.csv')
#print (true_news.info())
#print (fake_news.info())

# Add a column to differentiate between fake and real news

true_news['label'] = 1 # 1 for real
fake_news['label'] = 0 # 0 for fake
print (true_news.info())
#print (fake_news.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 21417 non-null object
1 text 21417 non-null object
2 subject 21417 non-null object
3 date 21417 non-null object
4 label 21417 non-null int64
dtypes: int64(1), object(4)
memory usage: 836.7+ KB
None

In [2]: # Combine the datasets

data = pd.concat([true_news, fake_news])
print (data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 44898 non-null object
1 text 44898 non-null object
2 subject 44898 non-null object
3 date 44898 non-null object
4 label 44898 non-null int64
dtypes: int64(1), object(4)
memory usage: 2.1+ MB
None
title \
0 As U.S. budget fight looms, Republicans flip t...
1 U.S. military to accept transgender recruits o...
2 Senior U.S. Republican senator: 'Let Mr. Muell...
3 FBI Russia probe helped by Australian diplomat...
4 Trump wants Postal Service to charge 'much mor...

text subject \
0 WASHINGTON (Reuters) - The head of a conservat... politicsNews
1 WASHINGTON (Reuters) - Transgender people will... politicsNews
2 WASHINGTON (Reuters) - The special counsel inv... politicsNews
3 WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews
4 SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews

date label
0 December 31, 2017 1
1 December 29, 2017 1
2 December 31, 2017 1
3 December 30, 2017 1
4 December 29, 2017 1

In [3]: #Data Preprocessing

"""
Convert text to lowercase.
Remove punctuation, numbers, and stopwords.
Apply tokenization and stemming/lemmatization.
"""
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
# Convert to lowercase
text = text.lower()
# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))
# Tokenize the text
words = word_tokenize(text)
# Remove stopwords and apply stemming
words = [stemmer.stem(word) for word in words if word not in stop_words]
return ' '.join(words)
#The proprocessing is applied to the text column and the output is returned and added as a new column which is cleaned_text.
data['cleaned_text'] = data['text'].apply(preprocess)
print (data.head())

title \
0 As U.S. budget fight looms, Republicans flip t...
1 U.S. military to accept transgender recruits o...
2 Senior U.S. Republican senator: 'Let Mr. Muell...
3 FBI Russia probe helped by Australian diplomat...
4 Trump wants Postal Service to charge 'much mor...

date label \
0 December 31, 2017 1
1 December 29, 2017 1
2 December 31, 2017 1
3 December 30, 2017 1
4 December 29, 2017 1

cleaned_text
0 washington reuter head conserv republican fact...
1 washington reuter transgend peopl allow first ...
2 washington reuter special counsel investig lin...
3 washington reuter trump campaign advis georg p...
4 seattlewashington reuter presid donald trump c...

In [9]: #Feature Extraction

"""Bag of Words (BoW): Convert text into numerical vectors.
TF-IDF: Alternatively, you can use TF-IDF for a better representation of the text"""

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_text'])

# The labels
y = data['label']
print(X)
print (y)

(0, 4979) 0.02927448869206572

(0, 3743) 0.03180899244323748
(0, 102) 0.04958282046249698
(0, 1657) 0.05655575530504557
(0, 1738) 0.028044900055669533
(0, 752) 0.0379670205483232
(0, 3801) 0.051477707026325946
(0, 3549) 0.05094916005811628
(0, 4484) 0.037153135804899214
(0, 2222) 0.04809636412006563
(0, 599) 0.06437000260114123
(0, 1544) 0.03540936006229215
(0, 3943) 0.043467381494883016
(0, 3800) 0.04869167619779146
(0, 1996) 0.03333317064256021
(0, 1835) 0.07421442369046749
(0, 3929) 0.03805312553131785
(0, 4893) 0.021910056107812624
(0, 2417) 0.040827597090537335
(0, 4008) 0.03904774333334839
(0, 4298) 0.03944884961747107
(0, 4869) 0.039592671880409006
(0, 1843) 0.025271906782534402
(0, 1387) 0.028484754697966942
(0, 2598) 0.046385916706794345
: :
(44897, 4973) 0.012235299964176561
(44897, 3299) 0.011587446793135905
(44897, 3863) 0.03734240903154764
(44897, 0) 0.022246228042812344
(44897, 1968) 0.01788274256320097
(44897, 3017) 0.015514876847674662
(44897, 3884) 0.008284239042034843
(44897, 2197) 0.013991213735492643
(44897, 4125) 0.02449035039437561
(44897, 1977) 0.02717251437647199
(44897, 3917) 0.045594119722024776
(44897, 3546) 0.017032958724693053
(44897, 296) 0.011789978521654797
(44897, 2872) 0.09849675672796286
(44897, 3453) 0.04138412063689714
(44897, 762) 0.015253092787549404
(44897, 1522) 0.029147244242323678
(44897, 1639) 0.014448301551072098
(44897, 3389) 0.01772707489674373
(44897, 2659) 0.025107158300442352
(44897, 4599) 0.016193630289571506
(44897, 3776) 0.021673051807779578
(44897, 754) 0.013182701009688056
(44897, 4730) 0.1281617686183972
(44897, 4846) 0.01459308602078768
0 1
1 1
2 1
3 1
4 1
..
23476 0
23477 0
23478 0
23479 0
23480 0
Name: label, Length: 44898, dtype: int64

In [21]: """
Model Selection
Common algorithms for text classification:
Logistic Regression
Naive Bayes
Support Vector Machines (SVM)
Random Forest"""
#Train/Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print (X_train)
print (X_test)
print (y_train)
print (y_test)

(0, 4097) 0.6214863713867282

(0, 2167) 0.07975118299473853
(0, 4357) 0.06696758648999868
(0, 42) 0.07865252837342171
(0, 2332) 0.07098796933193335
(0, 1733) 0.15381827993404826
(0, 2045) 0.08035280572579839
(0, 4978) 0.06767848478375141
(0, 1534) 0.07378297514129759
(0, 1693) 0.06174416375535703
(0, 1226) 0.062308438131677295
(0, 28) 0.08592258686629516
(0, 2843) 0.056516788263405116
(0, 4368) 0.06340260977728616
(0, 2949) 0.12389169130416453
(0, 3729) 0.06255887587869023
(0, 1143) 0.06200400834288353
(0, 2954) 0.08254080921472215
(0, 932) 0.08405861873865984
(0, 4000) 0.059616910222330566
(0, 44) 0.07486269831818537
(0, 301) 0.05338226573022805
(0, 1246) 0.049875907271615734
(0, 3349) 0.06316835671081979
(0, 4205) 0.08212409093797246
: :
(35917, 1844) 0.10683822299478586
(35917, 985) 0.018862939074196537
(35917, 2761) 0.019112015440846546
(35917, 3144) 0.014330749215001683
(35917, 4253) 0.027479539032735662
(35917, 4697) 0.053488928545480686
(35917, 3760) 0.04612755360585819
(35917, 4973) 0.030911688264838737
(35917, 1678) 0.023789955569305063
(35917, 3863) 0.04717158183375135
(35917, 3972) 0.02085878286889635
(35917, 3884) 0.08371836098127787
(35917, 3917) 0.014398821643967536
(35917, 295) 0.024787463709587655
(35917, 3389) 0.0223931499293025
(35917, 2252) 0.027310109449703985
(35917, 554) 0.0292014727337325
(35917, 4867) 0.022318883549805803
(35917, 3776) 0.02737777672763701
(35917, 316) 0.02548385113721787
(35917, 754) 0.016652617642933156
(35917, 1202) 0.028097311227429065
(35917, 2998) 0.016619736215065274
(35917, 2934) 0.02012789820838177
(35917, 3778) 0.04110480929329898
(0, 2814) 0.10304299548516296
(0, 607) 0.10623124719522835
(0, 2301) 0.09285850620010636
(0, 3460) 0.0854198270354133
(0, 1183) 0.10509672709228313
(0, 729) 0.09858264944361735
(0, 4069) 0.1602822179823832
(0, 3078) 0.09019434699675923
(0, 2413) 0.07221090671608253
(0, 3587) 0.07694706540738962
(0, 3588) 0.06473973301522373
(0, 4382) 0.07412381323808884
(0, 4331) 0.09574507007368047
(0, 2595) 0.08335858734351426
(0, 2556) 0.08335858734351426
(0, 361) 0.05592137737006954
(0, 4423) 0.10296286717811716
(0, 4075) 0.07581650285890687
(0, 1764) 0.08277937204350767
(0, 1989) 0.061884817554261895
(0, 3064) 0.05687265260880162
(0, 2009) 0.4018084442187585
(0, 542) 0.08319429103099739
(0, 3488) 0.10737097965503539
(0, 1154) 0.08507488562730098
: :
(8979, 404) 0.11568432719236223
(8979, 4855) 0.14915479599321876
(8979, 2886) 0.09269400535457108
(8979, 3472) 0.10866568311464483
(8979, 4516) 0.17476108656976955
(8979, 1431) 0.13447721337130317
(8979, 992) 0.0862438492270973
(8979, 1079) 0.12939306237147583
(8979, 2309) 0.09836301427308335
(8979, 1862) 0.07541934767314483
(8979, 2357) 0.14320656467000112
(8979, 3040) 0.07263135827686319
(8979, 3743) 0.10820434730157925
(8979, 1141) 0.06705802252020926
(8979, 2744) 0.06437268769556824
(8979, 4951) 0.05129391811743965
(8979, 3972) 0.08049572732817088
(8979, 3884) 0.08076897871111123
(8979, 3917) 0.05556621535324998
(8979, 3546) 0.08303325589963
(8979, 296) 0.05747447166771348
(8979, 3453) 0.050435428373976805
(8979, 3389) 0.08641697370635497
(8979, 4195) 0.09709986971990049
(8979, 3778) 0.052875592686725925
14918 0
12384 1
3002 0
3323 0
5622 0
..
11284 1
23315 0
16741 0
860 1
15795 1
Name: label, Length: 35918, dtype: int64
799 0
6500 0
3590 0
1377 1
11059 0
..
20702 0
4068 1
1081 0
14658 1
15236 1
Name: label, Length: 8980, dtype: int64

In [33]: #Train the Model

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

Out[33]: ▾ MultinomialNB i ?

MultinomialNB()

In [35]: """Model Evaluation

Accuracy: Check how many predictions the model got right.
Confusion Matrix: Evaluate precision, recall, and F1-score."""
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9276169265033407
[[4344 306]
[ 344 3986]]
precision recall f1-score support

0 0.93 0.93 0.93 4650

1 0.93 0.92 0.92 4330

accuracy 0.93 8980

macro avg 0.93 0.93 0.93 8980
weighted avg 0.93 0.93 0.93 8980

In [37]: #Deployment

#Saving the model

import joblib

joblib.dump(model, 'fake_news_detector.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

Out[37]: ['tfidf_vectorizer.pkl']

In [41]: #Loading and Predicting:

model = joblib.load('fake_news_detector.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

def predict_news(text):
cleaned_text = preprocess(text)
vectorized_text = vectorizer.transform([cleaned_text])
return model.predict(vectorized_text)

print(predict_news("Breaking News! Donald Trump is Dead"))

[0]

In [ ]:

5.1 Python Workbook
0% (1)
5.1 Python Workbook
177 pages
Python For Data Science Cheat Sheet 2.0
100% (1)
Python For Data Science Cheat Sheet 2.0
11 pages
Retail Analysis With Walmart Data
No ratings yet
Retail Analysis With Walmart Data
10 pages
Multicollinearity
100% (1)
Multicollinearity
25 pages
DS II Mid Term 2017 Solution
No ratings yet
DS II Mid Term 2017 Solution
20 pages
(Lecture Notes in Computer Science 6871 Lecture Notes in Artificial Intelligence) Tatsuya Yokota, Yukihiko Yamashita (Auth.), Petra Perner (Eds.) - Machine Learning and Data Mining in Pattern Recognit
No ratings yet
(Lecture Notes in Computer Science 6871 Lecture Notes in Artificial Intelligence) Tatsuya Yokota, Yukihiko Yamashita (Auth.), Petra Perner (Eds.) - Machine Learning and Data Mining in Pattern Recognit
624 pages
Pandas Course Slides
No ratings yet
Pandas Course Slides
90 pages
mlr3 Tutorial
100% (2)
mlr3 Tutorial
271 pages
Session 17
No ratings yet
Session 17
61 pages
Bakerydata Solution
No ratings yet
Bakerydata Solution
7 pages
Untitled Document
No ratings yet
Untitled Document
66 pages
Analisis Faktor-Faktor Produksi Yang Mempengaruhi Produksi Cabai Merah Keriting (Capsicum Annum L) Di Kecamatan Sumowono Kabupaten Semarang
No ratings yet
Analisis Faktor-Faktor Produksi Yang Mempengaruhi Produksi Cabai Merah Keriting (Capsicum Annum L) Di Kecamatan Sumowono Kabupaten Semarang
20 pages
Practise Problem3
0% (1)
Practise Problem3
6 pages
List Comprehensions: Hugo Bowne-Anderson
No ratings yet
List Comprehensions: Hugo Bowne-Anderson
30 pages
Computational Methods For Mixed Models
No ratings yet
Computational Methods For Mixed Models
21 pages
Unit 01 - Describing Data and Its Distributions - 1 Per Page
No ratings yet
Unit 01 - Describing Data and Its Distributions - 1 Per Page
79 pages
Approved Dessertation
No ratings yet
Approved Dessertation
85 pages
Latent Dirichlet Allocation
No ratings yet
Latent Dirichlet Allocation
44 pages
Assumption of Regresion
No ratings yet
Assumption of Regresion
18 pages
Lab 02
No ratings yet
Lab 02
47 pages
ECQ Manual PDF
No ratings yet
ECQ Manual PDF
29 pages
Chapter1 PDF
No ratings yet
Chapter1 PDF
25 pages
Lampiran Data Hasil Penelitian: Case Summaries
No ratings yet
Lampiran Data Hasil Penelitian: Case Summaries
5 pages
Chapter 1
No ratings yet
Chapter 1
28 pages
(Written Examination Scheme) : (MCQ S)
100% (1)
(Written Examination Scheme) : (MCQ S)
4 pages
4-14 - 21 - Fathia Azzahra Madjid - Latbab3
No ratings yet
4-14 - 21 - Fathia Azzahra Madjid - Latbab3
6 pages
2latihan 2 Pertemuan 3 Statistika
No ratings yet
2latihan 2 Pertemuan 3 Statistika
6 pages
Activity 3 General
No ratings yet
Activity 3 General
21 pages
CHAPTER 4 and 5 New Hate Speech
No ratings yet
CHAPTER 4 and 5 New Hate Speech
21 pages
Introductiontocourse: 1 The Python Programming Language: Functions
No ratings yet
Introductiontocourse: 1 The Python Programming Language: Functions
11 pages
Ch2 Slides
No ratings yet
Ch2 Slides
29 pages
Dia 4
No ratings yet
Dia 4
24 pages
Chapter 4 pt.4
No ratings yet
Chapter 4 pt.4
19 pages
Assignment 4
No ratings yet
Assignment 4
3 pages
Lecture 1
No ratings yet
Lecture 1
14 pages
Python FP (Slides) (2018 Loidll)
No ratings yet
Python FP (Slides) (2018 Loidll)
7 pages
04-Introduction To Python
No ratings yet
04-Introduction To Python
78 pages
04 Introduction To Python-1
No ratings yet
04 Introduction To Python-1
29 pages
Bayesian Structural Time Series
No ratings yet
Bayesian Structural Time Series
2 pages
Traversing Dataframe Elements Using: Iterrows, Iteritems and Itertuples
No ratings yet
Traversing Dataframe Elements Using: Iterrows, Iteritems and Itertuples
8 pages
Introduction To Basic Python
No ratings yet
Introduction To Basic Python
31 pages
Rcihards 1993 Spurius Correlation
No ratings yet
Rcihards 1993 Spurius Correlation
14 pages
01 Python I 08-02-24
No ratings yet
01 Python I 08-02-24
228 pages
Types in Python
No ratings yet
Types in Python
14 pages
Fake News Classification - Ipynb - Colaboratory
No ratings yet
Fake News Classification - Ipynb - Colaboratory
6 pages
4.10. Text Data Pre-Processing - Use Case - Ipynb - Colaboratory
No ratings yet
4.10. Text Data Pre-Processing - Use Case - Ipynb - Colaboratory
2 pages
Unit 4
No ratings yet
Unit 4
15 pages
Chapter 9: Serial Correlation
No ratings yet
Chapter 9: Serial Correlation
7 pages
Python Basics 1612354525
No ratings yet
Python Basics 1612354525
129 pages
Python Data Science Toolbox
No ratings yet
Python Data Science Toolbox
14 pages
Text-Summarizer-Using-Nlp-Advanced-Copy1 Updated
No ratings yet
Text-Summarizer-Using-Nlp-Advanced-Copy1 Updated
31 pages
Text Summarizer
No ratings yet
Text Summarizer
30 pages
06 - Latin Square Design (LSD)
No ratings yet
06 - Latin Square Design (LSD)
21 pages
Assignment No 4 - Spring 2024 MBA Weekend 23052024 092133am
No ratings yet
Assignment No 4 - Spring 2024 MBA Weekend 23052024 092133am
2 pages
AI - Phase 4
No ratings yet
AI - Phase 4
11 pages
Wa0007.3307303433096114618
No ratings yet
Wa0007.3307303433096114618
20 pages
Getting Started With Python Cheat Sheet
No ratings yet
Getting Started With Python Cheat Sheet
1 page
Python143 Week4
No ratings yet
Python143 Week4
19 pages
Data Pre-Processing (Pandas)
No ratings yet
Data Pre-Processing (Pandas)
19 pages
Python Cheat Sheet
No ratings yet
Python Cheat Sheet
11 pages
Prediction of Diabetes Using Machine Learning Techniques
No ratings yet
Prediction of Diabetes Using Machine Learning Techniques
10 pages
Chapter 26 Text Mining - Introduction To Data Science
No ratings yet
Chapter 26 Text Mining - Introduction To Data Science
20 pages
Map Reduce Filter Lambda Generator
No ratings yet
Map Reduce Filter Lambda Generator
27 pages
Python For Data Science Cheat Sheet 2.0
No ratings yet
Python For Data Science Cheat Sheet 2.0
11 pages
Certified Python Programmer
No ratings yet
Certified Python Programmer
6 pages
Tukey Test For Additivity
No ratings yet
Tukey Test For Additivity
3 pages
Python BasicsGUIA PYTHON-01
No ratings yet
Python BasicsGUIA PYTHON-01
1 page
Python Cheat Sheet For Beginners
No ratings yet
Python Cheat Sheet For Beginners
1 page
Python
No ratings yet
Python
18 pages
Functional Programming
No ratings yet
Functional Programming
14 pages
Python Advance Cheatsheet
No ratings yet
Python Advance Cheatsheet
11 pages
DSBDAL - Assignment No 4
No ratings yet
DSBDAL - Assignment No 4
15 pages
Python Cheat Sheet For Excel Users
No ratings yet
Python Cheat Sheet For Excel Users
5 pages
Artificial Neural Network Proposal
No ratings yet
Artificial Neural Network Proposal
5 pages
01 Introduction To Python
No ratings yet
01 Introduction To Python
36 pages
DWV Assignment
No ratings yet
DWV Assignment
13 pages
01 Introduction To Python
No ratings yet
01 Introduction To Python
36 pages
Python Cheet Sheet
No ratings yet
Python Cheet Sheet
2 pages
Built An NLP Model To Detect Fake News Accurately 1746681940
No ratings yet
Built An NLP Model To Detect Fake News Accurately 1746681940
96 pages
EDA - Python Basics
No ratings yet
EDA - Python Basics
10 pages
Introduction+to+Python+Programming+ (Programming-Based) New+size
No ratings yet
Introduction+to+Python+Programming+ (Programming-Based) New+size
4 pages
Project Report
No ratings yet
Project Report
12 pages
ML 2
No ratings yet
ML 2
25 pages
Methodology
No ratings yet
Methodology
9 pages
Python Variables Collections
No ratings yet
Python Variables Collections
19 pages
Dimensionality Reduction, PCA, and Kernel Methods
No ratings yet
Dimensionality Reduction, PCA, and Kernel Methods
3 pages
Python Cheat Sheet
No ratings yet
Python Cheat Sheet
11 pages
All The Python Language Features You Need - DeriveIt
No ratings yet
All The Python Language Features You Need - DeriveIt
4 pages
10 Python Built-In Functions That Will Simplify Your Code
No ratings yet
10 Python Built-In Functions That Will Simplify Your Code
8 pages

Fake News Detection

Uploaded by

Fake News Detection

Uploaded by

In [1]: import pandas as pd

# Load the datasets

# Add a column to differentiate between fake and real news

In [2]: # Combine the datasets

In [3]: #Data Preprocessing

In [9]: #Feature Extraction

from sklearn.feature_extraction.text import TfidfVectorizer

(0, 4979) 0.02927448869206572

(0, 4097) 0.6214863713867282

In [33]: #Train the Model

In [35]: """Model Evaluation

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

0 0.93 0.93 0.93 4650

accuracy 0.93 8980

#Saving the model

In [41]: #Loading and Predicting:

print(predict_news("Breaking News! Donald Trump is Dead"))

You might also like