import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
true = pd.read_csv("./True.csv")
true.head()
fake = pd.read_csv("./Fake.csv")
fake.head()
true['category'] = 1
fake['category'] = 0
df = pd.concat([true,fake])
df.head()
df.shape
df.describe(include="object")
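# Optional step (a small sketch, not in the original flow): pd.concat keeps the
# original row indices, so true and fake rows share index values; shuffling and
# resetting the index gives one clean, order-free frame before further processing.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()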
# Assuming 'category' is a column in the 'df' DataFrame
print(df['category'].value_counts())
# Convert the Series to a DataFrame
data_subset = df['category'].value_counts().to_frame()
# Reset the index to convert the 'category' values to a column
data_subset.reset_index(inplace=True)
# Rename the columns to have appropriate names
data_subset.columns = ['category', 'count']
# Set the seaborn style
sns.set_style("darkgrid")
# Create the count plot
plt.figure(figsize=(10, 10))
sns.barplot(x='category', y='count', data=data_subset)
# Show the plot
plt.show()
sns.countplot(x='subject', hue='category', data=df)
plt.xticks(rotation=90)
plt.show()
df["text"] =df["title"]+df["text"]+df['subject']
df=df[["text","category"]]
!python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load('en_core_web_sm')
list1 = nlp.Defaults.stop_words
list2 = stopwords.words('english')
punctuation = list(string.punctuation)
Stopwords = set((set(list1)|set(list2)|set(punctuation)))
len(Stopwords)
#creating instance
lemma=WordNetLemmatizer()
#text cleaning function
def clean_text(text):
    """
    Takes a text string and cleans it by lower-casing, expanding contractions,
    removing special characters, dropping stopwords and lemmatizing the
    remaining tokens.
    """
    cleaned = ""
    # lower casing
    text = text.lower()
    # expanding common contractions
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    # removing special characters and extra whitespace
    text = re.sub(r"[-()\"#!@$%^&*{}?.,:]", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    # dropping stopwords and lemmatizing the remaining tokens
    for word in text.split():
        if word not in Stopwords:
            cleaned += lemma.lemmatize(word) + " "
    return cleaned
# Download the NLTK resources used by the cleaner (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Define the function for text cleaning
def clean_text(text):
    Stopwords = set(stopwords.words('english'))
    lemma = WordNetLemmatizer()
    cleaned = ""
    for word in nltk.word_tokenize(text):
        if word not in Stopwords:
            cleaned += lemma.lemmatize(word) + " "
    return cleaned
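# Quick sanity check of the cleaner on a made-up sentence (illustrative only, not
# taken from the dataset). Note that, unlike the earlier version, this cleaner does
# not lower-case its input, so capitalised stopwords would survive untouched.
print(clean_text("reporters were covering several stories about the elections"))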
# Clean the "text" column in the DataFrame
df["text"] = df["text"].apply(clean_text)
nltk.download('omw-1.4')
# Check Data Types
print(df.dtypes)
# Check for Null or NaN Values
print(df.isnull().sum())
# Clean the "text" column in the DataFrame
df["text"] = df["text"].astype(str).apply(clean_text)
# Drop any rows with null or NaN values in the "text" column
df = df.dropna(subset=["text"])
# Verify Tokenization
print(df["text"].head())
# Continue with word cloud generation, classifier training, and evaluation
# Word cloud for fake news articles
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords).generate(" ".join(df[df.category == 0].text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# Word cloud for real news articles
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords).generate(" ".join(df[df.category == 1].text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# Word cloud for the full corpus
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500, stopwords=Stopwords,
               background_color='white').generate(" ".join(df.text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
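# Alongside the word clouds, an exact frequency count (a rough sketch using
# collections.Counter) shows the most common tokens in each class.
from collections import Counter

fake_counts = Counter(" ".join(df[df.category == 0].text).split())
true_counts = Counter(" ".join(df[df.category == 1].text).split())
print("Most common tokens in fake news:", fake_counts.most_common(15))
print("Most common tokens in real news:", true_counts.most_common(15))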
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Split the dataset into train and test sets
X = df["text"]      # feature
y = df["category"]  # target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
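# train_test_split shuffles by default but does not stratify; a quick check (sketch)
# that both splits keep roughly the same fake/real ratio. Passing stratify=y to
# train_test_split would enforce matching proportions exactly.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))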
# Pipeline: vectorize the text with TF-IDF, then fit a logistic regression classifier
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression())])
clf_text.fit(X_train, y_train)
# making predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
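# The raw confusion matrix is easier to read as a heatmap (a small sketch reusing
# seaborn from above); rows are the true labels, columns the predicted labels.
cm = metrics.confusion_matrix(y_test, predictions)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["fake", "true"], yticklabels=["fake", "true"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()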
# Pipeline: vectorize the text with TF-IDF, then fit a multinomial Naive Bayes classifier
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB(alpha=0.5))])
clf_text.fit(X_train, y_train)
# making predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: vectorize the text with TF-IDF, then fit a linear SVM classifier
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC())])
clf_text.fit(X_train, y_train)
# making predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: vectorize the text with TF-IDF, then fit a random forest classifier
clf_rf = Pipeline([("tfidf", TfidfVectorizer()), ("clf", RandomForestClassifier(random_state=0))])
clf_rf.fit(X_train, y_train)
# making predictions with the model
predictions = clf_rf.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: vectorize the text with TF-IDF, then fit a decision tree classifier
clf_dt = Pipeline([("tfidf", TfidfVectorizer()), ("clf", DecisionTreeClassifier(random_state=2))])
clf_dt.fit(X_train, y_train)
# making predictions with the model
predictions = clf_dt.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
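# A compact way to compare the five pipelines side by side (a sketch, not in the
# original notebook): refit each one and collect accuracy and F1 in a single table.
models = {
    "LogisticRegression": LogisticRegression(),
    "MultinomialNB": MultinomialNB(alpha=0.5),
    "LinearSVC": LinearSVC(),
    "RandomForest": RandomForestClassifier(random_state=0),
    "DecisionTree": DecisionTreeClassifier(random_state=2),
}
results = []
for name, model in models.items():
    pipe = Pipeline([("tfidf", TfidfVectorizer()), ("clf", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    results.append({"model": name,
                    "accuracy": metrics.accuracy_score(y_test, preds),
                    "f1": metrics.f1_score(y_test, preds)})
print(pd.DataFrame(results).sort_values("accuracy", ascending=False))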