import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
true = pd.read_csv("./True.csv")
true.head()
fake = pd.read_csv("./Fake.csv")
fake.head()
true['category'] = 1
fake['category'] = 0
df = pd.concat([true,fake])
df.head()
df.shape
df.describe(include="object")
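# Optional step (a small sketch, not in the original flow): pd.concat keeps the
# original row indices, so true and fake rows share index values; shuffling and
# resetting the index gives one clean, order-free frame before further processing.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()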
# Assuming 'category' is a column in the 'df' DataFrame
print(df['category'].value_counts())
# Convert the Series to a DataFrame
data_subset = df['category'].value_counts().to_frame()
# Reset the index to convert the 'category' values to a column
data_subset.reset_index(inplace=True)
# Rename the columns to have appropriate names
data_subset.columns = ['category', 'count']
# Set the seaborn style
sns.set_style("darkgrid")
# Create the count plot
plt.figure(figsize=(10, 10))
sns.barplot(x='category', y='count', data=data_subset)
# Show the plot
plt.show()
sns.countplot(x='subject', hue='category', data=df)
plt.xticks(rotation=90)
plt.show()
df["text"] =df["title"]+df["text"]+df['subject']
df=df[["text","category"]]
!python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load('en_core_web_sm')
list1 = nlp.Defaults.stop_words
list2 = stopwords.words('english')
punctuation = list(string.punctuation)
Stopwords = set((set(list1)|set(list2)|set(punctuation)))
len(Stopwords)
#creating instance
lemma=WordNetLemmatizer()
#text cleaning function
def clean_text(text):
    """
    Takes a text string and cleans it by lower-casing, expanding contractions,
    removing special characters, dropping stopwords and lemmatizing the
    remaining tokens.
    """
    cleaned = ""
    # lower casing
    text = text.lower()
    # expanding common contractions
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    # removing special characters and extra whitespace
    text = re.sub(r"[-()\"#!@$%^&*{}?.,:]", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    # dropping stopwords and lemmatizing the remaining tokens
    for word in text.split():
        if word not in Stopwords:
            cleaned += lemma.lemmatize(word) + " "
    return cleaned
# Download the NLTK resources used by the cleaner (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Define the function for text cleaning
def clean_text(text):
    Stopwords = set(stopwords.words('english'))
    lemma = WordNetLemmatizer()
    cleaned = ""
    for word in nltk.word_tokenize(text):
        if word not in Stopwords:
            cleaned += lemma.lemmatize(word) + " "
    return cleaned
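# Quick sanity check of the cleaner on a made-up sentence (illustrative only, not
# taken from the dataset). Note that, unlike the earlier version, this cleaner does
# not lower-case its input, so capitalised stopwords would survive untouched.
print(clean_text("reporters were covering several stories about the elections"))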
# Clean the "text" column in the DataFrame
df["text"] = df["text"].apply(clean_text)
nltk.download('omw-1.4')
# Check Data Types
print(df.dtypes)
# Check for Null or NaN Values
print(df.isnull().sum())
# Clean the "text" column in the DataFrame
df["text"] = df["text"].astype(str).apply(clean_text)
# Drop any rows with null or NaN values in the "text" column
df = df.dropna(subset=["text"])
# Verify Tokenization
print(df["text"].head())
# Continue with word cloud generation, classifier training, and evaluation
# Word cloud for fake news articles
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords).generate(" ".join(df[df.category == 0].text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# Word cloud for real news articles
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords).generate(" ".join(df[df.category == 1].text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# Word cloud for the full corpus
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500, stopwords=Stopwords,
               background_color='white').generate(" ".join(df.text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
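# Alongside the word clouds, an exact frequency count (a rough sketch using
# collections.Counter) shows the most common tokens in each class.
from collections import Counter

fake_counts = Counter(" ".join(df[df.category == 0].text).split())
true_counts = Counter(" ".join(df[df.category == 1].text).split())
print("Most common tokens in fake news:", fake_counts.most_common(15))
print("Most common tokens in real news:", true_counts.most_common(15))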
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Split the dataset into train and test sets
X = df["text"]      # feature
y = df["category"]  # target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
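# train_test_split shuffles by default but does not stratify; a quick check (sketch)
# that both splits keep roughly the same fake/real ratio. Passing stratify=y to
# train_test_split would enforce matching proportions exactly.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))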
# Pipeline: vectorize the text with TF-IDF, then fit a logistic regression classifier
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression())])
clf_text.fit(X_train, y_train)
# making predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
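# The raw confusion matrix is easier to read as a heatmap (a small sketch reusing
# seaborn from above); rows are the true labels, columns the predicted labels.
cm = metrics.confusion_matrix(y_test, predictions)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["fake", "true"], yticklabels=["fake", "true"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()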
# Pipeline: vectorize the text with TF-IDF, then fit a multinomial Naive Bayes classifier
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB(alpha=0.5))])
clf_text.fit(X_train, y_train)
# making predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: vectorize the text with TF-IDF, then fit a linear SVM classifier
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC())])
clf_text.fit(X_train, y_train)
# making predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: vectorize the text with TF-IDF, then fit a random forest classifier
clf_rf = Pipeline([("tfidf", TfidfVectorizer()), ("clf", RandomForestClassifier(random_state=0))])
clf_rf.fit(X_train, y_train)
# making predictions with the model
predictions = clf_rf.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: vectorize the text with TF-IDF, then fit a decision tree classifier
clf_dt = Pipeline([("tfidf", TfidfVectorizer()), ("clf", DecisionTreeClassifier(random_state=2))])
clf_dt.fit(X_train, y_train)
# making predictions with the model
predictions = clf_dt.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
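# A compact way to compare the five pipelines side by side (a sketch, not in the
# original notebook): refit each one and collect accuracy and F1 in a single table.
models = {
    "LogisticRegression": LogisticRegression(),
    "MultinomialNB": MultinomialNB(alpha=0.5),
    "LinearSVC": LinearSVC(),
    "RandomForest": RandomForestClassifier(random_state=0),
    "DecisionTree": DecisionTreeClassifier(random_state=2),
}
results = []
for name, model in models.items():
    pipe = Pipeline([("tfidf", TfidfVectorizer()), ("clf", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    results.append({"model": name,
                    "accuracy": metrics.accuracy_score(y_test, preds),
                    "f1": metrics.f1_score(y_test, preds)})
print(pd.DataFrame(results).sort_values("accuracy", ascending=False))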