Fake News Detection
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
true = pd.read_csv("./True.csv")
true.head()
fake = pd.read_csv("./Fake.csv")
fake.head()
# Label the datasets: 1 = true news, 0 = fake news
true['category'] = 1
fake['category'] = 0
df = pd.concat([true, fake]).reset_index(drop=True)
df.head()
df.shape
df.describe(include="object")
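# A quick optional check (a small sketch, not part of the steps below):
# confirm how many true vs. fake articles the combined frame holds.
print(df['category'].value_counts())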
sns.countplot(x='subject', hue='category', data=df)
plt.xticks(rotation=90)
plt.show()
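# Optional: the same subject/category breakdown as a table, if numbers are
# easier to compare than the bar plot above.
print(pd.crosstab(df['subject'], df['category']))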
df["text"] =df["title"]+df["text"]+df['subject']
df=df[["text","category"]]
!python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load('en_core_web_sm')
list1 = nlp.Defaults.stop_words
list2 = stopwords.words('english')
punctuation = list(string.punctuation)
Stopwords = set(list1) | set(list2) | set(punctuation)
len(Stopwords)
# Create a WordNet lemmatizer instance
lemma=WordNetLemmatizer()
# Download the NLTK data needed above (stopwords for the stop-word list,
# wordnet and omw-1.4 for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
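# A minimal sketch of how the Stopwords set and the lemma instance built above
# could be applied to clean each article. clean_text is a hypothetical helper,
# and word_tokenize (imported earlier) additionally needs nltk.download('punkt').
def clean_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemma.lemmatize(tok) for tok in tokens if tok not in Stopwords]
    return " ".join(tokens)

# Example usage (slow on the full dataset):
# df["text"] = df["text"].apply(clean_text)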
# Drop any rows with null or NaN values in the "text" column
df = df.dropna(subset=["text"])
# Inspect the combined text column
print(df["text"].head())
# Word cloud for fake news (category 0)
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords).generate(" ".join(df[df.category == 0].text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# Word cloud for true news (category 1)
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords).generate(" ".join(df[df.category == 1].text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# Word cloud for the full corpus
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500, stopwords=Stopwords,
               background_color='white').generate(" ".join(df.text))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
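# The word clouds are qualitative; a Counter gives the exact most-frequent
# tokens per class (str.split is only a rough tokenizer in this sketch).
from collections import Counter
fake_counts = Counter(w for w in " ".join(df[df.category == 0].text).lower().split()
                      if w not in Stopwords)
print(fake_counts.most_common(20))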
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Split the dataset into train and test sets
X = df["text"]      # feature
y = df["category"]  # target
# (an 80/20 split is assumed here; the split call was missing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Pipeline: TF-IDF vectorization followed by logistic regression
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression())])
clf_text.fit(X_train, y_train)
# Make predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# Overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# Confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
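# Optional: the raw confusion-matrix array above can also be rendered as a
# heatmap, reusing the seaborn/matplotlib imports from earlier. A sketch:
cm = metrics.confusion_matrix(y_test, predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['fake', 'true'], yticklabels=['fake', 'true'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()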
# Pipeline: TF-IDF vectorization followed by multinomial naive Bayes
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB(alpha=0.5))])
clf_text.fit(X_train, y_train)
# Make predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# Overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# Confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: TF-IDF vectorization followed by a linear SVM
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC())])
clf_text.fit(X_train, y_train)
# Make predictions with the model
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# Overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# Confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: TF-IDF vectorization followed by a random forest
clf_rf = Pipeline([("tfidf", TfidfVectorizer()), ("clf", RandomForestClassifier(random_state=0))])
clf_rf.fit(X_train, y_train)
# Make predictions with this model
predictions = clf_rf.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# Overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# Confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Pipeline: TF-IDF vectorization followed by a decision tree
clf_dt = Pipeline([("tfidf", TfidfVectorizer()), ("clf", DecisionTreeClassifier(random_state=2))])
clf_dt.fit(X_train, y_train)
# Make predictions with this model
predictions = clf_dt.predict(X_test)
print(metrics.classification_report(y_test, predictions))
# Overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))
# Confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
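# The five pipelines above differ only in their final estimator, so a loop can
# fit each one and collect accuracy and F1 in a single table. A sketch reusing
# the split and imports from above:
models = {
    "LogisticRegression": LogisticRegression(),
    "MultinomialNB": MultinomialNB(alpha=0.5),
    "LinearSVC": LinearSVC(),
    "RandomForest": RandomForestClassifier(random_state=0),
    "DecisionTree": DecisionTreeClassifier(random_state=2),
}
results = []
for name, model in models.items():
    pipe = Pipeline([("tfidf", TfidfVectorizer()), ("clf", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    results.append({"model": name,
                    "accuracy": metrics.accuracy_score(y_test, preds),
                    "f1": metrics.f1_score(y_test, preds)})
print(pd.DataFrame(results))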