Aped For Fake News

The document loads necessary libraries and modules for text preprocessing, modeling, and evaluation. It then loads true and fake news datasets, cleans the text data by removing stopwords and lemmatizing words, and builds various classification models including Logistic Regression, Naive Bayes, SVM, Random Forest, and Decision Tree. It evaluates the models on test data and reports the classification report, accuracy, F1 score, and confusion matrix.


import numpy as np

import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
true = pd.read_csv("./True.csv")
true.head()
fake = pd.read_csv("./Fake.csv")
fake.head()
true['category'] = 1  # label true news as 1
fake['category'] = 0  # label fake news as 0

df = pd.concat([true,fake])
df.head()
df.shape
df.describe(include="object")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'category' is a column in the 'df' DataFrame
print(df['category'].value_counts())

# Convert the Series to a DataFrame
data_subset = df['category'].value_counts().to_frame()

# Reset the index to convert the 'category' values to a column
data_subset.reset_index(inplace=True)

# Rename the columns to have appropriate names
data_subset.columns = ['category', 'count']

# Set the seaborn style
sns.set_style("darkgrid")

# Create the count plot
plt.figure(figsize=(10, 10))
sns.barplot(x='category', y='count', data=data_subset)

# Show the plot
plt.show()

# Distribution of news subjects, split by category
sns.countplot(x='subject', hue='category', data=df)
plt.xticks(rotation=90)
plt.show()

# Combine title, body text, and subject into a single text column and keep only it and the label
df["text"] = df["title"] + df["text"] + df['subject']
df = df[["text", "category"]]
!python -m spacy download en_core_web_sm

import spacy
nlp = spacy.load('en_core_web_sm')
# Union of spaCy stopwords, NLTK stopwords, and punctuation characters
list1 = nlp.Defaults.stop_words
list2 = stopwords.words('english')
punctuation = list(string.punctuation)
Stopwords = set(list1) | set(list2) | set(punctuation)
len(Stopwords)
# create a lemmatizer instance
lemma = WordNetLemmatizer()

# text cleaning function
def clean_text(text):
    """
    Takes a text string and cleans it: lower-casing, expanding common
    contractions, removing special characters, dropping stopwords,
    and lemmatizing the remaining words.
    """
    cleaned = ""

    # lower casing
    text = text.lower()

    # expanding common contractions
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)

    # removing special characters and collapsing whitespace
    text = re.sub(r"[-()\"#!@$%^&*{}?.,:]", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)

    # dropping stopwords and lemmatizing the rest
    for word in text.split():
        if word not in Stopwords:
            cleaned += lemma.lemmatize(word) + " "

    return cleaned
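
# A quick sanity check of clean_text on a made-up headline (the sentence below
# is hypothetical, not taken from the dataset): contractions are expanded,
# punctuation is stripped, stopwords are dropped, and the remaining words are
# lemmatized, so the output should look roughly like "reporting official lying vote".
sample = "She's reporting that the officials were lying about the votes."
print(clean_text(sample))
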
import nltk

# Download the WordNet resource
nltk.download('wordnet')

# Assuming the NLTK data path issue has been resolved and the necessary libraries are imported
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Redefine the cleaning function using only NLTK stopwords and word_tokenize
def clean_text(text):
    Stopwords = set(stopwords.words('english'))
    lemma = WordNetLemmatizer()
    cleaned = ""
    for word in nltk.word_tokenize(text):
        if word not in Stopwords:
            cleaned += lemma.lemmatize(word) + " "
    return cleaned

# Clean the "text" column in the DataFrame
df["text"] = df["text"].apply(clean_text)

import nltk
nltk.download('omw-1.4')

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (if you haven't done it already)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the DataFrame
# (Assuming your DataFrame is named 'df' and contains a column named 'text')
# df = pd.read_csv('your_data.csv')

# Function to clean and tokenize the text (same as above)
def clean_text(text):
    Stopwords = set(stopwords.words('english'))
    lemma = WordNetLemmatizer()
    cleaned = ""
    for word in nltk.word_tokenize(text):
        if word not in Stopwords:
            cleaned += lemma.lemmatize(word) + " "
    return cleaned

# Check data types
print(df.dtypes)

# Check for null or NaN values
print(df.isnull().sum())

# Clean the "text" column in the DataFrame
df["text"] = df["text"].astype(str).apply(clean_text)

# Drop any rows with null or NaN values in the "text" column
df = df.dropna(subset=["text"])

# Verify tokenization
print(df["text"].head())

# Continue with word cloud generation, classifier training, and evaluation

# Word cloud of the most frequent words in fake news (category 0)
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords).generate(" ".join(df[df.category == 0].text))
plt.axis("off")
plt.imshow(wc, interpolation='bilinear')
plt.show()

# Word cloud of the most frequent words in true news (category 1)
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords).generate(" ".join(df[df.category == 1].text))
plt.axis("off")
plt.imshow(wc, interpolation='bilinear')
plt.show()

# Word cloud over the whole corpus
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500,
               stopwords=Stopwords, background_color='white').generate(" ".join(df.text))
plt.axis("off")
plt.imshow(wc, interpolation='bilinear')
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Split the dataset into train and test sets
X = df["text"]       # feature
y = df["category"]   # target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


# The pipeline vectorises the text with TF-IDF, then fits the model (Logistic Regression)
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression())])
clf_text.fit(X_train, y_train)

# make predictions with the model
predictions = clf_text.predict(X_test)

print(metrics.classification_report(y_test, predictions))

# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))

# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
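
# The fitted pipeline can also score raw, unseen text directly, since the
# TfidfVectorizer step handles vectorisation. The headline below is a
# hypothetical example, not taken from the dataset; apply the same cleaning
# used on the training data before predicting (1 = true, 0 = fake).
sample_articles = ["Government announces new infrastructure spending plan for next year"]
sample_articles = [clean_text(a) for a in sample_articles]
print(clf_text.predict(sample_articles))
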
# The pipeline vectorises the text with TF-IDF, then fits the model (Multinomial Naive Bayes)
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB(alpha=0.5))])
clf_text.fit(X_train, y_train)

# make predictions with the model
predictions = clf_text.predict(X_test)

print(metrics.classification_report(y_test, predictions))

# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))

# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# The pipeline vectorises the text with TF-IDF, then fits the model (Linear SVM)
clf_text = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC())])
clf_text.fit(X_train, y_train)

# make predictions with the model
predictions = clf_text.predict(X_test)

print(metrics.classification_report(y_test, predictions))

# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))

# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# The pipeline vectorises the text with TF-IDF, then fits the model (Random Forest)
clf_rf = Pipeline([("tfidf", TfidfVectorizer()), ("clf", RandomForestClassifier(random_state=0))])
clf_rf.fit(X_train, y_train)

# make predictions with the model
predictions = clf_rf.predict(X_test)

print(metrics.classification_report(y_test, predictions))

# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))

# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# The pipeline vectorises the text with TF-IDF, then fits the model (Decision Tree)
clf_dt = Pipeline([("tfidf", TfidfVectorizer()), ("clf", DecisionTreeClassifier(random_state=2))])
clf_dt.fit(X_train, y_train)

# make predictions with the model
predictions = clf_dt.predict(X_test)

print(metrics.classification_report(y_test, predictions))

# overall accuracy and F1 score
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions))

# confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
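
# Optional wrap-up: a minimal sketch that refits each of the classifiers above
# in the same TF-IDF pipeline and collects accuracy and F1 in one table for a
# side-by-side comparison.
models = {
    "Logistic Regression": LogisticRegression(),
    "Multinomial NB": MultinomialNB(alpha=0.5),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(random_state=0),
    "Decision Tree": DecisionTreeClassifier(random_state=2),
}
results = []
for name, model in models.items():
    pipe = Pipeline([("tfidf", TfidfVectorizer()), ("clf", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    results.append({"model": name,
                    "accuracy": metrics.accuracy_score(y_test, preds),
                    "f1": metrics.f1_score(y_test, preds)})
print(pd.DataFrame(results).sort_values("f1", ascending=False))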
