ML Report Fake News Detection
ML Report Fake News Detection
Project
Submitted By:
Name: Archita
Shankar
Submitted To:
2
Index
Sr. No. Content used Page No.
1. Introduction 3
2 Libraries used 4
3
1. Introduction
https://docs.google.com/spreadsheets/d/e/2PACX-1vRP31dOhVmg97Yf0mIy5lD1Z7k4vr57xDS0NQWELeSnm0PdHi5fE5304uDBkRgPb5NbzcIYuLJpxx1i/pub?gid=562236535&single=true&output=csv
1.3 Description -
A Fake News Detection ML project uses natural language processing
(NLP) and machine learning algorithms to identify misleading or false
information in text. It involves data collection, preprocessing, feature
extraction, and classification models like logistic regression or
transformers. The system improves online information credibility by
analyzing patterns and flagging fake content.
1. Detect Fake News Patterns: Analyze textual data to identify linguistic and
stylistic patterns common in fake news articles.
This project uses Python libraries like pandas, nltk and scikit-learn for
data preprocessing, NLP tasks, and model building. It highlights a practical application
of data science techniques to address the critical issue of fake news.
4
2. Libraries Used
3. Algorithm Used
1. Data Preprocessing
o The dataset comprises labeled news articles with text and label columns.
o Preprocessing includes:
Removing punctuation, special characters, and stopwords.
Tokenizing and stemming/lemmatizing text.
Converting text into numerical features using techniques like TF-IDF
vectorization.
3. Model Evaluation
o Models are evaluated using metrics like accuracy, precision, recall, and F1-
score.
o Cross-validation helps estimate how well the model generalizes to unseen data.
4. Making Predictions
o Each classifier predicts whether a news article is real or fake based on
learned patterns.
o Comparative analysis identifies the best-performing model for deployment.
Model Training:
Each classification algorithm, such as Logistic Regression, Random Forest,
AdaBoost, Decision Tree, Multinomial Naive Bayes, and SVM, is trained on the
preprocessed data. The training involves fitting the model to the input features
(numerical representation of text) and corresponding labels (real or fake).
6
Making Predictions:
After training, the models predict the class of new or test data by analyzing the
patterns learned during training. These predictions represent whether a given news
article is classified as real or fake.
Model Evaluation:
The predictions are compared against the actual labels in the test data to compute
evaluation metrics like accuracy, precision, recall, and F1-score. This helps
determine how well each model performs on unseen data.
7
8
4. Code file 1
# Load the train/test CSV files and build one free-text field per article
# ("total") that the later feature-extraction step consumes.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Only the first 1800 training rows and 500 test rows are read (nrows).
train = pd.read_csv(r'D:/Desktop/ML project final/train (3).csv', nrows=1800)
test = pd.read_csv(r'D:/Desktop/ML project final/test (1).csv', nrows=500)

# Bare expressions: presumably notebook cells used to display the frames;
# they have no effect when run as a plain script.
train
test

# A missing title/author/text would make the concatenation below NaN,
# so empty cells are replaced with a blank first.
test = test.fillna(' ')
train = train.fillna(' ')

# Join title, author and body with a space between every part. Without the
# separator after the author, the author's last token would be fused with the
# first word of the article text and produce a spurious vocabulary entry.
test['total'] = test['title'] + ' ' + test['author'] + ' ' + test['text']
train['total'] = train['title'] + ' ' + train['author'] + ' ' + train['text']

train.head()
test.head()
18.
# Clean every training article and turn the cleaned corpus into a TF-IDF
# matrix (`tfidf`) with the matching label vector (`targets`).
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests
regex = re.compile('[^a-zA-Z]')  # anything that is not an ASCII letter

corpus = []
# Iterate over the column values directly instead of range(len(...)) plus a
# label lookup, so the loop does not depend on the frame having a 0..n-1 index.
for text in train['total']:
    # Non-letters become spaces, then lower-case and split into words.
    words = regex.sub(' ', text).lower().split()
    # Stop words are dropped before stemming; the remaining words are stemmed.
    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

# Unigram and bigram counts, re-weighted by TF-IDF without IDF smoothing.
# NOTE(review): the vocabulary and IDF weights are fitted on every row of
# `train`, before the train/test split later in this script, so the held-out
# rows influence the features. Consider fitting on the training split only.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(corpus)
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(counts)

targets = train['label'].values
9
41.
# Hold out part of the TF-IDF matrix for evaluation. No test_size is given, so
# scikit-learn's default split proportion applies; the seed is fixed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)

# Multinomial NB
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()
NB.fit(X_train, y_train)
print(f'Accuracy of NB classifier on training set: {NB.score(X_train, y_train):.2f}')
print(f'Accuracy of NB classifier on test set: {NB.score(X_test, y_test):.2f}')
53.
# Random Forest: fit with library defaults and report accuracy on both splits.
from sklearn.ensemble import RandomForestClassifier

RnFr = RandomForestClassifier()
RnFr.fit(X_train, y_train)
print(f'Accuracy of RandomForest classifier on training set: {RnFr.score(X_train, y_train):.2f}')
print(f'Accuracy of RandomForest classifier on test set: {RnFr.score(X_test, y_test):.2f}')
62.
# SVM: linear-kernel support vector classifier.
from sklearn.svm import SVC

# NOTE(review): per the scikit-learn docs, `gamma` only applies to the
# 'rbf', 'poly' and 'sigmoid' kernels, so it should have no effect here.
svclassifier = SVC(C=1, kernel='linear', gamma='auto', probability=True)
svclassifier.fit(X_train, y_train)
# Score the SVM itself. Scoring `RnFr` here would print the random forest's
# accuracy under the SVM label.
print('Accuracy of SVM classifier on training set: {:.2f}'
      .format(svclassifier.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
      .format(svclassifier.score(X_test, y_test)))
71.
# AdaBoostClassifier: fit with library defaults and report accuracy.
from sklearn.ensemble import AdaBoostClassifier

ada_classifier = AdaBoostClassifier()
ada_classifier.fit(X_train, y_train)
# Score the AdaBoost model itself. Scoring `RnFr` here would print the random
# forest's accuracy under the AdaBoost label.
print('Accuracy of AdaBoostClassifier classifier on training set: {:.2f}'
      .format(ada_classifier.score(X_train, y_train)))
print('Accuracy of AdaBoostClassifier classifier on test set: {:.2f}'
      .format(ada_classifier.score(X_test, y_test)))
80.
# LogisticRegression: fit with library defaults and report accuracy.
from sklearn.linear_model import LogisticRegression

log_classifier = LogisticRegression()
log_classifier.fit(X_train, y_train)
# Score the logistic-regression model itself. Scoring `RnFr` here would print
# the random forest's accuracy under the LogisticRegression label.
print('Accuracy of LogisticRegression classifier on training set: {:.2f}'
      .format(log_classifier.score(X_train, y_train)))
print('Accuracy of LogisticRegression classifier on test set: {:.2f}'
      .format(log_classifier.score(X_test, y_test)))
89.
90.
Screenshots
This is multinomial NB
11
This is the Decision Tree classifier
12
This is SVM
Code file 2
# Load the train/test CSV files and build one free-text field per article
# ("total") that the later feature-extraction step consumes.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Only the first 1800 training rows and 500 test rows are read (nrows).
train = pd.read_csv(r'D:/Desktop/ML project final/train (3).csv', nrows=1800)
test = pd.read_csv(r'D:/Desktop/ML project final/test (1).csv', nrows=500)

# A missing title/author/text would make the concatenation below NaN,
# so empty cells are replaced with a blank first.
test = test.fillna(' ')
train = train.fillna(' ')

# Join title, author and body with a space between every part. Without the
# separator after the author, the author's last token would be fused with the
# first word of the article text and produce a spurious vocabulary entry.
test['total'] = test['title'] + ' ' + test['author'] + ' ' + test['text']
train['total'] = train['title'] + ' ' + train['author'] + ' ' + train['text']
13.
# Clean every training article and turn the cleaned corpus into a TF-IDF
# matrix (`tfidf`) with the matching label vector (`targets`).
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests
regex = re.compile('[^a-zA-Z]')  # anything that is not an ASCII letter

corpus = []
# Iterate over the column values directly instead of range(len(...)) plus a
# label lookup, so the loop does not depend on the frame having a 0..n-1 index.
for text in train['total']:
    # Non-letters become spaces, then lower-case and split into words.
    words = regex.sub(' ', text).lower().split()
    # Stop words are dropped before stemming; the remaining words are stemmed.
    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

# Unigram and bigram counts, re-weighted by TF-IDF without IDF smoothing.
# NOTE(review): the vocabulary and IDF weights are fitted on every row of
# `train`, before the train/test split later in this script, so the held-out
# rows influence the features. Consider fitting on the training split only.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(corpus)
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(counts)

targets = train['label'].values
35.
# Hold out part of the TF-IDF matrix for evaluation. No test_size is given, so
# scikit-learn's default split proportion applies; the seed is fixed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)

# Multinomial NB
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()
NB.fit(X_train, y_train)
print(f'Accuracy of NB classifier on training set: {NB.score(X_train, y_train):.2f}')
print(f'Accuracy of NB classifier on test set: {NB.score(X_test, y_test):.2f}')
46.
# Random Forest: fit with library defaults and report accuracy on both splits.
from sklearn.ensemble import RandomForestClassifier

RnFr = RandomForestClassifier()
RnFr.fit(X_train, y_train)
print(f'Accuracy of RandomForest classifier on training set: {RnFr.score(X_train, y_train):.2f}')
print(f'Accuracy of RandomForest classifier on test set: {RnFr.score(X_test, y_test):.2f}')
54.
# SVM: linear-kernel support vector classifier.
from sklearn.svm import SVC

# NOTE(review): per the scikit-learn docs, `gamma` only applies to the
# 'rbf', 'poly' and 'sigmoid' kernels, so it should have no effect here.
svclassifier = SVC(C=1, kernel='linear', gamma='auto', probability=True)
svclassifier.fit(X_train, y_train)
# Score the SVM itself. Scoring `RnFr` here would print the random forest's
# accuracy under the SVM label.
print('Accuracy of SVM classifier on training set: {:.2f}'
      .format(svclassifier.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
      .format(svclassifier.score(X_test, y_test)))
62.
14
# AdaBoostClassifier: fit with library defaults and report accuracy.
from sklearn.ensemble import AdaBoostClassifier

ada_classifier = AdaBoostClassifier()
ada_classifier.fit(X_train, y_train)
# Score the AdaBoost model itself. Scoring `RnFr` here would print the random
# forest's accuracy under the AdaBoost label.
print('Accuracy of AdaBoostClassifier classifier on training set: {:.2f}'
      .format(ada_classifier.score(X_train, y_train)))
print('Accuracy of AdaBoostClassifier classifier on test set: {:.2f}'
      .format(ada_classifier.score(X_test, y_test)))
70.
# LogisticRegression: fit with library defaults and report accuracy.
from sklearn.linear_model import LogisticRegression

log_classifier = LogisticRegression()
log_classifier.fit(X_train, y_train)
# Score the logistic-regression model itself. Scoring `RnFr` here would print
# the random forest's accuracy under the LogisticRegression label.
print('Accuracy of LogisticRegression classifier on training set: {:.2f}'
      .format(log_classifier.score(X_train, y_train)))
print('Accuracy of LogisticRegression classifier on test set: {:.2f}'
      .format(log_classifier.score(X_test, y_test)))
78.
79.
Screenshots
15