# # FakeNewsDetection (Student)
import numpy as np
import string
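# The cells below also use pandas, matplotlib, NLTK and seaborn; these imports
# are assumed here, since the corresponding lines are not present in the cells above.
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk import tokenize
import seaborn as sns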
# ## Read datasets
fake = pd.read_csv("data/Fake.csv")
true = pd.read_csv("data/True.csv")
fake.shape
true.shape
fake['target'] = 'fake'
true['target'] = 'true'
# Concatenate dataframes
data = pd.concat([fake, true])
data.shape
data = data.reset_index(drop=True)
data.head()
data.drop(["date"],axis=1,inplace=True)
data.head()
data.drop(["title"],axis=1,inplace=True)
data.head()
# Convert to lowercase
data['text'] = data['text'].apply(lambda x: x.lower())
data.head()
# Remove punctuation
#Insert code
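# A minimal sketch of the helper the next line expects, assuming simple
# character filtering against string.punctuation (the exact implementation
# intended by the exercise may differ).
def punctuation_removal(text):
    # keep every character that is not a punctuation symbol
    return ''.join(ch for ch in text if ch not in string.punctuation)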
data['text'] = data['text'].apply(punctuation_removal)
# Check
data.head()
# Removing stopwords
#Insert code
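# A minimal sketch, assuming NLTK's English stopword list; the split/join
# approach is an assumption, not necessarily the exercise's intended solution.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
data['text'] = data['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop))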
data.head()
# Article counts per subject
print(data.groupby(['subject'])['text'].count())
data.groupby(['subject'])['text'].count().plot(kind="bar")
plt.show()
# Article counts per class (fake vs. true)
print(data.groupby(['target'])['text'].count())
data.groupby(['target'])['text'].count().plot(kind="bar")
plt.show()
# Most frequent words: helper that tokenizes a text column and plots the top-N
# word frequencies (the seaborn barplot and the top-N selection are assumptions).
token_space = tokenize.WhitespaceTokenizer()
def counter(text_df, column_text, quantity):
    all_words = ' '.join(text_df[column_text])
    token_phrase = token_space.tokenize(all_words)
    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({"Word": list(frequency.keys()),
                                 "Frequency": list(frequency.values())})
    df_frequency = df_frequency.nlargest(n=quantity, columns="Frequency")
    plt.figure(figsize=(12,8))
    ax = sns.barplot(data=df_frequency, x="Word", y="Frequency", color='blue')
    ax.set(ylabel = "Count")
    plt.xticks(rotation='vertical')
    plt.show()
#Insert code
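# A hedged example of using the helper above: plot the most frequent words in
# the fake and in the true articles (the value 20 is an arbitrary choice).
counter(data[data["target"] == "fake"], "text", 20)
counter(data[data["target"] == "true"], "text", 20)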
# ## Modeling
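# Assumed setup for the modeling cells below: scikit-learn imports and a
# train/test split (the 80/20 split and the random_state are assumptions).
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'],
                                                    test_size=0.2, random_state=42)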
import itertools
# Confusion-matrix plotting helper, reconstructed from the scikit-learn
# documentation example these fragments follow.
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix, without normalization")
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# # **Naive Bayes**
print("------Naive Bayes-----")
dct = dict()  # collects each model's accuracy (%) for the comparison plot at the end
NB_classifier = MultinomialNB()
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', NB_classifier)])
model = pipe.fit(X_train, y_train)
prediction = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, prediction)
print("accuracy: {:.2f}%".format(accuracy * 100))
dct['Naive Bayes'] = accuracy * 100
cm = metrics.confusion_matrix(y_test, prediction)
plot_confusion_matrix(cm, classes=['fake', 'true'])
plt.show()
# # **Logistic regression**
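# A minimal sketch, mirroring the Naive Bayes pipeline above; the max_iter
# value is an illustrative assumption, not necessarily the assignment's choice.
from sklearn.linear_model import LogisticRegression
print("------Logistic Regression-----")
LR_pipe = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('model', LogisticRegression(max_iter=1000))])
LR_model = LR_pipe.fit(X_train, y_train)
LR_pred = LR_model.predict(X_test)
LR_acc = metrics.accuracy_score(y_test, LR_pred)
print("accuracy: {:.2f}%".format(LR_acc * 100))
dct['Logistic Regression'] = LR_acc * 100
plot_confusion_matrix(metrics.confusion_matrix(y_test, LR_pred), classes=['fake', 'true'])
plt.show()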
# # **Decision Tree**
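# A minimal sketch with the same pipeline and a decision tree; max_depth and
# random_state are illustrative assumptions.
from sklearn.tree import DecisionTreeClassifier
print("------Decision Tree-----")
DT_pipe = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('model', DecisionTreeClassifier(max_depth=20, random_state=42))])
DT_model = DT_pipe.fit(X_train, y_train)
DT_pred = DT_model.predict(X_test)
DT_acc = metrics.accuracy_score(y_test, DT_pred)
print("accuracy: {:.2f}%".format(DT_acc * 100))
dct['Decision Tree'] = DT_acc * 100
plot_confusion_matrix(metrics.confusion_matrix(y_test, DT_pred), classes=['fake', 'true'])
plt.show()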
# # **Random Forest**
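# A minimal sketch; the number of trees is an illustrative assumption.
from sklearn.ensemble import RandomForestClassifier
print("------Random Forest-----")
RF_pipe = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('model', RandomForestClassifier(n_estimators=100, random_state=42))])
RF_model = RF_pipe.fit(X_train, y_train)
RF_pred = RF_model.predict(X_test)
RF_acc = metrics.accuracy_score(y_test, RF_pred)
print("accuracy: {:.2f}%".format(RF_acc * 100))
dct['Random Forest'] = RF_acc * 100
plot_confusion_matrix(metrics.confusion_matrix(y_test, RF_pred), classes=['fake', 'true'])
plt.show()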
# ## **SVM**
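# A minimal sketch using a linear SVM (LinearSVC); the original section may
# have used a different kernel or estimator.
from sklearn.svm import LinearSVC
print("------SVM-----")
SVM_pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', LinearSVC())])
SVM_model = SVM_pipe.fit(X_train, y_train)
SVM_pred = SVM_model.predict(X_test)
SVM_acc = metrics.accuracy_score(y_test, SVM_pred)
print("accuracy: {:.2f}%".format(SVM_acc * 100))
dct['SVM'] = SVM_acc * 100
plot_confusion_matrix(metrics.confusion_matrix(y_test, SVM_pred), classes=['fake', 'true'])
plt.show()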
# Compare model accuracies
plt.figure(figsize=(8,7))
plt.bar(list(dct.keys()),list(dct.values()))
plt.ylim(90,100)
plt.yticks((91, 92, 93, 94, 95, 96, 97, 98, 99, 100))
plt.show()