17 - Source Code - nlp-2-5
# Plot the distribution of the five most frequent news categories
import seaborn as sns
import matplotlib.pyplot as plt

top_n = 5
top_categories = df['News Categories'].value_counts().nlargest(top_n).index
df_top = df[df['News Categories'].isin(top_categories)]
sns.countplot(x='News Categories', data=df_top, palette='viridis')
plt.title(f'Top {top_n} News Categories')
plt.xlabel('Categories')
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.show()
# Remove punctuation
from string import punctuation as punc
def remove_punc(text):
    return text.translate(str.maketrans('', '', punc))
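# has_html_tags is called below but never defined in this listing; a minimal
# sketch (an assumption, not from the original) using a simple tag regex:
import re

def has_html_tags(text):
    # True if the text contains anything that looks like an HTML tag
    return bool(re.search(r'<[^>]+>', text))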
df['has_html_tags'] = df['Content'].apply(has_html_tags)
count_true = df['has_html_tags'].sum()
df = df.drop('has_html_tags', axis=1)
# Remove emojis (the third-party `regex` module supports the \p{Emoji} property)
import regex

def has_emoji(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return bool(emoji_pattern.search(text))

def remove_emojis(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.sub('', text)

df['Content'] = df['Content'].apply(remove_emojis)
# Remove URLs
import re

def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)

df['Content'] = df['Content'].apply(remove_url)
# Remove stopwords
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    # NLTK's stopword list is lowercase, so lowercase the text first for full effect
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

df['Content'] = df['Content'].apply(remove_stopwords)
# Replace abbreviations
abbreviation_dict = {
    'LOL': 'laugh out loud',
    'BRB': 'be right back',
    # ... (other abbreviations)
    'TTYL': 'talk to you later'
}
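# replace_abbreviations is used below but not defined in this listing; a minimal
# sketch (an assumption, not from the original) that swaps whole words found in
# the dictionary and leaves everything else unchanged:
def replace_abbreviations(text, abbreviations):
    return " ".join(abbreviations.get(word, word) for word in text.split())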
df['Content'] = df['Content'].apply(lambda x: replace_abbreviations(x, abbreviation_dict))
# Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

def tokenize_text(text):
    words_list = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
    return ' '.join(' '.join(words) for words in words_list)

df['Content'] = df['Content'].apply(tokenize_text)
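# The encoding and modelling steps below refer to X and y without defining them;
# presumably the features are the cleaned text and the target is the category
# column (an assumption, not from the original):
X = df['Content']
y = df['News Categories']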
# Encoding labels
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_weights_train = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
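# `model`, X_train/X_test and y_train/y_test are used below but never built in this
# listing; a minimal sketch (an assumption, inferred from the countvectorizer__ /
# multinomialnb__ parameter names used later) of a bag-of-words pipeline and split:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)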
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB with Bag of Words accuracy: {accuracy:.3f}")
# Cross-validation
from sklearn.model_selection import cross_val_score, StratifiedKFold

cv_scores = cross_val_score(model, X, y_encoded,
                            cv=StratifiedKFold(n_splits=3, shuffle=True),
                            scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")
# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'countvectorizer__max_features': [5000, 10000, None],
    'countvectorizer__ngram_range': [(1, 1), (1, 2)],
    'multinomialnb__alpha': uniform(0.1, 2.0)  # alpha sampled uniformly from [0.1, 2.1]
}
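# random_search is read below but never constructed in this listing; a minimal
# sketch (an assumption, not from the original) wiring the pipeline and
# param_dist into a randomized search:
random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=10, cv=3, scoring='accuracy',
                                   random_state=42)
random_search.fit(X_train, y_train)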
best_params = random_search.best_params_
print("Best Parameters:", best_params)
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy:.3f}")
# Visualization of predictions
labels = ['Correct Predictions', 'Wrong Predictions']
values = [correct_predictions, wrong_predictions]
plt.bar(labels, values, color=['green', 'red'])
plt.title('Correct vs Wrong Predictions')
plt.xlabel('Prediction Outcome')
plt.ylabel('Number of Samples')
plt.show()