17 - Source Code - nlp-2-5

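# ---------------------------------------------------------------------------
# Note: this excerpt starts mid-script; the DataFrame `df` and the imports it
# relies on are presumably defined on earlier pages. The block below is a
# minimal sketch of the imports the code in this excerpt appears to assume
# (not part of the original listing).
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import re
import regex
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report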
# Visualize top categories

top_n = 5
top_categories = df['News Categories'].value_counts().nlargest(top_n).index
df_top = df[df['News Categories'].isin(top_categories)]
sns.countplot(x='News Categories', data=df_top, palette='viridis')
plt.title(f'Top {top_n} News Categories')
plt.xlabel('Categories')
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.show()

# Preprocess the 'News Categories'


import string
punc = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', punc))

df['News Categories'] = df['News Categories'].apply(remove_punc)
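# e.g. (illustrative) remove_punc("hello, world!") -> "hello world"; only the
# ASCII characters in string.punctuation are stripped.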

# Convert Date to datetime


df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
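# Note: format='mixed' (added in pandas 2.0) infers the format of each element
# individually; dayfirst=True acts as a hint for ambiguous day/month orderings.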
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day

# Drop the original Date column


df = df.drop('Date', axis=1)

# Lowercase the Content


df['Content'] = df['Content'].str.lower()

# Check for HTML tags


def has_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return bool(soup.find())

df['has_html_tags'] = df['Content'].apply(has_html_tags)
count_true = df['has_html_tags'].sum()
df = df.drop('has_html_tags', axis=1)
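# The block above only counts how many rows contain HTML tags; the tags are
# left in the text. If stripping them is desired, a minimal sketch (an
# assumption, not part of the original listing) would be:
# df['Content'] = df['Content'].apply(
#     lambda t: BeautifulSoup(t, 'html.parser').get_text())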

# Remove emojis
def has_emoji(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return bool(emoji_pattern.search(text))

def remove_emojis(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.sub('', text)

df['Content'] = df['Content'].apply(remove_emojis)

# Remove URLs
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)

df['Content'] = df['Content'].apply(remove_url)

# Remove punctuation from Content


df['Content'] = df['Content'].apply(remove_punc)

# Remove stopwords
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

df['Content'] = df['Content'].apply(remove_stopwords)
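# e.g. (illustrative) remove_stopwords("this is a breaking news story")
# -> "breaking news story"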

# Replace abbreviations
abbreviation_dict = {
    'LOL': 'laugh out loud',
    'BRB': 'be right back',
    # ... (other abbreviations)
    'TTYL': 'talk to you later'
}

def replace_abbreviations(text, abbreviation_dict):
    for abbreviation, full_form in abbreviation_dict.items():
        text = text.replace(abbreviation, full_form)
    return text

df['Content'] = df['Content'].apply(lambda x: replace_abbreviations(x, abbreviation_dict))
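# Note: 'Content' was lowercased earlier, so uppercase keys such as 'LOL' or
# 'BRB' will not match at this point. For the replacements to take effect, the
# keys would need to be lowercased too, e.g. (a sketch):
# abbreviation_dict = {k.lower(): v for k, v in abbreviation_dict.items()}
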
# Tokenization
def tokenize_text(text):
    words_list = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
    return ' '.join(' '.join(words) for words in words_list)

df['Content'] = df['Content'].apply(tokenize_text)
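# Note: because punctuation has already been stripped, tokenizing and
# re-joining here mostly amounts to normalizing whitespace between tokens.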

# Prepare for model training


X = df['Content']
y = df['category_grouped']

# Encoding labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_weights_train = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
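# Note: class_weights_train is computed here but never passed to the model
# below. MultinomialNB has no class_weight parameter, though it does accept
# per-sample weights at fit time; a hedged sketch, if the weights are meant to
# be applied after the split:
# sample_weight = class_weights_train[y_train]
# model.fit(X_train, y_train, multinomialnb__sample_weight=sample_weight)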

# Split the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

# Multinomial Naive Bayes with Bag of Words


model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB with Bag of Words accuracy: {accuracy:.3f}")

# Print classification report


print("Classification Report:\n", classification_report(y_test, y_pred))

# Cross-validation
cv_scores = cross_val_score(model, X, y_encoded,
                            cv=StratifiedKFold(n_splits=3, shuffle=True),
                            scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'countvectorizer__max_features': [5000, 10000, None],
    'countvectorizer__ngram_range': [(1, 1), (1, 2)],
    'multinomialnb__alpha': uniform(0.1, 2.0)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=5, scoring='accuracy', cv=cv,
                                   verbose=1, n_jobs=1)
random_search.fit(X, y_encoded)
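# Note: RandomizedSearchCV refits the best pipeline on all of X/y_encoded by
# default (refit=True), and the search itself sees the rows later used as the
# test set, so the accuracy reported on X_test below is likely somewhat
# optimistic; running the search on X_train/y_train only would avoid that.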

best_params = random_search.best_params_
print("Best Parameters:", best_params)

best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

y_pred_best = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy:.3f}")

# Inverse transform the predicted labels to get the original class labels
predicted_labels_original = le.inverse_transform(y_pred_best)
correct_predictions = sum(y_test == y_pred_best)
wrong_predictions = len(y_test) - correct_predictions
print(f'Correct Predictions: {correct_predictions}, Wrong Predictions: {wrong_predictions}')

# Visualization of predictions
labels = ['Correct Predictions', 'Wrong Predictions']
values = [correct_predictions, wrong_predictions]
plt.bar(labels, values, color=['green', 'red'])
plt.title('Correct vs Wrong Predictions')
plt.xlabel('Prediction Outcome')
plt.ylabel('Number of Samples')
plt.show()

# Final DataFrame with text and predicted labels


final_df = pd.DataFrame({'Content': X_test,
                         'Predicted_Labels': predicted_labels_original,
                         'Actual_Labels': le.inverse_transform(y_test)})
print(final_df.head())
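# A hedged usage sketch for classifying new text (the sample headline is made
# up; in practice the same preprocessing steps above should be applied to it
# first):
# sample = ["government unveils new budget plan for the coming year"]
# print(le.inverse_transform(best_model.predict(sample)))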
