17 - Source Code - nlp-2-5
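# Imports assumed by this excerpt (the notebook's import cell is not shown on
# these pages); the modules below are inferred from how the names are used.
import re
import regex
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report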

# Visualize top categories
top_n = 5
top_categories = df['News Categories'].value_counts().nlargest(top_n).index
df_top = df[df['News Categories'].isin(top_categories)]
sns.countplot(x='News Categories', data=df_top, palette='viridis')
plt.title(f'Top {top_n} News Categories')
plt.xlabel('Categories')
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.show()

# Preprocess the 'News Categories'
import string
punc = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', punc))

df['News Categories'] = df['News Categories'].apply(remove_punc)

# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day

# Drop the original Date column
df = df.drop('Date', axis=1)

# Lowercase the Content
df['Content'] = df['Content'].str.lower()

# Check for HTML tags
def has_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return bool(soup.find())

df['has_html_tags'] = df['Content'].apply(has_html_tags)
count_true = df['has_html_tags'].sum()
df = df.drop('has_html_tags', axis=1)

# Remove emojis
def has_emoji(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return bool(emoji_pattern.search(text))

def remove_emojis(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.sub('', text)

df['Content'] = df['Content'].apply(remove_emojis)

# Remove URLs
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)

df['Content'] = df['Content'].apply(remove_url)

# Remove punctuation from Content
df['Content'] = df['Content'].apply(remove_punc)

# Remove stopwords
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

df['Content'] = df['Content'].apply(remove_stopwords)

# Replace abbreviations
abbreviation_dict = {
    'LOL': 'laugh out loud',
    'BRB': 'be right back',
    # ... (other abbreviations)
    'TTYL': 'talk to you later'
}

def replace_abbreviations(text, abbreviation_dict):
    # Content has already been lowercased above, so match the keys in lowercase
    for abbreviation, full_form in abbreviation_dict.items():
        text = text.replace(abbreviation.lower(), full_form)
    return text

df['Content'] = df['Content'].apply(lambda x: replace_abbreviations(x, abbreviation_dict))
# Tokenization (word_tokenize / sent_tokenize need the 'punkt' tokenizer models)
nltk.download('punkt')

def tokenize_text(text):
    words_list = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
    return ' '.join(' '.join(words) for words in words_list)

df['Content'] = df['Content'].apply(tokenize_text)

# Prepare for model training
X = df['Content']
y = df['category_grouped']

# Encoding labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_weights_train = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
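
# Note: `class_weights_train` is never consumed by the pipeline as written,
# since MultinomialNB has no class_weight parameter. One possible option
# (an assumption, not part of the original code) is to turn the class weights
# into per-sample weights after the split below and pass them to the final
# pipeline step at fit time:
#
#   sample_weights = class_weights_train[y_train]
#   model.fit(X_train, y_train, multinomialnb__sample_weight=sample_weights)
#
# This relies on scikit-learn's step-prefixed fit parameters for pipelines.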

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Multinomial Naive Bayes with Bag of Words
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB with Bag of Words accuracy: {accuracy:.3f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Cross-validation
cv_scores = cross_val_score(model, X, y_encoded,
                            cv=StratifiedKFold(n_splits=3, shuffle=True),
                            scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'countvectorizer__max_features': [5000, 10000, None],
    'countvectorizer__ngram_range': [(1, 1), (1, 2)],
    'multinomialnb__alpha': uniform(0.1, 2.0)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5,
                                   scoring='accuracy', cv=cv, verbose=1, n_jobs=1)
random_search.fit(X, y_encoded)

best_params = random_search.best_params_
print("Best Parameters:", best_params)

best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

y_pred_best = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy:.3f}")
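
# Note (an observation, not in the original excerpt): the randomized search was
# fit on the full X / y_encoded, so X_test was part of the tuning data and this
# accuracy may be slightly optimistic compared to a fully held-out evaluation.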

# Inverse transform the predicted labels to get the original class labels
predicted_labels_original = le.inverse_transform(y_pred_best)
correct_predictions = sum(y_test == y_pred_best)
wrong_predictions = len(y_test) - correct_predictions
print(f'Correct Predictions: {correct_predictions}, Wrong Predictions: {wrong_predictions}')

# Visualization of predictions
labels = ['Correct Predictions', 'Wrong Predictions']
values = [correct_predictions, wrong_predictions]
plt.bar(labels, values, color=['green', 'red'])
plt.title('Correct vs Wrong Predictions')
plt.xlabel('Prediction Outcome')
plt.ylabel('Number of Samples')
plt.show()

# Final DataFrame with text and predicted labels
final_df = pd.DataFrame({'Content': X_test,
                         'Predicted_Labels': predicted_labels_original,
                         'Actual_Labels': le.inverse_transform(y_test)})
print(final_df.head())
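
# A minimal usage sketch (not part of the original excerpt): classify a new
# article with the tuned pipeline. `new_text` is a hypothetical placeholder and
# should go through the same preprocessing steps applied to the training data.
new_text = "sample article text after the same preprocessing"
new_pred = best_model.predict([new_text])
print("Predicted category:", le.inverse_transform(new_pred)[0])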
