17 - Source Code - nlp-2-5

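# ---------------------------------------------------------------------------
# Note: this excerpt starts mid-script; the DataFrame `df` and the imports it
# relies on are presumably defined on earlier pages. The block below is a
# minimal sketch of the imports the code in this excerpt appears to assume
# (not part of the original listing).
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import re
import regex
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report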
# Visualize top categories

top_n = 5
top_categories = df['News Categories'].value_counts().nlargest(top_n).index
df_top = df[df['News Categories'].isin(top_categories)]
sns.countplot(x='News Categories', data=df_top, palette='viridis')
plt.title(f'Top {top_n} News Categories')
plt.xlabel('Categories')
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.show()

# Preprocess the 'News Categories'


import string
punc = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', punc))

df['News Categories'] = df['News Categories'].apply(remove_punc)
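# e.g. (illustrative) remove_punc("hello, world!") -> "hello world"; only the
# ASCII characters in string.punctuation are stripped.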

# Convert Date to datetime


df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
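# Note: format='mixed' (added in pandas 2.0) infers the format of each element
# individually; dayfirst=True acts as a hint for ambiguous day/month orderings.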
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day

# Drop the original Date column


df = df.drop('Date', axis=1)

# Lowercase the Content


df['Content'] = df['Content'].str.lower()

# Check for HTML tags


def has_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return bool(soup.find())

df['has_html_tags'] = df['Content'].apply(has_html_tags)
count_true = df['has_html_tags'].sum()
df = df.drop('has_html_tags', axis=1)
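# The block above only counts how many rows contain HTML tags; the tags are
# left in the text. If stripping them is desired, a minimal sketch (an
# assumption, not part of the original listing) would be:
# df['Content'] = df['Content'].apply(
#     lambda t: BeautifulSoup(t, 'html.parser').get_text())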

# Remove emojis
def has_emoji(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return bool(emoji_pattern.search(text))

def remove_emojis(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.sub('', text)

df['Content'] = df['Content'].apply(remove_emojis)

# Remove URLs
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)

df['Content'] = df['Content'].apply(remove_url)

# Remove punctuation from Content


df['Content'] = df['Content'].apply(remove_punc)

# Remove stopwords
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

df['Content'] = df['Content'].apply(remove_stopwords)
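# e.g. (illustrative) remove_stopwords("this is a breaking news story")
# -> "breaking news story"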

# Replace abbreviations
abbreviation_dict = {
    'LOL': 'laugh out loud',
    'BRB': 'be right back',
    # ... (other abbreviations)
    'TTYL': 'talk to you later'
}

def replace_abbreviations(text, abbreviation_dict):
    for abbreviation, full_form in abbreviation_dict.items():
        text = text.replace(abbreviation, full_form)
    return text

df['Content'] = df['Content'].apply(lambda x: replace_abbreviations(x, abbreviation_dict))
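# Note: 'Content' was lowercased earlier, so uppercase keys such as 'LOL' or
# 'BRB' will not match at this point. For the replacements to take effect, the
# keys would need to be lowercased too, e.g. (a sketch):
# abbreviation_dict = {k.lower(): v for k, v in abbreviation_dict.items()}
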
# Tokenization
def tokenize_text(text):
    words_list = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
    return ' '.join(' '.join(words) for words in words_list)

df['Content'] = df['Content'].apply(tokenize_text)
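# Note: because punctuation has already been stripped, tokenizing and
# re-joining here mostly amounts to normalizing whitespace between tokens.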

# Prepare for model training


X = df['Content']
y = df['category_grouped']

# Encoding labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_weights_train = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
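# Note: class_weights_train is computed here but never passed to the model
# below. MultinomialNB has no class_weight parameter, though it does accept
# per-sample weights at fit time; a hedged sketch, if the weights are meant to
# be applied after the split:
# sample_weight = class_weights_train[y_train]
# model.fit(X_train, y_train, multinomialnb__sample_weight=sample_weight)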

# Split the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

# Multinomial Naive Bayes with Bag of Words


model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB with Bag of Words accuracy: {accuracy:.3f}")

# Print classification report


print("Classification Report:\n", classification_report(y_test, y_pred))

# Cross-validation
cv_scores = cross_val_score(model, X, y_encoded,
                            cv=StratifiedKFold(n_splits=3, shuffle=True),
                            scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'countvectorizer__max_features': [5000, 10000, None],
    'countvectorizer__ngram_range': [(1, 1), (1, 2)],
    'multinomialnb__alpha': uniform(0.1, 2.0)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=5, scoring='accuracy', cv=cv,
                                   verbose=1, n_jobs=1)
random_search.fit(X, y_encoded)
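# Note: RandomizedSearchCV refits the best pipeline on all of X/y_encoded by
# default (refit=True), and the search itself sees the rows later used as the
# test set, so the accuracy reported on X_test below is likely somewhat
# optimistic; running the search on X_train/y_train only would avoid that.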

best_params = random_search.best_params_
print("Best Parameters:", best_params)

best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

y_pred_best = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy:.3f}")

# Inverse transform the predicted labels to get the original class labels
predicted_labels_original = le.inverse_transform(y_pred_best)
correct_predictions = sum(y_test == y_pred_best)
wrong_predictions = len(y_test) - correct_predictions
print(f'Correct Predictions: {correct_predictions}, Wrong Predictions: {wrong_predictions}')

# Visualization of predictions
labels = ['Correct Predictions', 'Wrong Predictions']
values = [correct_predictions, wrong_predictions]
plt.bar(labels, values, color=['green', 'red'])
plt.title('Correct vs Wrong Predictions')
plt.xlabel('Prediction Outcome')
plt.ylabel('Number of Samples')
plt.show()

# Final DataFrame with text and predicted labels


final_df = pd.DataFrame({'Content': X_test,
                         'Predicted_Labels': predicted_labels_original,
                         'Actual_Labels': le.inverse_transform(y_test)})
print(final_df.head())
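# A hedged usage sketch for classifying new text (the sample headline is made
# up; in practice the same preprocessing steps above should be applied to it
# first):
# sample = ["government unveils new budget plan for the coming year"]
# print(le.inverse_transform(best_model.predict(sample)))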
