Q 3
Q 3
In [ ]: import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# In [ ]:
# Fetch the NLTK resources requested by this notebook: the 'punkt' tokenizer
# models and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
Creating a dataframe
# In [ ]:
# Assemble the review table: one row per review holding its raw label, the
# review text, and the sentiment name that `sentiment_map` assigns to the label.
# NOTE(review): in the sample output printed below this cell, clearly positive
# reviews ("Great CD", "One of the best game music soundtracks") appear as
# 'negative' while complaints ("Disappointed !!!", "Batteries died within a
# year") appear as 'positive' -- verify that `sentiment_map` (defined outside
# this section) is not inverted.
columns = {'label': labels, 'review': reviews}
df = pd.DataFrame(columns)
df = df.assign(sentiment=df['label'].map(sentiment_map))
print(df.tail())
review sentiment
0 Great CD: My lovely Pat has one of the GREAT v... negative
1 One of the best game music soundtracks - for a... negative
2 Batteries died within a year ...: I bought thi... positive
3 works fine, but Maha Energy is better: Check o... negative
4 Great for the non-audiophile: Reviewed quite a... negative
review sentiment
399995 Unbelievable- In a Bad Way: We bought this Tho... positive
399996 Almost Great, Until it Broke...: My son reciev... positive
399997 Disappointed !!!: I bought this toy for my son... positive
399998 Classic Jessica Mitford: This is a compilation... negative
399999 Comedy Scene, and Not Heard: This DVD will be ... positive
Preprocessing
# In [ ]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_tools():
    """Build the tokenizer, stop-word set and stemmer once and reuse them.

    These objects do not depend on the text being processed, so rebuilding
    them for every review is wasted work when the function is applied to a
    whole DataFrame column.
    """
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    stop_words = frozenset(stopwords.words('english'))
    stemmer = PorterStemmer()
    return tokenizer, stop_words, stemmer


def preprocess_text(text):
    """Normalize one review for vectorization.

    Steps, in order: lowercase the text, split it into ``\\w+`` tokens
    (which drops punctuation), remove English stop words, and reduce each
    remaining token to its Porter stem.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The surviving stems joined by single spaces; an empty string when
        no token survives.
    """
    tokenizer, stop_words, stemmer = _text_tools()
    tokens = tokenizer.tokenize(text.lower())
    # Filter and stem in a single pass; stop words are matched against the
    # lowercased, un-stemmed token, exactly as before.
    return ' '.join(
        stemmer.stem(word) for word in tokens if word not in stop_words
    )
# In [ ]:
# Replace every raw review with its preprocessed form (in place, same column).
df['review'] = df['review'].map(preprocess_text)
Sentiment Distribution
# In [ ]:
# Bar chart of how many reviews fall under each sentiment value.
sentiment_counts = df['sentiment'].value_counts()
ax = sentiment_counts.plot(kind='bar')
ax.set_title('Product Review Data Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()
# Notebook cell that echoes `y`. `y` is not defined in the visible part of
# this file -- presumably the target/label vector; TODO confirm where it is set.
In [ ]: y
Accuracy, Precision, Recall, and F1 score for Logistic Regression and Naive Bayes
In [ ]: def print_evaluation_metrics(y_true, y_pred, model_name):
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')