
3).

Product review dataset

In [ ]: import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import seaborn as sns

In [ ]: nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
Out[ ]: True

Load the dataset

In [ ]: with open('/content/dataset.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [ ]: # Initialize lists to store the labels and reviews
labels = []
reviews = []

In [ ]: # Process each line in the dataset
for line in lines:
    # Split the line on the '__label__' marker
    parts = line.split('__label__')
    # Keep only lines that contain exactly one label marker
    if len(parts) == 2:
        # The first token after the marker is the label; the rest is the review
        label = '__label__' + parts[1].strip().split()[0]
        review = ' '.join(parts[1].strip().split()[1:])
        labels.append(label)
        reviews.append(review)
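
For reference, each raw line follows the fastText-style convention of a label tag followed by the review text. A minimal sketch of how one such line is split (the sample line below is reconstructed for illustration, not read from the file):

In [ ]: # Illustrative only: a reconstructed line in the expected '__label__' format
sample = '__label__2 Great CD: My lovely Pat has one of the GREAT voices...'
parts = sample.split('__label__')   # ['', '2 Great CD: ...']
tokens = parts[1].strip().split()
print('__label__' + tokens[0])      # __label__2
print(' '.join(tokens[1:]))         # Great CD: My lovely Pat has one of the GREAT voices...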

Creating a dataframe
In [ ]: # Create a DataFrame
df = pd.DataFrame({'label': labels, 'review': reviews})

# Map labels to sentiments ('__label__1' is negative, '__label__2' is positive)
sentiment_map = {
    '__label__1': 'negative',
    '__label__2': 'positive'
}
df['sentiment'] = df['label'].map(sentiment_map)

# Drop the raw 'label' column
df.drop(columns=['label'], inplace=True)

# Display the first and last rows
print(df.head())
print(df.tail())

review sentiment
0 Great CD: My lovely Pat has one of the GREAT v... positive
1 One of the best game music soundtracks - for a... positive
2 Batteries died within a year ...: I bought thi... negative
3 works fine, but Maha Energy is better: Check o... positive
4 Great for the non-audiophile: Reviewed quite a... positive
review sentiment
399995 Unbelievable- In a Bad Way: We bought this Tho... negative
399996 Almost Great, Until it Broke...: My son reciev... negative
399997 Disappointed !!!: I bought this toy for my son... negative
399998 Classic Jessica Mitford: This is a compilation... positive
399999 Comedy Scene, and Not Heard: This DVD will be ... negative

Preprocessing
In [ ]: def preprocess_text(text):
    text = text.lower()                        # Convert to lowercase
    tokenizer = nltk.RegexpTokenizer(r'\w+')   # Keep word characters only (drops punctuation)
    tokens = tokenizer.tokenize(text)          # Tokenize
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]              # Stemming
    return ' '.join(tokens)

In [ ]: df['review'] = df['review'].apply(preprocess_text)
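
A quick sanity check on a made-up sentence confirms the lowercasing, stopword removal, and stemming (the exact output depends on the NLTK stopword list and the Porter stemmer):

In [ ]: # Illustrative only: the sentence is invented for this check
print(preprocess_text("The batteries died within a YEAR of purchase!"))
# should print something like: batteri die within year purchas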

Sentiment Distribution
In [ ]: df['sentiment'].value_counts().plot(kind='bar')
plt.title('Product Review Data Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

Vectorization using TF-IDF


In [ ]: tfidf = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
X = tfidf.fit_transform(df['review']).toarray()
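
Note that .toarray() materializes a dense 400,000 × 1,500 float matrix (several GB); both classifiers below also accept the sparse matrix directly, so the conversion can be skipped on memory-constrained machines. To see which stemmed terms the vectorizer kept, the learned vocabulary can be inspected; a minimal sketch, assuming scikit-learn ≥ 1.0 (where get_feature_names_out is available):

In [ ]: # Inspect the vocabulary retained under max_features=1500, min_df=5, max_df=0.7
features = tfidf.get_feature_names_out()
print(len(features))   # at most 1500
print(features[:10])   # first few terms, alphabetically ordered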

Split the data into training and testing sets


In [ ]: y = df['sentiment'].map({'positive': 1, 'negative': 0}).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [ ]: y

Out[ ]: array([1, 1, 0, ..., 0, 1, 0])

Logistic Regression classifier


In [ ]: lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)
lr_pred = lr_classifier.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.86795

Naive Bayes classifier


In [ ]: nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
nb_pred = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.827925
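
classification_report is imported above but never called; it produces the same per-class precision, recall, and F1 breakdown in one step, e.g.:

In [ ]: # Per-class metric breakdown for both models
print(classification_report(y_test, lr_pred, target_names=['negative', 'positive']))
print(classification_report(y_test, nb_pred, target_names=['negative', 'positive']))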

In [ ]: def plot_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.show()

Confusion matrix for Logistic Regression


In [ ]: plot_confusion_matrix(y_test, lr_pred, "Logistic Regression")

Confusion matrix for Naive Bayes


In [ ]: plot_confusion_matrix(y_test, nb_pred, "Naive Bayes")

Accuracy, Precision, Recall, and F1 score for Logistic Regression and Naive Bayes
In [ ]: def print_evaluation_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"----------- {model_name} Evaluation Metrics -----------")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [ ]: print_evaluation_metrics(y_test, lr_pred, "Logistic Regression")
print_evaluation_metrics(y_test, nb_pred, "Naive Bayes")

----------- Logistic Regression Evaluation Metrics -----------
Accuracy: 0.8679
Precision: 0.8680
Recall: 0.8679
F1 Score: 0.8679
----------- Naive Bayes Evaluation Metrics -----------
Accuracy: 0.8279
Precision: 0.8280
Recall: 0.8279
F1 Score: 0.8279
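
Finally, the fitted vectorizer and the stronger logistic regression model can be reused on unseen text; a minimal sketch (the sample reviews are invented for illustration):

In [ ]: # Illustrative only: classify new reviews with the trained pipeline
new_reviews = [
    "This charger stopped working after two days. Total waste of money.",
    "Wonderful book, I could not put it down!",
]
new_X = tfidf.transform([preprocess_text(r) for r in new_reviews]).toarray()
for review, pred in zip(new_reviews, lr_classifier.predict(new_X)):
    print('positive' if pred == 1 else 'negative', '->', review)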
