0% found this document useful (0 votes)
12 views

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

racoon97970301
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

racoon97970301
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

3/26/24, 4:07 PM 1_sentiment_analysis_nlp.

ipynb - Colaboratory

keyboard_arrow_down Mounting the drive


from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDriveb/nlp_project
!ls #checking if files are there or not

keyboard_arrow_down Importing the dataset


import pandas as pd

# Load the tab-separated review file.
data = pd.read_csv('amazon_alexa.tsv', sep='\t')

data.head(10)  # the dataset is labelled in a binary format

# Keep only the review text and the feedback flag, under shorter column names.
mydata = data[['verified_reviews', 'feedback']].rename(
    columns={'verified_reviews': 'review', 'feedback': 'label'}
)

mydata.head()

# Check the distribution of the label column.
mydata.value_counts('label')

As can be seen above, the dataset is imbalanced, so we will undersample the majority class to balance it.

# Count how many rows carry each label.
label_counts = mydata["label"].value_counts()

# Size gap between the largest and the smallest class: this many rows have to
# be removed from the majority class to even the two out.
rows_to_drop = label_counts.max() - label_counts.min()

if rows_to_drop > 0:
    # Derive the majority class from the counts instead of assuming it is
    # label 1, so the cell still balances correctly if the skew is reversed.
    majority_label = label_counts.idxmax()
    data_majority = mydata[mydata["label"] == majority_label]
    # Randomly pick the surplus majority rows and drop them by index label.
    data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
    # Already balanced: work on a copy so later column additions do not
    # touch mydata.
    data_balanced = mydata.copy()

# Check the new class balance.
print(data_balanced["label"].value_counts())

The dataset above is balanced.

keyboard_arrow_down Data preprocessing


#defining a function to clean the dataset
import re

def clean_text(text):
    """Normalise a raw review string for sentiment classification.

    Steps, in order: remove HTML tags, replace punctuation/special characters
    with spaces, remove stand-alone single ASCII letters, lowercase, collapse
    runs of whitespace, and trim both ends.

    Args:
        text: The raw review. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty review; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        is_nan = isinstance(text, float) and text != text
        text = "" if text is None or is_nan else str(text)

    # HTML tags must go first: once punctuation is stripped the angle
    # brackets are gone and the tag names would be left in the text.
    # Requiring a letter right after "<" (or "</") avoids swallowing things
    # like "<3 ... >" that are not tags.
    text = re.sub(r"</?[a-zA-Z][^<>]*>", " ", text)

    # Replace special characters and punctuation with spaces.
    text = re.sub(r"[^\w\s]", " ", text)

    # Remove stand-alone single letters (e.g. the "t" left over from "don't").
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    text = text.lower()

    # Collapse whitespace runs, then trim leading/trailing spaces.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 1/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

import pandas as pd

# Pull the raw review text out of the balanced frame as a plain list.
reviews = data_balanced['review'].tolist()

# Run every review through clean_text.
cleaned_reviews = list(map(clean_text, reviews))

# Store the cleaned text as a new column of the DataFrame.
data_balanced['clean_reviews'] = cleaned_reviews

data_balanced

keyboard_arrow_down Splitting the dataset into a 5% training set and a 95% test set
import pandas as pd

# 95% of the balanced rows form the test set; the remainder is the training set.
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Randomly draw test_size rows for the test set.
test_set = data_balanced.sample(n=test_size)

# Every row that was not drawn becomes part of the training set.
train_set = data_balanced.drop(index=test_set.index)

keyboard_arrow_down Sentiment analysis using LLM


keyboard_arrow_down Setting up GEMINI API
!pip install -q -U google-generativeai

# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display


from IPython.display import Markdown

def to_markdown(text):
    """Wrap *text* in a Markdown object rendered as a block quote.

    Bullet characters are rewritten as Markdown list markers, and every line
    (blank lines included) is prefixed with ``> ``.
    """
    bulleted = text.replace('•', ' *')
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

# Used to securely store your API key


from google.colab import userdata

# Using `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.


GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

for m in genai.list_models():
if 'generateContent' in m.supported_generation_methods:
print(m.name)

#we will be using the gemini pro model


model = genai.GenerativeModel('gemini-pro')

%%time
response = model.generate_content("how great is MS Dhoni?")

to_markdown(response.text)

keyboard_arrow_down Integrating the Gemini pro API to our sentiment analysis task
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 2/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

# Take 20 random rows from the test set and add an empty pred_label column
# that the model is asked to fill in.
test_set_sample = test_set.sample(n=20).assign(pred_label='')

test_set_sample

# Serialise the cleaned review and the empty prediction slot as JSON records.
json_data = test_set_sample[['clean_reviews', 'pred_label']].to_json(orient='records')

print(json_data)

prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)

# Send the prompt to the Gemini model and show the raw reply.
response = model.generate_content(prompt)

print(response.text)

import json

# The reply is expected inside a Markdown code fence. The fence may carry a
# language tag (```json); stripping backticks alone would leave that tag in
# front of the payload and break json.loads. The payload is a JSON array
# (orient='records'), so keep only the span from the first "[" to the last "]".
raw_text = response.text.strip()
array_start, array_end = raw_text.find("["), raw_text.rfind("]")
if array_start != -1 and array_end > array_start:
    json_data = raw_text[array_start:array_end + 1]
else:
    # No array found: fall back to plain backtick stripping and let
    # json.loads report the problem.
    json_data = raw_text.strip("`")

# Load the cleaned data and convert it to a DataFrame.
data = json.loads(json_data)
df_sample = pd.DataFrame(data)

df_sample

# Predictions are copied back by position, so the model must have returned
# exactly one record per sampled review.
if len(df_sample) != len(test_set_sample):
    raise ValueError(
        f"Model returned {len(df_sample)} records for "
        f"{len(test_set_sample)} reviews"
    )

# Overwrite pred_label in test_set_sample with the model's predictions.
# The model may return the labels as strings ("1"/"0"); cast to int so they
# are comparable with the integer ground-truth labels.
test_set_sample['pred_label'] = df_sample['pred_label'].astype(int).values
test_set_sample

# Confusion matrix of the predictions against the true labels.
from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

confusion_matrix(y_true, y_pred)

keyboard_arrow_down Batching GEMINI API calls


test_set.shape

# Work on 100 random test rows, with an empty pred_label column for the model
# to fill in.
test_set_total = test_set.sample(n=100).assign(pred_label='')

test_set_total

# Cut the sample into consecutive positional chunks of at most batch_size rows.
batch_size = 25
batches = [
    test_set_total.iloc[start:start + batch_size]
    for start in range(0, len(test_set_total), batch_size)
]

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 3/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

import time

def gemini_completion_function(batch, current_batch, total_batch):
    """Ask the Gemini model to label one batch of reviews.

    The batch's ``clean_reviews`` and ``pred_label`` columns are serialised
    as JSON records, embedded in the classification prompt, and sent to the
    module-level ``model``. The prompt is printed before the call.

    Args:
        batch: DataFrame with ``clean_reviews`` and ``pred_label`` columns.
        current_batch: Zero-based position of this batch; shown one-based in
            the progress message.
        total_batch: Total number of batches, used in the progress message.

    Returns:
        The object returned by ``model.generate_content``.
    """
    print(f"Now processing batch#: {current_batch+1} of {total_batch}")

    payload_columns = ['clean_reviews', 'pred_label']
    json_data = batch[payload_columns].to_json(orient='records')

    prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three backticks below.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.
Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

```
{json_data}
```
"""

    print(prompt)
    response = model.generate_content(prompt)
    # Pause after every request (presumably to stay under the API rate
    # limit; confirm against the quota in use).
    time.sleep(5)

    return response

batch_count = len(batches)
responses = []

# Send the batches one after another, keeping the replies in batch order.
for i, current in enumerate(batches):
    reply = gemini_completion_function(current, i, batch_count)
    responses.append(reply)

import json

frames = []  # one DataFrame per batch response, in batch order

for response in responses:
    # The reply is expected inside a Markdown code fence, which may carry a
    # language tag (```json). Stripping backticks alone would leave the tag
    # in front of the payload, so keep only the outermost JSON array.
    raw_text = response.text.strip()
    array_start, array_end = raw_text.find("["), raw_text.rfind("]")
    if array_start != -1 and array_end > array_start:
        json_data = raw_text[array_start:array_end + 1]
    else:
        # No array found: fall back to plain backtick stripping and let
        # json.loads report the problem.
        json_data = raw_text.strip("`")

    # Load the cleaned data and convert it to a DataFrame.
    data = json.loads(json_data)
    df_temp = pd.DataFrame(data)
    frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0; concatenating once at the end
# also avoids re-copying the accumulated frame on every iteration.
df_total = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(df_total)  # Display the final DataFrame

# Overwrite pred_label in test_set_total with the predictions collected in
# df_total. Rows are matched by position, so both frames must have the same
# number of rows.
if len(df_total) != len(test_set_total):
    raise ValueError(
        f"Model returned {len(df_total)} records for "
        f"{len(test_set_total)} reviews"
    )

# The model may return the labels as strings ("1"/"0"); cast to int so they
# are comparable with the integer ground-truth labels.
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total

# Confusion matrix and accuracy of the predictions.
from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 4/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 5/5

You might also like