0% found this document useful (0 votes)
8 views

code

NOTHING

Uploaded by

hodcse
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
8 views

code

NOTHING

Uploaded by

hodcse
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 13

import pandas as pd

import re
from textblob import TextBlob
import matplotlib.pyplot as plt
# 2
# Load the raw survey responses from CSV into a DataFrame.
file_path = '/content/Nri_Textual_Survey_Data.csv'  # Replace with your file path
survey_data = pd.read_csv(file_path)
# 3
# ### 1. Text Preprocessing ###

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Update preprocessing function to include stop word removal


def preprocess_text_with_stopwords(text, stop_words=None):
    """Normalize one survey answer and drop stop words.

    Steps, in order: lowercase; remove URLs and HTML tags; remove non-ASCII
    characters, punctuation and digits; collapse whitespace; drop stop words.

    URLs and tags are removed *before* punctuation. Stripping punctuation
    first deletes the ``<``/``>`` delimiters, so the tag pattern could never
    match and tag names leaked into the text.

    Args:
        text: The raw cell value. Non-string values are converted with
            ``str``; ``None`` and NaN (missing answers) yield ``""`` instead
            of the literal token ``"nan"``.
        stop_words: Optional collection of lowercase words to drop.
            Defaults to scikit-learn's ``ENGLISH_STOP_WORDS``.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', ' ', text)   # Remove URLs
    text = re.sub(r'<.*?>', ' ', text)            # Remove HTML tags
    text = re.sub(r'[^\x00-\x7F]+', '', text)     # Remove non-ASCII characters
    text = re.sub(r'[^\w\s]', '', text)           # Remove punctuation
    text = re.sub(r'\d+', '', text)               # Remove digits

    # split() with no argument also collapses runs of whitespace and newlines.
    return " ".join(word for word in text.split() if word not in stop_words)

# Apply updated preprocessing


# Clean every column on a copy so the raw survey_data stays untouched.
processed_data = survey_data.copy()
for col in processed_data.columns:
    processed_data[col] = processed_data[col].apply(
        preprocess_text_with_stopwords
    )
# 4
# Save the cleaned responses to the Colab environment (current directory).
processed_data.to_csv('preprocessed_data.csv', index=False)

# Download the file (requires the google.colab package).
from google.colab import files
files.download('preprocessed_data.csv')
# 5
### 2. Sentiment Analysis ###
def analyze_sentiment(text):
    """Map a text to 'happy', 'neutral', or 'unhappy' using TextBlob polarity.

    Positive polarity gives 'happy', a polarity of exactly zero gives
    'neutral', and anything else gives 'unhappy'. If scoring raises any
    exception the text is reported as 'neutral'.
    """
    try:
        # Polarity ranges from -1 (negative) to 1 (positive).
        score = TextBlob(text).sentiment.polarity
        if score == 0:
            return 'neutral'
        return 'happy' if score > 0 else 'unhappy'
    except Exception:
        return 'neutral'

# Add sentiment columns for each facility


# Add a '<column>_sentiment' label column for each facility's text column.
sentiment_data = processed_data.copy()
# Snapshot the column list first: the loop body adds new columns.
for col in list(sentiment_data.columns):
    sentiment_data[col + '_sentiment'] = sentiment_data[col].apply(
        analyze_sentiment
    )
# 6
### 3. Sentiment Analysis Summary ###
# Count sentiments for each facility
sentiment_labels = ['happy', 'neutral', 'unhappy']
facility_sentiment_cols = [
    col for col in sentiment_data.columns if col.endswith('_sentiment')
]
sentiment_summary = (
    sentiment_data[facility_sentiment_cols]
    .apply(pd.Series.value_counts)
    # Reindex by label instead of overwriting the column names afterwards:
    # a label that never occurs still gets a (zero) row, and counts can
    # never be attached to the wrong label.
    .reindex(sentiment_labels)
    .fillna(0)
    .astype(int)
    .T  # rows = facilities, columns = happy / neutral / unhappy
)

# Overall sentiment counts
overall_sentiment_counts = sentiment_summary.sum()
# 7
### Visualization ###
# Overall sentiment distribution: bar chart on the left, pie chart on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
bar_ax, pie_ax = axes
sentiment_colors = ['green', 'orange', 'red']

overall_sentiment_counts.plot(kind='bar', color=sentiment_colors, ax=bar_ax)
bar_ax.set_title('Overall Sentiment Distribution (Bar Chart)')
bar_ax.set_xlabel('Sentiment')
bar_ax.set_ylabel('Count')

overall_sentiment_counts.plot(
    kind='pie', autopct='%1.1f%%', colors=sentiment_colors, ax=pie_ax
)
pie_ax.set_title('Overall Sentiment Distribution (Pie Chart)')
pie_ax.set_ylabel('')  # drop the label pandas puts on the pie's y-axis

plt.tight_layout()
plt.show()
# 8
# Facility-wise sentiment distribution: one stacked bar per facility.
stacked_ax = sentiment_summary.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 8),
    title='Facility-Wise Sentiment Distribution',
    color=['green', 'orange', 'red'],
)
stacked_ax.set_xlabel('Facilities')
stacked_ax.set_ylabel('Count')
stacked_ax.legend(title='Sentiment')
plt.show()
# 9
# Individual facility sentiment pie charts, laid out three per row.
n_facilities = len(sentiment_summary)
rows = -(-n_facilities // 3)  # ceiling division
fig, axes = plt.subplots(rows, 3, figsize=(18, 5 * rows))
axes = axes.flatten()

for ax, facility in zip(axes, sentiment_summary.index):
    sentiment_summary.loc[facility].plot(
        kind='pie',
        ax=ax,
        autopct='%1.1f%%',
        colors=['green', 'orange', 'red'],
        title=f'{facility} Sentiment Distribution',
    )
    ax.set_ylabel('')

# Hide the grid cells left over in the last row.
for ax in axes[n_facilities:]:
    ax.axis('off')

plt.tight_layout()
plt.show()
# 10
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd

# Assuming 'processed_data' contains the preprocessed text data

# Dynamically set the column name
column_name = 'Internship'  # Replace this with your desired column

# NOTE(review): both vectorizers are fitted on the whole column, and the
# train/test split further down is applied to the resulting matrices, so the
# vocabulary and IDF weights have seen the test rows.

# Bag-of-Words Extraction
bow_vectorizer = CountVectorizer(max_features=1000)  # Limit vocabulary size
bow_features = bow_vectorizer.fit_transform(processed_data[column_name])
# Sum over documents; .A1 flattens the 1 x n_terms result to a 1-D array.
bow_term_frequencies = bow_features.sum(axis=0).A1

# Create BoW DataFrame
bow_term_df = pd.DataFrame({
    'Term': bow_vectorizer.get_feature_names_out(),
    'Frequency': bow_term_frequencies,
}).sort_values(by='Frequency', ascending=False)

# TF-IDF Extraction
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limit vocabulary size
tfidf_features = tfidf_vectorizer.fit_transform(processed_data[column_name])
tfidf_term_scores = tfidf_features.sum(axis=0).A1

# Create TF-IDF DataFrame
tfidf_term_df = pd.DataFrame({
    'Term': tfidf_vectorizer.get_feature_names_out(),
    'TF-IDF Score': tfidf_term_scores,
}).sort_values(by='TF-IDF Score', ascending=False)

# Merge BoW and TF-IDF for comparison (top 20 terms of each ranking).
# The outer join keeps terms that appear in only one top-20 list; their
# missing value is filled with 0.
comparison_df = pd.merge(
    bow_term_df.rename(columns={"Frequency": "Frequency_BoW"}).head(20),
    tfidf_term_df.rename(columns={"TF-IDF Score": "Frequency_TF-IDF"}).head(20),
    on="Term",
    how="outer",
).fillna(0)

# Sort by Bag-of-Words Frequency for consistency
comparison_df = comparison_df.sort_values(by="Frequency_BoW", ascending=False)

# Plot the comparison graph
plt.figure(figsize=(12, 8))

# Bar width for side-by-side bars
bar_width = 0.35
index = range(len(comparison_df))

# Bag-of-Words Bar
plt.bar(
    index,
    comparison_df["Frequency_BoW"],
    bar_width,
    label="Bag-of-Words",
    color="skyblue",
)

# TF-IDF Bar (shifted right by one bar width)
plt.bar(
    [i + bar_width for i in index],
    comparison_df["Frequency_TF-IDF"],
    bar_width,
    label="TF-IDF",
    color="orange",
)

# Add labels and title
plt.xlabel("Terms")
plt.ylabel("Frequency/TF-IDF Score")
plt.title(
    f"Comparison of Bag-of-Words and TF-IDF Representations for '{column_name}'"
)
plt.xticks(
    [i + bar_width / 2 for i in index],
    comparison_df["Term"],
    rotation=45,
    ha="right",
)
plt.legend()
plt.tight_layout()
plt.show()

# 11
from sklearn.model_selection import train_test_split

# Target variable based on the dynamic column name
target_column_name = column_name + '_sentiment'  # Append '_sentiment' dynamically
y = sentiment_data[target_column_name]  # Use the dynamic sentiment column name

# Split the BoW matrix, the TF-IDF matrix and the target in a single call.
# Every array then gets the same row partition by construction, rather than
# by two separate calls that happen to share a random_state.
(
    X_train_bow, X_test_bow,
    X_train_tfidf, X_test_tfidf,
    y_train, y_test,
) = train_test_split(
    bow_features, tfidf_features, y, test_size=0.2, random_state=42
)

# 12

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Build the three classifiers, each with a fixed seed.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
random_forest = RandomForestClassifier(random_state=42)
svm = SVC(kernel='linear', random_state=42)

# Fit each one, in the same order, on the Bag-of-Words training split.
for bow_model in (log_reg, random_forest, svm):
    bow_model.fit(X_train_bow, y_train)
# 13
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.base import clone
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

model_names = ['Logistic Regression', 'Random Forest', 'SVM']
bow_models = [log_reg, random_forest, svm]

# log_reg / random_forest / svm were fitted on BoW counts only. Scoring them
# on TF-IDF inputs does not evaluate a TF-IDF model, so fit fresh copies with
# identical hyper-parameters on the TF-IDF training split.
tfidf_models = [clone(model).fit(X_train_tfidf, y_train) for model in bow_models]

# Predict once per model and representation, then reuse the predictions for
# every metric and plot below.
bow_predictions = [model.predict(X_test_bow) for model in bow_models]
tfidf_predictions = [model.predict(X_test_tfidf) for model in tfidf_models]

# ---------------------------------------------------------------- accuracy
bow_accuracies = [accuracy_score(y_test, pred) for pred in bow_predictions]
tfidf_accuracies = [accuracy_score(y_test, pred) for pred in tfidf_predictions]

# Plotting model accuracy comparison
x = range(len(model_names))
bar_width = 0.35

plt.figure(figsize=(10, 6))
plt.bar(x, bow_accuracies, width=bar_width, label='BoW', color='skyblue')
plt.bar(
    [i + bar_width for i in x],
    tfidf_accuracies,
    width=bar_width,
    label='TF-IDF',
    color='orange',
)

# Add labels and title
plt.xticks([i + bar_width / 2 for i in x], model_names)
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison (BoW vs. TF-IDF)')
plt.legend()
plt.tight_layout()
plt.show()

# ------------------------------- precision / recall / F1 (Logistic Regression)
# zero_division=0 keeps the default numeric result (0) for a class that is
# never predicted, without emitting a warning for it.
bow_scores = [
    precision_score(y_test, bow_predictions[0], average='weighted', zero_division=0),
    recall_score(y_test, bow_predictions[0], average='weighted', zero_division=0),
    f1_score(y_test, bow_predictions[0], average='weighted', zero_division=0),
]

tfidf_scores = [
    precision_score(y_test, tfidf_predictions[0], average='weighted', zero_division=0),
    recall_score(y_test, tfidf_predictions[0], average='weighted', zero_division=0),
    f1_score(y_test, tfidf_predictions[0], average='weighted', zero_division=0),
]

# Plot grouped bar chart
metrics = ['Precision', 'Recall', 'F1-Score']
x = np.arange(len(metrics))
bar_width = 0.35

plt.figure(figsize=(10, 6))
plt.bar(x, bow_scores, width=bar_width, label='BoW', color='skyblue')
plt.bar(x + bar_width, tfidf_scores, width=bar_width, label='TF-IDF', color='orange')

# Add labels and title
plt.xticks(x + bar_width / 2, metrics)
plt.ylabel('Score')
plt.title('Model Performance Metrics (BoW vs. TF-IDF)')
plt.legend()
plt.tight_layout()
plt.show()

# ------------------------------------------------------- confusion matrices
# Predictions for BoW features
y_pred_log_reg_bow, y_pred_rf_bow, y_pred_svm_bow = bow_predictions

# Predictions for TF-IDF features
y_pred_log_reg_tfidf, y_pred_rf_tfidf, y_pred_svm_tfidf = tfidf_predictions

# Pass the label order explicitly so every matrix is 3x3 and its rows and
# columns match the tick labels, even when a class is absent from the test
# split.
class_labels = ['happy', 'neutral', 'unhappy']

cm_log_reg_bow = confusion_matrix(y_test, y_pred_log_reg_bow, labels=class_labels)
cm_rf_bow = confusion_matrix(y_test, y_pred_rf_bow, labels=class_labels)
cm_svm_bow = confusion_matrix(y_test, y_pred_svm_bow, labels=class_labels)

cm_log_reg_tfidf = confusion_matrix(y_test, y_pred_log_reg_tfidf, labels=class_labels)
cm_rf_tfidf = confusion_matrix(y_test, y_pred_rf_tfidf, labels=class_labels)
cm_svm_tfidf = confusion_matrix(y_test, y_pred_svm_tfidf, labels=class_labels)

# Plotting all confusion matrices
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Titles for the matrices
titles = [
    "Logistic Regression (BoW)", "Random Forest (BoW)", "SVM (BoW)",
    "Logistic Regression (TF-IDF)", "Random Forest (TF-IDF)", "SVM (TF-IDF)",
]

# All confusion matrices
conf_matrices = [
    cm_log_reg_bow, cm_rf_bow, cm_svm_bow,
    cm_log_reg_tfidf, cm_rf_tfidf, cm_svm_tfidf,
]

# Plotting each heatmap
for i, ax in enumerate(axes.flat):
    sns.heatmap(
        conf_matrices[i], annot=True, fmt='d', cmap='Blues',
        xticklabels=['Happy', 'Neutral', 'Unhappy'],
        yticklabels=['Happy', 'Neutral', 'Unhappy'], ax=ax,
    )
    ax.set_title(titles[i])
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

plt.tight_layout()
plt.show()

from sklearn.metrics.pairwise import cosine_similarity


# 16

# Step 1: Use the already defined `processed_data` from your script.

# Step 2: Combine text from all columns to build a unified vocabulary
from sklearn.metrics.pairwise import cosine_similarity

combined_text_all = processed_data.apply(
    lambda row: ' '.join(row.astype(str)), axis=1
)

# Fit the TF-IDF vectorizer on the combined text
tfidf_vectorizer_all = TfidfVectorizer()
tfidf_vectorizer_all.fit(combined_text_all)

# Step 3: Transform each column using the unified vocabulary
tfidf_vectors_all = {
    col: tfidf_vectorizer_all.transform(processed_data[col].astype(str))
    for col in processed_data.columns
}

# Step 4: Compute Pairwise Cosine Similarity for All Labels
n_labels = len(processed_data.columns)
# The diagonal (self-similarity) is fixed at 1.0.
similarity_matrix_all = np.eye(n_labels)

# The score is symmetric, so compute each unordered pair once and mirror it.
for i, col1 in enumerate(processed_data.columns):
    for j in range(i + 1, n_labels):
        col2 = processed_data.columns[j]
        # Mean over every row-of-col1 x row-of-col2 similarity.
        pair_score = cosine_similarity(
            tfidf_vectors_all[col1], tfidf_vectors_all[col2]
        ).mean()
        similarity_matrix_all[i, j] = pair_score
        similarity_matrix_all[j, i] = pair_score

# Step 5: Visualize the Similarity Matrix for All Labels
plt.figure(figsize=(12, 10))
sns.heatmap(
    similarity_matrix_all,
    xticklabels=processed_data.columns,
    yticklabels=processed_data.columns,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    annot_kws={"size": 10},  # Customize annotation font size
    cbar_kws={"shrink": 0.8, "label": "Similarity Score"},  # Color bar customization
)
plt.title("Text Similarity Between All Labels (Cosine Similarity)", fontsize=16)
plt.xlabel("Labels", fontsize=12)
plt.ylabel("Labels", fontsize=12)
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()

# Step 6: Identify the Most and Least Similar Pairs Across All Labels
similarity_df_all = pd.DataFrame(
    similarity_matrix_all,
    index=processed_data.columns,
    columns=processed_data.columns,
)

# Melt the matrix into one row per (Label 1, Label 2) pair
similarity_melted_all = similarity_df_all.reset_index().melt(
    id_vars='index',
    var_name='Label 2',
    value_name='Similarity',
).rename(columns={'index': 'Label 1'})

# Remove self-similarity (diagonal values)
similarity_melted_all = similarity_melted_all[
    similarity_melted_all['Label 1'] != similarity_melted_all['Label 2']
]

# Sort for most and least similar pairs
most_similar_all = similarity_melted_all.sort_values(
    by='Similarity', ascending=False
).head(1)
least_similar_all = similarity_melted_all.sort_values(
    by='Similarity', ascending=True
).head(1)

# Output results
print("Most Similar Pair:")
print(most_similar_all)

print("\nLeast Similar Pair:")
print(least_similar_all)
# 17
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
)
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from tqdm import tqdm
import matplotlib.pyplot as plt

# Step 1: Load and Preprocess Dataset
file_path = '/content/Nri_Textual_Survey_Data.csv'  # Replace with your dataset path
data = pd.read_csv(file_path)

# Preprocess text
def preprocess_text_dl(text_dl):
    """Clean text: lowercase, strip punctuation, collapse whitespace.

    Args:
        text_dl: The raw value. Non-string values are converted with ``str``;
            ``None`` and NaN (missing answers) yield ``""`` rather than the
            literal token ``"nan"``.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # NaN is the only float that is not equal to itself.
    if text_dl is None or (isinstance(text_dl, float) and text_dl != text_dl):
        return ""
    text_dl = str(text_dl).lower()
    text_dl = re.sub(r'[^\w\s]', '', text_dl)  # Remove punctuation
    return re.sub(r'\s+', ' ', text_dl).strip()  # Remove extra spaces

# Apply preprocessing to text column dynamically


# Dynamically use the first column as the text to classify.
text_column = data.columns[0]
data[text_column] = data[text_column].apply(preprocess_text_dl)

# Analyze sentiment dynamically


def analyze_sentiment(text_dl):
    """Classify sentiment using TextBlob polarity, returning an integer class id.

    Returns:
        0 (happy) for positive polarity, 1 (neutral) for a polarity of
        exactly zero, 2 (unhappy) for negative polarity. If scoring raises,
        the text is labelled 1 (neutral).

    Note:
        This shares its name with the string-label ``analyze_sentiment``
        defined earlier in this file and replaces it once this definition
        has run.
    """
    from textblob import TextBlob

    try:
        polarity = TextBlob(text_dl).sentiment.polarity
    except Exception:
        # Best-effort fallback. `except Exception` rather than a bare
        # `except:` so KeyboardInterrupt and SystemExit still propagate.
        return 1

    if polarity > 0:
        return 0  # Happy
    if polarity == 0:
        return 1  # Neutral
    return 2  # Unhappy

# Attach the polarity-derived class id to every row.
data['label'] = data[text_column].apply(analyze_sentiment)

# Step 2: Split Data (80 % train / 20 % test, fixed seed)
text_series = data[text_column]
label_series = data['label']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text_series,
    label_series,
    test_size=0.2,
    random_state=42,
)

# Step 3: Tokenize Using BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of strings (list, array, pandas Series, ...).
        labels: Integer class ids, parallel to ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors="pt")``.
            It must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors, whose leading dimension is
            squeezed away per item.
        max_len: Length every encoded text is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialize as lists so items are addressed purely by position.
        # This matches the former `.iloc` behaviour for pandas inputs and
        # also accepts plain lists and arrays.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} != {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops the leading dimension added by
            # return_tensors="pt"; the DataLoader re-adds a batch axis.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)

# Shuffle only the training batches; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Step 4: Define BERT Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# num_labels=3 matches the three class ids (0 happy, 1 neutral, 2 unhappy).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=3
)
model.to(device)

# Step 5: Train Model
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 1
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss with its outputs.
        outputs = model(
            input_ids, attention_mask=attention_mask, labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean loss per batch; max(..., 1) avoids dividing by zero when the
    # loader is empty.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / max(len(train_loader), 1)}")

# Step 6: Evaluate Model
model.eval()
all_preds, all_labels = [], []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate Accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix (fixed label order: 0 happy, 1 neutral, 2 unhappy)
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Happy", "Neutral", "Unhappy"],
)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Sentiment Analysis")
plt.show()

You might also like