
ML101 Graded Assignment 2.ipynb - Colab

The document presents a sentiment classification analysis of COVID-19 tweets using SVM, including exploratory data analysis, preprocessing, baseline model evaluation, and hyperparameter tuning. It also discusses a decision tree model for sales segmentation, detailing dataset preparation, model implementation, and feature importance interpretation. Key findings indicate that product placement, pricing, and advertising significantly influence sales outcomes.


setty bhavana

IITMCS_240681

Part 1: Sentiment Classification of COVID-19 Tweets


# 1.1 EDA
import matplotlib.pyplot as plt
import seaborn as sns

# `tweets` is assumed to have been loaded into a DataFrame in an earlier cell.
tweets.columns = [col.strip() for col in tweets.columns]
print(tweets['Sentiment'].value_counts())

plt.figure(figsize=(6, 4))
sns.countplot(data=tweets, x="Sentiment", order=tweets['Sentiment'].value_counts().index)
plt.title("Tweet Sentiment Distribution")
plt.xticks(rotation=45)
plt.show()

Sentiment
Negative 1041
Positive 947
Neutral 619
Extremely Positive 599
Extremely Negative 592
Name: count, dtype: int64

# 1.2 Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = tweets['OriginalTweet'].astype(str)
y = tweets['Sentiment']

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf = vectorizer.fit_transform(X)

# 70/15/15 stratified split into train / validation / test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_tfidf, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# 1.3 Baseline SVM
from sklearn.svm import SVC
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

baseline = SVC(kernel="linear", probability=True, random_state=42)
baseline.fit(X_train, y_train)
y_pred = baseline.predict(X_test)
print("Baseline Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
ConfusionMatrixDisplay.from_estimator(baseline, X_test, y_test, cmap="Blues")
plt.show()

Baseline Classification Report:
                    precision    recall  f1-score   support

Extremely Negative       0.54      0.36      0.43        89
Extremely Positive       0.63      0.32      0.43        90
          Negative       0.37      0.54      0.44       156
           Neutral       0.57      0.41      0.47        93
          Positive       0.34      0.41      0.37       142

          accuracy                           0.42       570
         macro avg       0.49      0.41      0.43       570
      weighted avg       0.46      0.42      0.43       570

# 1.4 Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

param_grid = {
    "kernel": ["linear", "rbf"],
    "C": [0.1, 1, 10],
    "gamma": ["scale", "auto"]
}
grid = GridSearchCV(SVC(probability=True, random_state=42), param_grid, cv=3, scoring="f1_macro")
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)
best_svm = grid.best_estimator_
y_pred_best = best_svm.predict(X_test)
print("Tuned Classification Report:")
print(classification_report(y_test, y_pred_best, zero_division=0))

Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}

Tuned Classification Report:
                    precision    recall  f1-score   support

Extremely Negative       0.53      0.54      0.53        89
Extremely Positive       0.49      0.43      0.46        90
          Negative       0.41      0.49      0.45       156
           Neutral       0.47      0.38      0.42        93
          Positive       0.37      0.36      0.36       142

          accuracy                           0.44       570
         macro avg       0.45      0.44      0.44       570
      weighted avg       0.44      0.44      0.44       570
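A quick side-by-side check of baseline vs. tuned macro-F1 (a small sketch, not part of the original notebook; it assumes the cells above have already been run):

from sklearn.metrics import f1_score

# Macro-averaged F1 on the held-out test split for both models
print("Baseline macro-F1:", round(f1_score(y_test, y_pred, average="macro"), 3))
print("Tuned macro-F1:", round(f1_score(y_test, y_pred_best, average="macro"), 3))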

# 1.5 ROC and Precision-Recall Curves
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

classes = sorted(y.unique())
y_test_bin = label_binarize(y_test, classes=classes)

# Wrap the estimator in OneVsRestClassifier for multi-class ROC
classifier = OneVsRestClassifier(best_svm)
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

plt.figure(figsize=(8, 6))
# Plot a ROC curve per class
for i, cls in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"ROC curve of class {cls} (area = {roc_auc:0.2f})")

plt.plot([0, 1], [0, 1], "k--")


plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend(loc="lower right")
plt.show()

plt.figure(figsize=(8, 6))
# Plot a Precision-Recall curve per class
for i, cls in enumerate(classes):
    precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_score[:, i])
    average_precision = average_precision_score(y_test_bin[:, i], y_score[:, i])
    plt.plot(recall, precision, label=f"PR curve of class {cls} (AP = {average_precision:0.2f})")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curves")
plt.legend(loc="lower left")
plt.show()

Part 2: Decision Tree for Sales Segmentation

# Step 1: Load the Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("/content/Company_Data.csv")
df.head()

Sales CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US
0 9.50 138 73 11 276 120 Bad 42 17 Yes Yes

1 11.22 111 48 16 260 83 Good 65 10 Yes Yes

2 10.06 113 35 10 269 80 Medium 59 12 Yes Yes

3 7.40 117 100 4 466 97 Medium 55 14 Yes Yes

4 4.15 141 64 3 340 128 Bad 38 13 Yes No

# Prepare the Dataset

# Convert Sales into a binary 'High' sales label (1 = above the median)
df['High'] = df['Sales'].apply(lambda x: 1 if x > df['Sales'].median() else 0)
df.drop(columns=['Sales'], inplace=True)

# Encode categorical features
cat_cols = ['ShelveLoc', 'Urban', 'US']
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Define features and target
X = df.drop(columns='High')
y = df['High']

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)

df.head()

CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US High
0 138 73 11 276 120 0 42 17 1 1 1

1 111 48 16 260 83 1 65 10 1 1 1

2 113 35 10 269 80 2 59 12 1 1 1

3 117 100 4 466 97 2 55 14 1 1 0

4 141 64 3 340 128 0 38 13 1 0 0

# Step 2: Decision Tree Components from Scratch
from collections import Counter
import numpy as np

def gini(y):
    counts = Counter(y)
    impurity = 1.0
    for lbl in counts:
        prob_of_lbl = counts[lbl] / len(y)
        impurity -= prob_of_lbl**2
    return impurity

def entropy(y):
    counts = Counter(y)
    impurity = 0.0
    for lbl in counts:
        prob_of_lbl = counts[lbl] / len(y)
        impurity -= prob_of_lbl * np.log2(prob_of_lbl + 1e-9)
    return impurity
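A quick sanity check of the two impurity functions on a toy label vector (a sketch, not part of the original notebook):

# A perfectly mixed binary vector should have maximal impurity
toy_labels = np.array([0, 0, 1, 1])
print(gini(toy_labels))     # 0.5
print(entropy(toy_labels))  # ~1.0 bit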

def split(X_col, threshold):
    left_idx = np.where(X_col <= threshold)[0]
    right_idx = np.where(X_col > threshold)[0]
    return left_idx, right_idx

def best_split(X, y, impurity_fn=gini):
    best_gain = -1
    best_col, best_thresh = None, None
    base_impurity = impurity_fn(y)

    for col in range(X.shape[1]):
        thresholds = np.unique(X[:, col])
        for t in thresholds:
            left_idx, right_idx = split(X[:, col], t)
            if len(left_idx) == 0 or len(right_idx) == 0:
                continue
            y_left, y_right = y[left_idx], y[right_idx]
            gain = base_impurity - (
                len(y_left)/len(y) * impurity_fn(y_left)
                + len(y_right)/len(y) * impurity_fn(y_right)
            )
            if gain > best_gain:
                best_gain = gain
                best_col = col
                best_thresh = t
    return best_col, best_thresh
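Likewise, best_split can be exercised on a tiny hypothetical two-feature matrix to confirm it picks the separating feature (again a sketch, not from the original notebook):

# Only column 1 separates the two classes here
X_toy = np.array([[5, 10], [3, 10], [5, 20], [3, 20]])
y_toy = np.array([0, 0, 1, 1])
print(best_split(X_toy, y_toy))  # expected: column index 1, threshold 10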

class TreeNode:
    def __init__(self, depth=0, max_depth=3):
        self.depth = depth
        self.max_depth = max_depth
        self.left = None
        self.right = None
        self.col = None
        self.thresh = None
        self.pred = None

    def fit(self, X, y):
        # Stop when the maximum depth is reached or the node is pure
        if self.depth == self.max_depth or len(set(y)) == 1:
            self.pred = Counter(y).most_common(1)[0][0]
            return

        col, thresh = best_split(X, y)
        if col is None:
            self.pred = Counter(y).most_common(1)[0][0]
            return

        self.col = col
        self.thresh = thresh
        left_idx, right_idx = split(X[:, col], thresh)

        self.left = TreeNode(depth=self.depth+1, max_depth=self.max_depth)
        self.left.fit(X[left_idx], y[left_idx])

        self.right = TreeNode(depth=self.depth+1, max_depth=self.max_depth)
        self.right.fit(X[right_idx], y[right_idx])

    def predict_one(self, x):
        # A leaf stores a class prediction; otherwise descend by the split rule
        if self.pred is not None:
            return self.pred
        if x[self.col] <= self.thresh:
            return self.left.predict_one(x)
        else:
            return self.right.predict_one(x)

    def predict(self, X):
        return np.array([self.predict_one(x) for x in X])
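A minimal usage sketch for the scratch tree (not part of the original notebook; it assumes the train/test split prepared in Step 1 above):

from sklearn.metrics import accuracy_score

# Fit the hand-rolled tree on the training split and score it on the test split
scratch_tree = TreeNode(max_depth=3)
scratch_tree.fit(X_train, y_train)
y_pred_scratch = scratch_tree.predict(X_test)
print("Scratch tree test accuracy:", round(accuracy_score(y_test, y_pred_scratch), 3))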

# Step 4: Interpretation with Feature Importance
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X_train, y_train)

importances = pd.Series(clf.feature_importances_, index=df.columns.drop("High"))
top_3 = importances.sort_values(ascending=False).head(3)

print("\nTop 3 Predictors for High Sales:")
print(top_3)

Top 3 Predictors for High Sales:
Price          0.626186
ShelveLoc      0.292196
Advertising    0.063104
dtype: float64
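A horizontal bar chart of all the importances (a quick sketch, not part of the original output) makes the full ranking easy to inspect:

# Plot every feature importance from the fitted sklearn tree
importances.sort_values().plot(kind="barh", figsize=(6, 4), title="Decision Tree Feature Importances")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()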

# Step 5: Interpretation

print("""
Interpretation:
The top three predictors for high sales in this dataset are:
1. Price: competitive pricing has the strongest influence on whether sales are high.
2. ShelveLoc: in-store shelf placement of the product is the second most important driver.
3. Advertising: marketing investment is a key driver of customer interest and conversions.
This implies that competitive pricing, strategic in-store positioning, and advertising are vital to achieving high sales.
""")

Interpretation:
The top three predictors for high sales in this dataset are:
1. Price: competitive pricing has the strongest influence on whether sales are high.
2. ShelveLoc: in-store shelf placement of the product is the second most important driver.
3. Advertising: marketing investment is a key driver of customer interest and conversions.
This implies that competitive pricing, strategic in-store positioning, and advertising are vital to achieving high sales.
