ML101 Graded Assignment 2.ipynb - Colab
ML101 Graded Assignment 2.ipynb - Colab
IITMCS_240681
# Bar chart of how many tweets fall into each sentiment class,
# ordered from most to least frequent.
plt.figure(figsize=(6, 4))
sentiment_order = tweets["Sentiment"].value_counts().index
sns.countplot(data=tweets, x="Sentiment", order=sentiment_order)
plt.title("Tweet Sentiment Distribution")
plt.xticks(rotation=45)
plt.show()
Sentiment
Negative 1041
Positive 947
Neutral 619
Extremely Positive 599
Extremely Negative 592
Name: count, dtype: int64
# 1.2 Preprocessing
# Features: raw tweet text (coerced to str); target: sentiment label.
X = tweets['OriginalTweet'].astype(str)
y = tweets['Sentiment']
# Binarize test labels into one indicator column per class, as required
# for one-vs-rest ROC / precision-recall curves.
# NOTE(review): `y_test` is created by a train/test split in a cell not
# visible here — confirm that cell runs before this one.
classes = sorted(y.unique())
y_test_bin = label_binarize(y_test, classes=classes)
# One-vs-rest ROC curve per sentiment class.
plt.figure(figsize=(8, 6))
# Iterate through each class for ROC Curve
for i, cls in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"ROC curve of class {cls} (area = {roc_auc:0.2f})")
# Fix: the original opened the next (PR) figure without ever labeling,
# legending, or showing this one. Finalize it the same way the PR plot is.
plt.plot([0, 1], [0, 1], "k--", label="Chance")  # diagonal = random classifier
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend(loc="lower right")
plt.show()
# One-vs-rest precision-recall curve per sentiment class.
plt.figure(figsize=(8, 6))
for idx, cls in enumerate(classes):
    truth = y_test_bin[:, idx]
    scores = y_score[:, idx]
    precision, recall, _ = precision_recall_curve(truth, scores)
    average_precision = average_precision_score(truth, scores)
    plt.plot(recall, precision, label=f"PR curve of class {cls} (AP = {average_precision:0.2f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curves")
plt.legend(loc="lower left")
plt.show()
Part 2: Decision Tree for Sales Segmentation
# Step 1: Load the Dataset
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder (used to encode categoricals later)

# Read the Carseats-style company data and preview the first rows.
data_path = "/content/Company_Data.csv"
df = pd.read_csv(data_path)
df.head()
Sales CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US
0 9.50 138 73 11 276 120 Bad 42 17 Yes Yes
CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US High
0 138 73 11 276 120 0 42 17 1 1 1
1 111 48 16 260 83 1 65 10 1 1 1
2 113 35 10 269 80 2 59 12 1 1 1
def gini(y):
    """Return the Gini impurity of a sequence of class labels.

    Parameters
    ----------
    y : sequence of hashable labels.

    Returns
    -------
    float in [0, 1): 0.0 for an empty or pure sequence, approaching
    1 - 1/k for k equally frequent classes.
    """
    n = len(y)
    if n == 0:
        # Fix: the original fell through its loop and returned the initial
        # impurity of 1.0 for an empty node; an empty node has no impurity.
        return 0.0
    return 1.0 - sum((count / n) ** 2 for count in Counter(y).values())
def entropy(y):
    """Return the Shannon entropy (base 2) of a sequence of class labels.

    A tiny epsilon (1e-9) is added inside the log to guard against
    log2(0); an empty sequence yields 0.0.
    """
    total = len(y)
    acc = 0.0
    for count in Counter(y).values():
        p = count / total
        acc -= p * np.log2(p + 1e-9)
    return acc
class TreeNode:
    # One node of a hand-rolled decision tree, grown to at most `max_depth`.
    def __init__(self, depth=0, max_depth=3):
        self.depth = depth          # depth of this node within the tree
        self.max_depth = max_depth  # stop splitting once depth reaches this
        self.left = None            # left child (assigned during fitting)
        self.right = None           # right child (assigned during fitting)
        self.col = None             # feature column index chosen for the split
        self.thresh = None          # threshold value used for the split
        self.pred = None            # predicted class when this node is a leaf

# NOTE(review): the three lines below are orphaned by the notebook export —
# `col`, `thresh`, `split`, and `X` are not defined at this point, and these
# statements clearly belong inside a fit/best-split method whose header and
# surrounding body are missing from this extraction. Recover them from the
# original notebook before running this cell.
self.col = col
self.thresh = thresh
left_idx, right_idx = split(X[:, col], thresh)
# 5: Interpretation
# Bind the write-up to a name first, then emit it.
interpretation_text = """
Interpretation:
The top three predictors for high sales in this dataset are:
1. ShelveLoc — the placement of the product in the store has the strongest influence on sales.
2. Price — competitive pricing significantly affects whether sales are high.
3. Advertising — marketing investment is a key driver of customer interest and conversions.
This implies that strategic positioning and advertising, alongside competitive pricing, are vital
"""
print(interpretation_text)
Interpretation:
The top three predictors for high sales in this dataset are:
1. ShelveLoc — the placement of the product in the store has the strongest influence on sales.
2. Price — competitive pricing significantly affects whether sales are high.
3. Advertising — marketing investment is a key driver of customer interest and conversions.
This implies that strategic positioning and advertising, alongside competitive pricing, are vital