9,12,19,68 - ML Assignment-2
9,12,19,68 - ML Assignment-2
1 import pandas as pd
2 import numpy as np
3 from scipy.sparse import csr_matrix
4 from sklearn.neighbors import NearestNeighbors
5 from sklearn.model_selection import train_test_split
6 from sklearn.metrics import roc_curve, auc
7 import matplotlib.pyplot as plt
8 from tqdm import tqdm
9
# Load the ratings file and give its four columns fixed names.
data = pd.read_csv('/content/ratings_Beauty.csv')
data.columns = ['user_id', 'item_id', 'rating', 'timestamp']
# Rows without a user or an item cannot be placed in the user-item matrix.
data.dropna(subset=['user_id', 'item_id'], inplace=True)

# Keep only users with at least `min_ratings` ratings.
min_ratings = 5
user_counts = data['user_id'].value_counts()
valid_users = user_counts[user_counts >= min_ratings].index
data = data[data['user_id'].isin(valid_users)]

# Split data into train and test sets (fixed seed for reproducibility).
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Create the training user-item matrix: one row per user, one column per item.
# `fillna` must actually be *called*: without the call `train_matrix` would be
# a bound method and `train_matrix.values` below would raise AttributeError.
# Missing ratings become 0 so they are dropped from the sparse representation.
# NOTE: DataFrame.pivot raises ValueError if a (user_id, item_id) pair occurs
# more than once in the training split.
train_matrix = train_data.pivot(
    index='user_id', columns='item_id', values='rating'
).fillna(0)
train_sparse = csr_matrix(train_matrix.values)

# Fit a brute-force cosine-distance nearest-neighbour index over the user rows.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=100)
model_knn.fit(train_sparse)
31
32 def get_user_predictions(user_id, train_matrix, test_data, model_knn):
33 """
34 Get predictions for a user on test items with improved error handling
35 """
36 try:
37 user_index = train_matrix.index.get_loc(user_id)
38
39 # Get user's test items
40 user_test_items = test_data[test_data['user_id'] == user_id]['item_id'].unique()
41
42 if len(user_test_items) == 0:
43 return None, None
44
45 # Find similar users
1 of 5 10-11-2024, 09:34 pm
9,12,19,68_ML Assignment-2.ipynb - Colab https://colab.research.google.com/drive/10e_X-Zip69NurIFbktAUKWp...
2 of 5 10-11-2024, 09:34 pm
9,12,19,68_ML Assignment-2.ipynb - Colab https://colab.research.google.com/drive/10e_X-Zip69NurIFbktAUKWp...
96
def plot_roc_curve(y_true_all, y_score_all):
    """
    Draw the ROC curve for the given labels and scores and return its AUC.

    Args:
        y_true_all: Ground-truth labels, passed straight to ``roc_curve``.
        y_score_all: Prediction scores, passed straight to ``roc_curve``.

    Returns:
        The area under the ROC curve as computed by ``auc``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true_all, y_score_all)
    area = auc(false_pos_rate, true_pos_rate)

    _fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label=f'ROC curve (AUC = {area:.2f})',
    )
    # Dashed diagonal as a visual reference line.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    ax.set_xlim([0.0, 1.0])
    # A little headroom above 1.0 so the curve's top edge stays visible.
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    ax.grid(True)
    plt.show()

    return area
118
# Report the size of the filtered dataset.
print("Data statistics:")
print(f"Total number of ratings: {len(data)}")
print(f"Number of unique users: {data['user_id'].nunique()}")
print(f"Number of unique items: {data['item_id'].nunique()}")

# Accumulators for labels and scores pooled across all evaluated users.
y_true_all, y_score_all = [], []
processed_users = 0

print("\nCollecting predictions for evaluation...")
unique_users = test_data['user_id'].unique()

# Only the first 1000 test users are evaluated.
for user_id in tqdm(unique_users[:1000], desc="Processing users"):
    y_true, y_scores = get_user_predictions(user_id, train_matrix, test_data, model_knn)
    # Skip users for whom nothing usable came back (None or an empty result).
    if y_true is None or len(y_true) == 0:
        continue
    y_true_all.extend(y_true)
    y_score_all.extend(y_scores)
    processed_users += 1

# Convert the pooled lists to numpy arrays for the metric computations.
y_true_all = np.array(y_true_all)
y_score_all = np.array(y_score_all)

print(f"\nSuccessfully processed {processed_users} users")
print(f"Total predictions: {len(y_true_all)}")
146
3 of 5 10-11-2024, 09:34 pm
9,12,19,68_ML Assignment-2.ipynb - Colab https://colab.research.google.com/drive/10e_X-Zip69NurIFbktAUKWp...
146
# Plot the ROC curve and print summary metrics, unless nothing was collected.
if len(y_true_all) == 0:
    print("No predictions were generated for evaluation.")
else:
    print("\nPlotting ROC curve...")
    roc_auc = plot_roc_curve(y_true_all, y_score_all)
    print(f"\nOverall AUC Score: {roc_auc:.3f}")

    # Additional aggregate metrics over the pooled predictions.
    positive_ratio = np.mean(y_true_all)
    mean_score = np.mean(y_score_all)
    print("\nEvaluation Summary:")
    print(f"Positive ratings ratio: {positive_ratio:.2%}")
    print(f"Average prediction score: {mean_score:.3f}")
Data statistics:
Total number of ratings: 28552
Number of unique users: 3819
Number of unique items: 11523
4 of 5 10-11-2024, 09:34 pm
9,12,19,68_ML Assignment-2.ipynb - Colab https://colab.research.google.com/drive/10e_X-Zip69NurIFbktAUKWp...
Evaluation Summary:
Positive ratings ratio: 77.10%
Average prediction score: 0.051
5 of 5 10-11-2024, 09:34 pm