0% found this document useful (0 votes)
13 views5 pages

9,12,19,68 - ML Assignment-2

Research Paper for Our recommendation system

Uploaded by

case.internshala
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
13 views5 pages

9,12,19,68 - ML Assignment-2

Research Paper for Our recommendation system

Uploaded by

case.internshala
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

9,12,19,68_ML Assignment-2.ipynb - Colab https://colab.research.google.com/drive/10e_X-Zip69NurIFbktAUKWp...

1 import pandas as pd
2 import numpy as np
3 from scipy.sparse import csr_matrix
4 from sklearn.neighbors import NearestNeighbors
5 from sklearn.model_selection import train_test_split
6 from sklearn.metrics import roc_curve, auc
7 import matplotlib.pyplot as plt
8 from tqdm import tqdm
9
# Load the ratings file and give its four columns fixed names.
data = pd.read_csv('/content/ratings_Beauty.csv')
data.columns = ['user_id', 'item_id', 'rating', 'timestamp']
data.dropna(subset=['user_id', 'item_id'], inplace=True)

# Keep only users who have at least `min_ratings` ratings.
min_ratings = 5
user_counts = data['user_id'].value_counts()
valid_users = user_counts[user_counts >= min_ratings].index
data = data[data['user_id'].isin(valid_users)]

# Hold out 20% of the ratings for evaluation; the seed keeps the split
# reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Build the user x item rating matrix from the training ratings.
# NOTE(review): the original line was cut off right after ".fillna", which
# would leave `train_matrix` bound to a method instead of a DataFrame. A fill
# value of 0 is assumed here (unrated = 0) - confirm against the notebook.
train_matrix = train_data.pivot(
    index='user_id', columns='item_id', values='rating'
).fillna(0)
train_sparse = csr_matrix(train_matrix.values)

# Fit a brute-force cosine nearest-neighbour index over the user rows.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=100)
model_knn.fit(train_sparse)
31
def get_user_predictions(user_id, train_matrix, test_data, model_knn):
    """
    Score one user's held-out items from the ratings of similar users.

    For every test item of ``user_id`` that also exists as a column of
    ``train_matrix``, the score is the similarity-weighted average of the
    neighbours' values in that column. The label is 1 when the held-out
    rating is >= 4, otherwise 0.

    Parameters
    ----------
    user_id
        Label of the user; looked up in ``train_matrix.index``.
    train_matrix : pandas.DataFrame
        Rating matrix with users as the index and items as the columns.
    test_data : pandas.DataFrame
        Held-out ratings with 'user_id', 'item_id' and 'rating' columns.
    model_knn
        Fitted neighbours model exposing ``kneighbors``.

    Returns
    -------
    tuple
        ``(y_true, y_scores)`` as equally long numpy arrays, or
        ``(None, None)`` when the user is not in ``train_matrix``, has no
        test rows, has no neighbour with positive similarity, or none of the
        test items is a column of ``train_matrix``.

    Notes
    -----
    Reads the module-level ``train_sparse`` matrix for the query vector.
    NOTE(review): every value in the item column is averaged, so if missing
    ratings were filled with 0 when ``train_matrix`` was built, neighbours
    who never rated the item pull the score towards 0 - confirm intended.
    """
    try:
        user_index = train_matrix.index.get_loc(user_id)
        user_vector = train_sparse[user_index].reshape(1, -1)
    except (KeyError, IndexError):
        # The user has no row in the training matrix, so there is nothing
        # to compare against.
        return None, None

    # Filter the test set once per user rather than once per item; when an
    # item occurs more than once for the user, the first rating is kept.
    user_test = test_data[test_data['user_id'] == user_id]
    user_test = user_test.drop_duplicates(subset='item_id')
    if user_test.empty:
        return None, None

    # Find similar users
    distances, indices = model_knn.kneighbors(
        user_vector,
        n_neighbors=min(100, train_sparse.shape[0]),
    )
    neighbor_rows = indices.flatten()

    # Turn distances into non-negative similarity weights (the epsilon shift
    # and the clamp at 0 are kept from the original scoring).
    similarities = np.maximum(1 - (distances.flatten() + 1e-6), 0)

    # Drop the query user by row position instead of assuming it is the first
    # neighbour: with tied distances the order is not guaranteed. Also drop
    # neighbours whose weight is zero.
    usable = (neighbor_rows != user_index) & (similarities > 0)
    if not usable.any():
        return None, None
    neighbor_rows = neighbor_rows[usable]
    weights = similarities[usable]

    y_true = []
    y_scores = []
    for item_id, actual_rating in zip(user_test['item_id'], user_test['rating']):
        if item_id not in train_matrix.columns:
            continue

        item_col = train_matrix.columns.get_loc(item_id)
        neighbor_ratings = train_matrix.iloc[neighbor_rows, item_col].to_numpy()
        pred_score = np.average(neighbor_ratings, weights=weights)

        # Append label and score together, only after the score exists, so
        # the two lists can never get out of step.
        y_true.append(1 if actual_rating >= 4 else 0)
        y_scores.append(pred_score)

    if not y_true:
        return None, None

    return np.array(y_true), np.array(y_scores)

2 of 5 10-11-2024, 09:34 pm
9,12,19,68_ML Assignment-2.ipynb - Colab https://colab.research.google.com/drive/10e_X-Zip69NurIFbktAUKWp...

96
def plot_roc_curve(y_true_all, y_score_all):
    """
    Draw the ROC curve for the given labels and scores and return its AUC.

    The curve is shown on a 10x8 figure together with a dashed diagonal
    reference line; the AUC (two decimals) appears in the legend.
    """
    fpr, tpr, _ = roc_curve(y_true_all, y_score_all)
    roc_auc = auc(fpr, tpr)

    _, ax = plt.subplots(figsize=(10, 8))

    # Model curve first, then the diagonal reference line.
    ax.plot(
        fpr,
        tpr,
        color='darkorange',
        lw=2,
        label=f'ROC curve (AUC = {roc_auc:.2f})',
    )
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # Axis ranges, labels and decorations.
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    ax.grid(True)

    plt.show()

    return roc_auc
118
# Summarise the filtered dataset
print("Data statistics:")
print(f"Total number of ratings: {len(data)}")
print(f"Number of unique users: {data['user_id'].nunique()}")
print(f"Number of unique items: {data['item_id'].nunique()}")

# Accumulators for labels and scores across all evaluated users
y_true_all = []
y_score_all = []
processed_users = 0

print("\nCollecting predictions for evaluation...")
unique_users = test_data['user_id'].unique()

# Only the first 1000 test users are evaluated.
for user_id in tqdm(unique_users[:1000], desc="Processing users"):
    y_true, y_scores = get_user_predictions(user_id, train_matrix, test_data, model_knn)

    # Skip users for whom no prediction could be produced.
    if y_true is None or len(y_true) == 0:
        continue

    y_true_all.extend(y_true)
    y_score_all.extend(y_scores)
    processed_users = processed_users + 1

# Convert the accumulated lists to numpy arrays
y_true_all = np.array(y_true_all)
y_score_all = np.array(y_score_all)

print(f"\nSuccessfully processed {processed_users} users")
print(f"Total predictions: {len(y_true_all)}")
146

3 of 5 10-11-2024, 09:34 pm
9,12,19,68_ML Assignment-2.ipynb - Colab https://colab.research.google.com/drive/10e_X-Zip69NurIFbktAUKWp...

146
# Report the ROC curve, AUC and summary metrics, if anything was predicted
if len(y_true_all) == 0:
    print("No predictions were generated for evaluation.")
else:
    print("\nPlotting ROC curve...")
    roc_auc = plot_roc_curve(y_true_all, y_score_all)
    print(f"\nOverall AUC Score: {roc_auc:.3f}")

    # Share of positive labels and mean predicted score
    print("\nEvaluation Summary:")
    print(f"Positive ratings ratio: {np.mean(y_true_all):.2%}")
    print(f"Average prediction score: {np.mean(y_score_all):.3f}")

Data statistics:
Total number of ratings: 28552
Number of unique users: 3819
Number of unique items: 11523

Collecting predictions for evaluation...


Processing users: 100%|██████████| 1000/1000 [00:09<00:00, 100.50it/s]

Successfully processed 865 users


Total predictions: 1808

Plotting ROC curve...

4 of 5 10-11-2024, 09:34 pm
9,12,19,68_ML Assignment-2.ipynb - Colab https://colab.research.google.com/drive/10e_X-Zip69NurIFbktAUKWp...

Overall AUC Score: 0.510

Evaluation Summary:
Positive ratings ratio: 77.10%
Average prediction score: 0.051

5 of 5 10-11-2024, 09:34 pm

You might also like