Final Code

The document outlines a data analysis process using RFM (Recency, Frequency, Monetary) metrics to evaluate customer behavior from a dataset. It includes steps for calculating RFM scores, identifying and removing outliers, scaling the data, and determining the optimal number of clusters using the Elbow Method and Silhouette Score for KMeans clustering. The final output includes visualizations and metrics for optimal clustering.


# -*- coding: utf-8 -*-

"""New_3.ipynb

Automatically generated by Colab.

Original file is located at


https://colab.research.google.com/drive/1ZlO5nGZT8XhUlvk6fn91RKwTTdAtv004
"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import datetime as dt

# Load the dataset


df = pd.read_csv("/content/drive/MyDrive/kaggle.csv")

# Calculate RFM metrics

# Recency is already provided as a column in the dataset
# Frequency: total number of purchases across all channels
df['Frequency'] = (df['NumDealsPurchases'] + df['NumCatalogPurchases']
                   + df['NumStorePurchases'] + df['NumWebPurchases'])
# Monetary: total amount spent across all product categories
df['Monetary'] = (df['MntFishProducts'] + df['MntMeatProducts'] + df['MntFruits']
                  + df['MntSweetProducts'] + df['MntWines'] + df['MntGoldProds'])

rfm = df[['Recency', 'Frequency', 'Monetary']].copy()  # copy to avoid SettingWithCopyWarning on later assignments


rfm.head()

rfm.loc[:, "R_Score"] = pd.qcut(rfm['Recency'], 5, labels=[5, 4, 3, 2, 1])


rfm.loc[:, "F_Score"] = pd.qcut(rfm['Frequency'], 5, labels=[1, 2, 3, 4, 5])
rfm.loc[:, "M_Score"] = pd.qcut(rfm['Monetary'], 5, labels=[1, 2, 3, 4, 5])
rfm.loc[:, "RFM_SCORE"] = (rfm['R_Score'].astype(str) + rfm['F_Score'].astype(str)
+ rfm['M_Score'].astype(str))
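
# Example of the concatenation above: a customer in the most recent Recency
# quintile (R=5), the fourth Frequency quintile (F=4) and the third Monetary
# quintile (M=3) ends up with the string RFM_SCORE "543".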

rfm.head()

from scipy.stats import zscore

# Convert the string RFM_SCORE into a numeric RFM_score column before applying zscore


rfm['RFM_score'] = pd.to_numeric(rfm['RFM_SCORE'])

# Step 1: Calculate z-scores for the numeric RFM_score column


z_scores = rfm[['RFM_score']].apply(zscore)

# Step 2: Identify outliers (z-score > 3 or < -3)


outliers = (z_scores.abs() > 3).any(axis=1)
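
# For reference, zscore standardizes each value as (x - mean) / std, so the
# filter above flags rows whose RFM_score lies more than three standard
# deviations from the mean.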

print(type(outliers))
outliers.head(10)
rfm_outliers = df[outliers].reset_index(drop=True)
rfm_outliers.head(10)
#print(rfm_outliers)

# Step 3: Remove outliers from the original DataFrame (df)


rfm_cleaned = rfm[~outliers].reset_index(drop=True)  # keep only the rows that are not outliers

rfm_df = rfm_cleaned[['RFM_score']]
print(rfm_df)

# Save the cleaned data to a CSV file


#rfm_df.to_csv("RFM_Cleaned.csv", index=False)

# Initialize the MinMaxScaler (already imported above)


scaler = MinMaxScaler()

# Select only the desired columns for scaling


rfm_scaled = rfm_df[['RFM_score']]

# Apply fit_transform on the selected columns


rfm_scaled_t = scaler.fit_transform(rfm_scaled)

# Create the DataFrame with the correct column names


rfm_scaled_df = pd.DataFrame(rfm_scaled_t, columns=['RFM_score'])
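
# MinMaxScaler rescales each column to [0, 1] via (x - min) / (max - min),
# so the smallest RFM_score maps to 0 and the largest to 1.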

print("Scaled Dataset Using MinMaxScaler")


#print(rfm_scaled_df.head())
print(rfm_scaled_df)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans # Import KMeans
from sklearn.metrics import silhouette_score # Import silhouette_score

# Step 4: Elbow Method


wcss = [] # Within-Cluster Sum of Squares
k_range = range(2, 11) # Define k_range here
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(rfm_scaled_df)
    wcss.append(kmeans.inertia_)

# Finding the optimal k using the Elbow Method (point of maximum curvature)
diff_wcss = np.diff(wcss)
diff_wcss_ratio = diff_wcss[:-1] / diff_wcss[1:]
optimal_k = k_range[np.argmax(diff_wcss_ratio) + 1] # Use the defined k_range
print(f'Optimal number of clusters (k) based on Elbow Method: {optimal_k}')
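
# A toy check of the heuristic above on made-up WCSS values (illustrative
# numbers only, not from this dataset): the successive drops are
# [-60, -15, -5, -2, -1], the ratios of one drop to the next are
# [4.0, 3.0, 2.5, 2.0], and the largest ratio marks k = 3 as the sharpest bend.
toy_wcss = [100, 40, 25, 20, 18, 17]                # hypothetical WCSS for k = 2..7
toy_ratio = np.diff(toy_wcss)[:-1] / np.diff(toy_wcss)[1:]
print(list(range(2, 8))[np.argmax(toy_ratio) + 1])  # prints 3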

# Plot the Elbow Curve


plt.figure(figsize=(8, 5))
plt.plot(k_range, wcss, marker='o') # Use k_range here as well
plt.title('Elbow Method')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.show()

# Step 5: Silhouette Coefficient


silhouette_scores = []
for k in range(2, 11):  # Silhouette is not defined for k=1
    kmeans = KMeans(n_clusters=k, random_state=42)
    # Use the numeric data for clustering
    labels = kmeans.fit_predict(rfm_scaled_df)
    silhouette_scores.append(silhouette_score(rfm_scaled_df, labels))

# Print the optimal k based on the highest silhouette score


# Convert range to a list to make it subscriptable
k_values = list(range(2, 11))
optimal_k = k_values[np.argmax(silhouette_scores)]
print(f'Optimal number of clusters (k) based on Silhouette Score: {optimal_k}')

# Display the scores


print("Silhouette Scores for Different K Values:")
for k, score in zip(k_values, silhouette_scores):
    print(f"K = {k}: Silhouette Score = {score:.2f}")
