Customer Segmentation With K-Means Clustering and Visualization - Colab
Customer Segmentation With K-Means Clustering and Visualization - Colab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Load the raw transactions from the Excel workbook.
data = pd.read_excel('/OnlineRetail.xlsx')

# Preview the first five rows, then count the missing values per column.
print(data.head(5))
print(data.isna().sum())
InvoiceNo 0
StockCode 0
Description 1454
Quantity 0
InvoiceDate 0
UnitPrice 0
CustomerID 135080
Country 0
dtype: int64
# Replace missing product descriptions with the placeholder 'Unknown'.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on data['Description']: the in-place call on a
# column selection raises pandas' chained-assignment FutureWarning and will
# no longer modify `data` in pandas 3.0.
data['Description'] = data['Description'].fillna('Unknown')
<ipython-input-12-b328947c4b82>:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained ass
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col
data['Description'].fillna('Unknown', inplace=True)
# Replace missing customer IDs with the sentinel value 0.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on data['CustomerID']: the in-place call on a
# column selection raises pandas' chained-assignment FutureWarning and will
# no longer modify `data` in pandas 3.0.
# NOTE(review): every row without an ID now shares CustomerID 0; confirm that
# any later per-customer aggregation is meant to treat them as one customer.
data['CustomerID'] = data['CustomerID'].fillna(0)
<ipython-input-14-3d3ed6052492>:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained ass
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col
data['CustomerID'].fillna(0, inplace=True)
# Re-check the per-column missing-value counts now that Description and
# CustomerID have been filled.
print(data.isna().sum())
InvoiceNo 0
StockCode 0
Description 0
Quantity 0
InvoiceDate 0
UnitPrice 0
CustomerID 0
Country 0
dtype: int64
# Select the two clustering features and standardise them (zero mean, unit
# variance per column) so neither dominates the distance computation.
X = customer_summary.loc[:, ['TotalSpend', 'NumTransactions']]
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
# Elbow method: fit K-Means once per candidate cluster count and record the
# inertia (sum of squared distances of samples to their closest centre).
k_values = range(1, 11)  # candidate cluster counts: 1 to 10
inertia = []
for k in k_values:
    # Fixed random_state keeps the centroid initialisation repeatable.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot inertia against k. The same k_values drive both the loop and the
# x-axis, so the two cannot drift out of sync.
plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia, marker='o', color='b')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()
# Fit the final model with the chosen cluster count (presumably read off the
# elbow plot) and store each row's cluster label on customer_summary.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(X_scaled)
customer_summary['Cluster'] = kmeans.labels_
# Scatter the customers in the original (unscaled) feature space, coloured
# by their assigned cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=customer_summary,
    x='TotalSpend',
    y='NumTransactions',
    hue='Cluster',
    palette='Set2',
    s=100,
    alpha=0.6,
    ax=ax,
)
ax.set_title('Customer Segmentation based on Total Spend and Number of Transactions')
ax.set_xlabel('Total Spend')
ax.set_ylabel('Number of Transactions')
ax.legend(title='Cluster')
plt.show()