0% found this document useful (0 votes)
7 views6 pages

PMA Experiment 2

Uploaded by

siyebic418
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
7 views6 pages

PMA Experiment 2

Uploaded by

siyebic418
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 6

K-Means Clustering

A Short Case Study


In this notebook, we take a brief look at the basics of how to do K-Means clustering on a small example dataset. By the end of
this study, I hope we will have answered the following questions about our problem:

1. What's a good way to segment our dataset into a small set of clusters?


2. How can we achieve quick results using the Pandas, Numpy, Matplotlib, Pyplot and SKLearn modules?
3. How to select the best hyperparameters for K-Means Clustering?
4. How to display and visualize data in the most honest and friendly way to our stakeholders?

Initialization

# Initialization

# Module Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.cluster import KMeans

# Style Definitions
# Apply the 'Solarize_Light2' style sheet to every matplotlib figure drawn below.
plt.style.use('Solarize_Light2')

# Get dataframe from CSV file
# 'customers.csv' is resolved relative to the current working directory.


df = pd.read_csv('customers.csv')
# Notebook cell output: preview the first rows to check the columns parsed as expected.
df.head()

CustomerID Gender Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

3 4 Female 23 16 77

4 5 Female 31 17 40

# (rows, columns) of the loaded dataframe.
df.shape

(200, 5)

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

CustomerID Age Annual Income (k$) Spending Score (1-100)

count 200.000000 200.000000 200.000000 200.000000

mean 100.500000 38.850000 60.560000 50.200000

std 57.879185 13.969007 26.264721 25.823522

min 1.000000 18.000000 15.000000 1.000000

25% 50.750000 28.750000 41.500000 34.750000

50% 100.500000 36.000000 61.500000 50.000000

75% 150.250000 49.000000 78.000000 73.000000

max 200.000000 70.000000 137.000000 99.000000

# Draw the distribution of each numeric feature side by side (one row, three panels).
plt.figure(1, figsize=(16, 4))
feature_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for position, column in enumerate(feature_columns, start=1):
    plt.subplot(1, 3, position)
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the documented replacement for the same kind of plot.
    sns.histplot(df[column], bins=32, kde=True, stat='density')
    plt.title(f'Histogram of {column}')
# Spacing applies to the whole figure, so set it once instead of once per panel.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

# Assignment Stage
# Feature matrix for the first clustering: age and spending score per customer.
X1 = df[['Age', 'Spending Score (1-100)']].to_numpy()

# Fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared distances).
inertia = []
for k in range(1, 11):
    model = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=500,
        random_state=42,
    ).fit(X1)
    inertia.append(model.inertia_)

# Inertia versus number of clusters: solid markers with a fainter connecting line.
cluster_counts = np.arange(1, 11)
plt.figure(1, figsize=(15, 6))
plt.plot(cluster_counts, inertia, 'o')
plt.plot(cluster_counts, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Final model for Age x Spending Score, using four clusters.
model = KMeans(
    n_clusters=4,
    init='k-means++',
    max_iter=500,
    random_state=42,
)
# fit_predict fits the model and returns the labels in one pass; calling
# fit() beforehand would only train the same model a second time.
y_kmeans = model.fit_predict(X1)
labels = model.labels_
centroids = model.cluster_centers_

# One colour per cluster. The list length matches n_clusters so the legend
# never advertises a cluster that does not exist.
cluster_colors = ['red', 'blue', 'green', 'cyan']

plt.figure(figsize=(20, 10))
for cluster_id, color in enumerate(cluster_colors):
    members = X1[y_kmeans == cluster_id]
    plt.scatter(
        members[:, 0],
        members[:, 1],
        s=100,
        c=color,
        label=f'Cluster {cluster_id + 1}',
    )
plt.title('Clusters of Customers - Age X Spending Score')
plt.xlabel('Age')
plt.ylabel('Spending Score')
plt.legend()
plt.show()

Second Clustering
By Annual Income and Spending Score

# Assignment Stage
# Feature matrix for the second clustering: annual income and spending score.
X2 = df[['Annual Income (k$)', 'Spending Score (1-100)']].to_numpy()

# Fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared distances).
inertia = []
for k in range(1, 11):
    model = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=500,
        random_state=42,
    ).fit(X2)
    inertia.append(model.inertia_)

# Inertia versus number of clusters: solid markers with a fainter connecting line.
cluster_counts = np.arange(1, 11)
plt.figure(1, figsize=(20, 10))
plt.plot(cluster_counts, inertia, 'o')
plt.plot(cluster_counts, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()
# Final model for Annual Income x Spending Score, using five clusters.
model = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=500,
    random_state=42,
)
# fit_predict fits the model and returns the labels in one pass; calling
# fit() beforehand would only train the same model a second time.
y_kmeans = model.fit_predict(X2)
labels = model.labels_
centroids = model.cluster_centers_

# One colour per cluster; the list length matches n_clusters.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']

plt.figure(figsize=(20, 10))
for cluster_id, color in enumerate(cluster_colors):
    members = X2[y_kmeans == cluster_id]
    plt.scatter(
        members[:, 0],
        members[:, 1],
        s=100,
        c=color,
        label=f'Cluster {cluster_id + 1}',
    )
plt.title('Clusters of Customers - Annual Income (k$) X Spending Score')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score')
plt.legend()
plt.show()

# Assignment Stage
from sklearn.cluster import KMeans

# Feature matrix for the third clustering: age, annual income and spending score.
X3 = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].to_numpy()

# Fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared distances).
inertia = []
for k in range(1, 11):
    model = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=500,
        random_state=42,
    ).fit(X3)
    inertia.append(model.inertia_)

# Inertia versus number of clusters: solid markers with a fainter connecting line.
cluster_counts = np.arange(1, 11)
plt.figure(1, figsize=(20, 10))
plt.plot(cluster_counts, inertia, 'o')
plt.plot(cluster_counts, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()
# Final model over all three features (X3), using six clusters.
model = KMeans(
    n_clusters=6,
    init='k-means++',
    max_iter=500,
    random_state=42,
)
# fit() returns the fitted estimator, so the labels can be read straight off it.
labels = model.fit(X3).labels_

# Store each customer's cluster id alongside its features, then display the frame.
df['cluster'] = labels
df

CustomerID Gender Age Annual Income (k$) Spending Score (1-100) cluster

0 1 Male 19 15 39 0

1 2 Male 21 15 81 1

2 3 Female 20 16 6 0

3 4 Female 23 16 77 1

4 5 Female 31 17 40 0

... ... ... ... ... ... ...

195 196 Female 35 120 79 4

196 197 Female 45 126 28 2

197 198 Male 32 126 74 4

198 199 Male 32 137 18 2

199 200 Male 30 137 83 4

200 rows × 6 columns

# Interactive 3-D view of the clustering over age, income and spending score.
#
# Cluster ids are categories, not quantities: casting them to strings makes
# plotly use a discrete colour per cluster instead of a continuous colour bar.
# The legend order is derived from the ids actually present, so every cluster
# is listed rather than a hard-coded range that can miss one.
cluster_order = [str(cluster_id) for cluster_id in sorted(df['cluster'].unique())]
plot_df = df.assign(cluster=df['cluster'].astype(str))

fig = px.scatter_3d(
    plot_df,
    x="Age",
    y="Annual Income (k$)",
    z="Spending Score (1-100)",
    color='cluster',
    hover_data=[
        "Age",
        "Annual Income (k$)",
        "Spending Score (1-100)",
    ],
    category_orders={"cluster": cluster_order},
)

# Remove the outer margins so the plot fills the output area.
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

fig.show()

You might also like