0% found this document useful (0 votes)
7 views · 5 pages

Kmeans

The document outlines a Python script for performing KMeans clustering on a dataset containing age and income information. It includes data preprocessing steps such as scaling and visualization, followed by fitting the KMeans model and evaluating its performance using inertia and silhouette score. Additionally, it demonstrates the elbow method for determining the optimal number of clusters.

Uploaded by

hetvibhora192
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
7 views · 5 pages

Kmeans

The document outlines a Python script for performing KMeans clustering on a dataset containing age and income information. It includes data preprocessing steps such as scaling and visualization, followed by fitting the KMeans model and evaluating its performance using inertia and silhouette score. Additionally, it demonstrates the elbow method for determining the optimal number of clusters.

Uploaded by

hetvibhora192
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

nexzotk17

December 22, 2024

[ ]: from sklearn.cluster import KMeans


import pandas as pd
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import silhouette_score
from matplotlib import pyplot as plt
%matplotlib inline

# In[ ]:
# Read the income dataset; "income.csv" is resolved relative to the working
# directory. The preview below shows the columns Name, Age and Income($).
df = pd.read_csv(filepath_or_buffer="income.csv")
df.head()

# Alternative dataset, kept for reference:
# df = pd.read_csv("/content/US_violent_crime.csv.xls")
# df.head()

[ ]: Name Age Income($)


0 Rob 27 70000
1 Michael 29 90000
2 Mohan 29 61000
3 Ismail 28 60000
4 Kory 42 150000

# In[ ]:
# Remove the Name column so only the numeric Age and Income($) columns are
# left for scaling and clustering.
df = df.drop(columns=['Name'])
df.head()

[ ]: Age Income($)
0 27 70000
1 29 90000
2 29 61000
3 28 60000
4 42 150000

# In[ ]:
# Standardize both columns, then wrap the resulting array in a new DataFrame.
sc = StandardScaler().fit(df)
scaled_df = sc.transform(df)
scaled_df

# The rebuilt frame has integer column labels: 0 holds the scaled Age values
# and 1 the scaled Income($) values. Later cells rely on these labels.
df = pd.DataFrame(data=scaled_df)
df.head()

1
[ ]: 0 1
0 -1.356055 -0.480684
1 -1.009157 -0.010159
2 -1.009157 -0.692421
3 -1.182606 -0.715947
4 1.245679 1.401417

# In[ ]:
# Scatter the standardized data before clustering: column 0 (Age) on the
# x-axis against column 1 (Income($)) on the y-axis.
plt.scatter(x=df[0], y=df[1])
plt.xlabel(xlabel='Age')
plt.ylabel(ylabel='Income($)')

[ ]: Text(0, 0.5, 'Income($)')

# In[ ]:
# Fit a three-cluster KMeans model on the standardized data. The fixed
# random_state keeps the k-means++ initialization repeatable between runs.
km = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    random_state=42,
)
km.fit(df)

# km.fit_predict(new_datapoints)---> imp !
# NOTE: fit_predict refits the model on whatever it is given; to assign new
# points to the clusters learned above, use km.predict(new_datapoints).
# km.n_clusters

[ ]: KMeans(n_clusters=3, random_state=42)

2
[ ]: centroids = km.cluster_centers_  # one row per cluster, in the standardized space km was fitted on
centroids  # displayed for inspection; reused by the cluster plot below

[ ]: array([[ 0.60143983,  1.40141653],
            [-1.1247901 , -0.74862223],
            [ 0.98550535, -0.51205261]])

[ ]: labels = km.labels_  # cluster index assigned to each row of df by the fit above
labels  # displayed for inspection; used to colour the points in the cluster plot

[ ]: array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
dtype=int32)

# In[ ]:
# Plot the standardized points coloured by their cluster label, then overlay
# the cluster centers as semi-transparent red crosses.
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=labels, s=50, cmap="viridis")

# This call was split across lines by the PDF export, which left wrap glyphs
# in the middle of the argument list; it is rejoined here into one valid call.
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    c="red",
    s=100,
    marker="x",
    alpha=0.5,
)

plt.xlabel('Age')
plt.ylabel('Income($)')
plt.show()

# In[ ]:
# Inertia: sum of squared distances from each sample to its closest cluster
# center (lower means tighter clusters).
print("Inertia:", km.inertia_)

# Silhouette score: mean silhouette coefficient over all samples, ranging
# from -1 (worst) to 1 (best).
silhouette = silhouette_score(X=df, labels=km.labels_)
print("Silhouette Score:", silhouette)

3
Inertia: 3.8893086571301416
Silhouette Score: 0.7119890290065082

[ ]: # from mpl_toolkits.mplot3d import Axes3D

# # Let's create our sets again, this time it will be 3 dimensional variable

# kmeans = KMeans(n_clusters = 3)
# k_fit = kmeans.fit(df)
# sets = k_fit.labels_
# centers = kmeans.cluster_centers_

[ ]: # # for more than 2 cols


# plt.rcParams['figure.figsize'] = (16, 9)
# fig = plt.figure()
# ax = Axes3D(fig)
# ax.scatter(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2])

# plt.scatter(centers[:,0], centers[:,1], c = "red", s = 100, marker = "x", alpha = 0.5)

[ ]: # from sklearn.cluster import KMeans


# from sklearn.metrics import silhouette_score

# kmeans = KMeans(n_clusters=3, random_state=42)


# kmeans.fit(df)

# # Inertia
# print("Inertia:", kmeans.inertia_)

# # Silhouette Score
# silhouette = silhouette_score(df, kmeans.labels_)
# print("Silhouette Score:", silhouette)

# In[ ]:
# Elbow method: fit KMeans for K = 1..10 on the standardized data and record
# each model's inertia. The "elbow" of the resulting curve suggests a
# reasonable number of clusters.
k_values = range(1, 11)
inertia = []
for i in k_values:
    # Seed each model (same seed as the main km model) so the curve is
    # reproducible from run to run instead of varying with the initialization.
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(df)
    inertia.append(kmeans.inertia_)

# Draw the curve once, after every K has been evaluated.
plt.figure(figsize=(10, 10))
plt.plot(k_values, inertia, marker='o', linestyle='-.')
plt.title("Elbow Method - KMeans Clustering")
plt.xlabel("K")
plt.ylabel("Inertia")
plt.show()

4
5

You might also like