Kmeans
Kmeans
[ ]: df = pd.read_csv("income.csv")
df.head()
# Alternative dataset, currently disabled; uncomment to cluster it instead.
# df = pd.read_csv("/content/US_violent_crime.csv.xls")
# df.head()
[ ]: df = df.drop(['Name'], axis = 1)
# After dropping 'Name', the preview shows the two remaining columns
# (Age and Income($)) that the following cells scale and cluster.
df.head()
[ ]: Age Income($)
0 27 70000
1 29 90000
2 29 61000
3 28 60000
4 42 150000
[ ]: sc = StandardScaler()
# Fit the scaler on the frame and transform it in one step; the result is a
# plain array, not a DataFrame.
scaled_df = sc.fit_transform(df)
scaled_df
# NOTE: the frame is rebuilt without `columns=`, so the original labels are
# replaced by positional ones (0, 1). Later cells therefore index df[0] and
# df[1] instead of 'Age' and 'Income($)'.
df = pd.DataFrame(scaled_df)
df.head()
1
[ ]: 0 1
0 -1.356055 -0.480684
1 -1.009157 -0.010159
2 -1.009157 -0.692421
3 -1.182606 -0.715947
4 1.245679 1.401417
[ ]: plt.scatter(df[0],df[1])
# Columns 0 and 1 hold the standardized Age and Income($) values, so both
# axes are in scaled units rather than raw years / dollars.
plt.xlabel('Age')
plt.ylabel('Income($)')
[ ]: km = KMeans(n_clusters=3,init='k-means++', max_iter=300,random_state=42)
# Fit three clusters on the standardized frame; random_state is fixed so
# reruns start from the same centroid initialization.
km.fit(df)
# Important: to assign NEW data points to the clusters fitted above, use
# km.predict(new_datapoints). km.fit_predict(new_datapoints) would refit the
# model on those points instead of reusing these centroids.
# km.n_clusters
[ ]: KMeans(n_clusters=3, random_state=42)
2
[ ]: centroids = km.cluster_centers_
# Centroid coordinates are in the standardized feature space the model was
# fitted on, not in the original Age / Income($) units.
centroids
[ ]: labels = km.labels_
# Cluster index assigned to each row of the fitted data.
labels
[ ]: array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
dtype=int32)
plt.xlabel('Age')
plt.ylabel('Income($)')
plt.show()
[ ]: # Inertia
# Per scikit-learn: sum of squared distances of samples to their closest
# cluster center (lower means tighter clusters).
print("Inertia:", km.inertia_)
# Silhouette Score
# Computed on the same standardized frame and labels the model was fitted
# with; per scikit-learn it ranges from -1 to 1, higher meaning better
# separated clusters.
silhouette = silhouette_score(df, km.labels_)
print("Silhouette Score:", silhouette)
3
Inertia: 3.8893086571301416
Silhouette Score: 0.7119890290065082
# # Let's create our sets again, this time it will be 3 dimensional variable
# kmeans = KMeans(n_clusters = 3)
# k_fit = kmeans.fit(df)
# sets = k_fit.labels_
# centers = kmeans.cluster_centers_
# # Inertia
# print("Inertia:", kmeans.inertia_)
# # Silhouette Score
# silhouette = silhouette_score(df, kmeans.labels_)
# print("Silhouette Score:", silhouette)
[ ]: inertia = []
# Elbow method: fit KMeans for K = 1..10 and record each model's inertia.
k_values = range(1, 11)
for i in k_values:
    # Use the same fixed seed as the main model so the curve is reproducible
    # instead of changing with every random centroid initialization.
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(df)
    inertia.append(kmeans.inertia_)

# Plot inertia against K; the point where the curve stops dropping sharply
# (the "elbow") is the usual candidate for the number of clusters.
plt.figure(figsize=(10,10))
plt.plot(k_values, inertia, marker = 'o', linestyle = '-.')
plt.title("Elbow Method - KMeans Clustering")
plt.xlabel("K")
plt.ylabel("Inertia")
plt.show()
4
5