0% found this document useful (0 votes)
6 views2 pages

Elbow Method

Uploaded by

Prateek Verma
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
6 views2 pages

Elbow Method

Uploaded by

Prateek Verma
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 2

## import the libraries used for clustering, dataframes, and plotting

from sklearn.cluster import KMeans


import pandas as pd
from matplotlib import pyplot as plt

##to read csv file


# Load the income dataset; the path literal must use plain ASCII quotes
# (typographic quotes are a syntax error in Python).
df = pd.read_csv('/kaggle/input/income-dataset-for-k-means/income.csv')
df
## to check first 5 rows
# Preview only: do NOT assign the result back to df, otherwise the dataframe
# is truncated to 5 rows and every later step (elbow loop, clustering, plots)
# would run on that 5-row sample.
df.head()
## to check the basic statistics of the data
df.describe()
df.shape
# info is a method; without the call parentheses nothing is summarised.
df.info()
## scatter plot of income against age for the rows currently held in df
plt.ylabel('Income($)')
plt.xlabel('Age')
plt.scatter(x=df['Age'], y=df['Income($)'])

## to use elbow method to find number of clusters (sse = sum of squared error)
# For each candidate k in 1..9, fit k-means on the two feature columns and
# record the model's inertia_ value as that k's SSE.
# NOTE(review): Age and Income($) are fed in unscaled; if their value ranges
# differ a lot, consider scaling them before clustering -- confirm with data.
sse = []
k_rng = range(1, 10)
for k in k_rng:
    # The loop body must be indented; flattened to column 0 it is an
    # IndentationError and only the last statement would be "in" the loop.
    km = KMeans(n_clusters=k)
    km.fit(df[['Age', 'Income($)']])
    sse.append(km.inertia_)

## print sse (one value per k in k_rng, in the same order)
sse

## elbow graph: SSE on the y-axis against the number of clusters K on the x-axis
plt.plot(k_rng, sse)
plt.ylabel('Sum of squared error')
plt.xlabel('K')

## fit k-means with 3 clusters and get a cluster label for every row
km = KMeans(n_clusters=3)
km.fit(df[['Age', 'Income($)']])
y_predicted = km.labels_
y_predicted

## store the predicted cluster number of each datapoint in a new column
df['cluster'] = y_predicted
df.head()

## cluster centers: one row per cluster, columns in the order (Age, Income($))
km.cluster_centers_

##to plot the different datapoints as per their assigned clusters
# Split the rows by the cluster label stored in df['cluster'] and draw each
# group in its own colour, then overlay the fitted centroids.
df1 = df[df.cluster == 0]
df2 = df[df.cluster == 1]
df3 = df[df.cluster == 2]
plt.scatter(df1.Age, df1['Income($)'], color='green')
plt.scatter(df2.Age, df2['Income($)'], color='red')
plt.scatter(df3.Age, df3['Income($)'], color='black')
# Column 0 of cluster_centers_ is Age and column 1 is Income($), matching the
# column order the model was fitted with. The keyword is `label` in one piece;
# it had been split across two lines, which is a syntax error.
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    color='purple',
    marker='*',
    label='centroid',
)
plt.xlabel('Age')
plt.ylabel('Income ($)')
# Only the centroid series carries a label, so it is the only legend entry.
plt.legend()

You might also like