Income (K-Means Clustering On A Sample Data Set)
0 Rob 27 70000
1 Michael 29 90000
2 Mohan 29 61000
3 Ismail 28 60000
4 Kory 42 150000
In [4]:
# Visualize the raw data: each point is one person (age vs. income).
plt.scatter(df['Age'], df['Income($)'])
plt.ylabel('Income($)')
plt.xlabel('Age')
Out[4]:
In [5]:
#We can see 3 clear clusters
#Pre-processing: scale both features to [0, 1] so neither dominates
#the Euclidean distance used by k-means.
#NOTE: the original refit the SAME scaler object twice (the second fit
#silently discards the first's parameters). MinMaxScaler scales each
#column independently, so one fit_transform over both columns gives
#identical results and is the idiomatic, non-fragile form.
scaler = MinMaxScaler()
df[['Age', 'Income($)']] = scaler.fit_transform(df[['Age', 'Income($)']])
df.head()
Out[5]:
In [6]:
plt.scatter(df.Age,df['Income($)'])
Out[6]:
<matplotlib.collections.PathCollection at 0x7fa1615b2cd0>
In [7]:
# Fit k-means with K=3 on the two scaled features.
# random_state pins the centroid initialization so the label assignment
# (and the Out[7] array below) is reproducible on Restart & Run All;
# n_init is made explicit to match the long-standing sklearn default.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
y_predicted = km.fit_predict(df[['Age', 'Income($)']])
y_predicted
Out[7]:
array([0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
dtype=int32)
In [11]:
# Attach the predicted cluster id to each row for filtering and plotting.
df.loc[:, 'cluster'] = y_predicted
df.head(3)
Out[11]:
In [12]:
km.cluster_centers_
Out[12]:
array([[0.1372549 , 0.11633428],
[0.85294118, 0.2022792 ],
[0.72268908, 0.8974359 ]])
In [13]:
# Plot each cluster in its own colour, plus the fitted centroids.
# A loop over cluster ids replaces the three copy-pasted
# filter-then-scatter cells (same colours, same points).
for cluster_id, color in enumerate(['green', 'red', 'black']):
    members = df[df.cluster == cluster_id]
    plt.scatter(members.Age, members['Income($)'], color=color)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            color='purple', marker='*', label='centroid')
plt.xlabel('Age (scaled)')
plt.ylabel('Income($) (scaled)')
plt.legend()
Out[13]:
<matplotlib.legend.Legend at 0x7fa161759c70>
In [14]:
#Elbow plot: verify the choice of K by computing SSE (inertia) for K=1..9.
sse = []
k_rng = range(1, 10)
for k in k_rng:
    # Use a fresh, separately named model per K so this loop does not
    # clobber the fitted 3-cluster model `km` used by the cells above;
    # random_state keeps the curve reproducible.
    km_k = KMeans(n_clusters=k, n_init=10, random_state=42)
    km_k.fit(df[['Age', 'Income($)']])
    sse.append(km_k.inertia_)
plt.xlabel('K')
plt.ylabel('Sum of squared error')
plt.plot(k_rng, sse)
Out[14]:
[<matplotlib.lines.Line2D at 0x7fa16184a880>]
In [ ]:
#We observe that the elbow point appears at K=3