Assignment 5
Assignment 5
In [13]: df = pd.read_csv("Mall_Customers.csv")  # load the customer table; the path is relative to the working directory
In [14]: df.head()  # preview the first rows of the raw table
Out[14]: CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
In [15]: df.tail()  # preview the last rows of the raw table
Out[15]: CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
In [16]: df.shape  # (number of rows, number of columns)
Out[16]: (200, 5)
In [17]: df.columns  # list the column labels
In [18]: df.drop("CustomerID",axis=1,inplace=True)
In [19]: df  # display the frame after the CustomerID column was removed
0 Male 19 15 39
1 Male 21 15 81
2 Female 20 16 6
3 Female 23 16 77
4 Female 31 17 40
Missing values:
Out[20]: Genre 0
Age 0
Annual Income (k$) 0
Spending Score (1-100) 0
dtype: int64
In [21]: df.describe()  # summary statistics for the columns
In [22]: df.info()  # print per-column non-null counts, dtypes and memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Genre 200 non-null object
1 Age 200 non-null int64
2 Annual Income (k$) 200 non-null int64
3 Spending Score (1-100) 200 non-null int64
dtypes: int64(3), object(1)
memory usage: 6.4+ KB
In [23]: df.nunique()  # number of distinct values in each column
Out[23]: Genre 2
Age 51
Annual Income (k$) 64
Spending Score (1-100) 84
dtype: int64
In [25]: df['Genre'].value_counts().plot(kind='pie',figsize=(5,5),autopct='%1.1f%%')
plt.title("Total Gender Count")
plt.show()
In [26]: sns.pairplot(df,hue="Genre");
In [29]: df  # display the frame again; the recorded output shows Genre as 0/1 instead of Female/Male
0 1 19 15 39
1 1 21 15 81
2 0 20 16 6
3 0 23 16 77
4 0 31 17 40
195 0 35 120 79
196 0 45 126 28
197 1 32 126 74
198 1 32 137 18
199 1 30 137 83
In [31]: # Plotting the results onto a line graph, allowing us to observe 'The elbow'
plt.plot(range(1,11),wcss,marker='o')
plt.title('The Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()
Out[32]: Genre Age Annual Income (k$) Spending Score (1-100) label
0 1 19 15 39 4
1 1 21 15 81 4
2 0 20 16 6 2
3 0 23 16 77 4
4 0 31 17 40 2
In [34]: X=data.iloc[:,:4]
y=data.iloc[:,-1]
(160, 4) (160,)
(40, 4) (40,)
# Cluster assignments predicted by the fitted model `km` for both splits.
y_train_km, y_test_km = km.predict(X_train), km.predict(X_test)

# Score each split by comparing the reference labels with the predicted
# clusters using adjusted_rand_score.
# NOTE(review): the `acc_*_gmm` names look like a leftover — the predictions
# come from `km` and the metric is the adjusted Rand score, not an accuracy.
# The names are kept because cells not shown here may read them.
acc_train_gmm = adjusted_rand_score(y_train, y_train_km)
acc_test_gmm = adjusted_rand_score(y_test, y_test_km)
0 15 39
1 15 81
2 16 6
3 16 77
4 17 40
195 120 79
196 126 28
197 126 74
198 137 18
199 137 83
0 15 39 4
1 15 81 3
2 16 6 4
3 16 77 3
4 17 40 4
195 120 79 2
196 126 28 0
197 126 74 2
198 137 18 0
199 137 83 2
palette=['green','brown','orange','red','dodgerblue'],data = da
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Spending Score (1-100) vs Annual Income (k$)')
plt.show()
In [ ]: