0% found this document useful (0 votes)
6 views11 pages

ML Assignment No 5

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
6 views11 pages

ML Assignment No 5

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 11

# Assignment No = 5

# Name : Prathamesh Dilip Pimpalkar


# Class : TE(IT)
# Roll No : 2233052.
# Batch : C

#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#reading mall_customers.csv file
# Loads the CSV at the hard-coded Colab path into the DataFrame `df`, then
# prints the frame so its contents appear in the cell output.


df = pd.read_csv("/content/Mall_Customers.csv")
print(df)

CustomerID Genre Age Annual Income (k$) Spending Score (1-100)


0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
.. ... ... ... ... ...
195 196 Female 35 120 79
196 197 Female 45 126 28
197 198 Male 32 126 74
198 199 Male 32 137 18
199 200 Male 30 137 83

[200 rows x 5 columns]

#dimensions of the dataset


df.shape

(200, 5)

#names of all attributes


df.columns

Index(['CustomerID', 'Genre', 'Age', 'Annual Income (k$)',


'Spending Score (1-100)'],
dtype='object')

#represent top 5 rows of dataset


df.head()
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

3 4 Female 23 16 77

4 5 Female 31 17 40

#represent specific number of top rows of dataset


df.head(10)

CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

3 4 Female 23 16 77

4 5 Female 31 17 40

5 6 Female 22 17 76

6 7 Female 35 18 6

7 8 Female 23 18 94

8 9 Male 64 19 3

9 10 Female 30 19 72

#represent bottom 5 rows of dataset


df.tail()

CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

195 196 Female 35 120 79

196 197 Female 45 126 28

197 198 Male 32 126 74

198 199 Male 32 137 18

199 200 Male 30 137 83

#represent specific number of bottom rows of dataset


df.tail(10)
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

190 191 Female 34 103 23

191 192 Female 32 103 69

192 193 Male 33 113 8

193 194 Female 38 113 91

194 195 Female 47 120 16

195 196 Female 35 120 79

196 197 Female 45 126 28

197 198 Male 32 126 74

198 199 Male 32 137 18

199 200 Male 30 137 83

#display the index range, column names, non-null counts, data types and memory usage


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerID 200 non-null int64
1 Genre 200 non-null object
2 Age 200 non-null int64
3 Annual Income (k$) 200 non-null int64
4 Spending Score (1-100) 200 non-null int64
dtypes: int64(4), object(1)
memory usage: 7.9+ KB

#display all statistical info


df.describe()
CustomerID Age Annual Income (k$) Spending Score (1-100)

count 200.000000 200.000000 200.000000 200.000000

mean 100.500000 38.850000 60.560000 50.200000

std 57.879185 13.969007 26.264721 25.823522

min 1.000000 18.000000 15.000000 1.000000

25% 50.750000 28.750000 41.500000 34.750000

50% 100.500000 36.000000 61.500000 50.000000

75% 150.250000 49.000000 78.000000 73.000000

max 200.000000 70.000000 137.000000 99.000000

#find the missing values in the dataset


df.isna()

CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

0 False False False False False

1 False False False False False

2 False False False False False

3 False False False False False

4 False False False False False

... ... ... ... ... ...

195 False False False False False

196 False False False False False

197 False False False False False

198 False False False False False

199 False False False False False

200 rows × 5 columns

#to find the total number of missing values in the dataset


df.isna().sum()

CustomerID 0
Genre 0
Age 0
Annual Income (k$) 0
Spending Score (1-100) 0
dtype: int64

#to get a boolean mask marking which values in the dataset are equal to zero
df==0
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

0 False False False False False

1 False False False False False

2 False False False False False

3 False False False False False

4 False False False False False

... ... ... ... ... ...

195 False False False False False


196 False False False False False

197 False False False False False

198 False False False False False

199 False False False False False

200 rows × 5 columns

(df==0).sum()

CustomerID 0
Genre 0
Age 0
Annual Income (k$) 0
Spending Score (1-100) 0
dtype: int64

(df==0).sum().sum()

#to get values of a particular column


df["CustomerID"]

0 1
1 2
2 3
3 4
4 5
...
195 196
196 197
197 198
198 199
199 200
Name: CustomerID, Length: 200, dtype: int64

#to get the mean of all values of the column


df["CustomerID"].mean()

100.5

#to get all values of a particular row (selected by index label)


df.loc[4]
CustomerID 5
Genre Female
Age 31
Annual Income (k$) 17
Spending Score (1-100) 40
Name: 4, dtype: object

#to get the max of all values of the column


df["CustomerID"].max()

200

#to access a specific row and a specific column using index locations
df.iloc[3, 4]

77

#to get a specific row and a specific column using names


df.loc[2, "CustomerID"]

#to get all rows and all columns


df.iloc[:,:]

CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

3 4 Female 23 16 77

4 5 Female 31 17 40

... ... ... ... ... ...

195 196 Female 35 120 79

196 197 Female 45 126 28

197 198 Male 32 126 74

198 199 Male 32 137 18

199 200 Male 30 137 83

200 rows × 5 columns

#to get all columns but only one row


df.iloc[1,:]

CustomerID 2
Genre Male
Age 21
Annual Income (k$) 15
Spending Score (1-100) 81
Name: 1, dtype: object

#to get all rows but only one column


df.loc[:,"CustomerID"]

0 1
1 2
2 3
3 4
4 5
...
195 196
196 197
197 198
198 199
199 200
Name: CustomerID, Length: 200, dtype: int64

#to get all rows but only some specific columns


df.loc[:,["Age","CustomerID"]]

Age CustomerID

0 19 1

1 21 2

2 20 3

3 23 4

4 31 5

... ... ...

195 35 196

196 45 197

197 32 198

198 32 199

199 30 200

200 rows × 2 columns

#to get some specific rows but all columns


df.loc[[0,1,2],:]

CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

df.dtypes

CustomerID int64
Genre object
Age int64
Annual Income (k$) int64
Spending Score (1-100) int64
dtype: object

# Extracting Independent Variables
#
# No dependent variable is needed for the data pre-processing step: this is a
# clustering problem, so there is no known target to determine.
# Take the 'Annual Income (k$)' and 'Spending Score (1-100)' features, which
# sit at column positions 3 and 4, as a plain NumPy matrix.
x = df.iloc[:, [3, 4]].to_numpy()

# Raw scatter of the two features before any clustering is applied.
plt.scatter(
    df['Annual Income (k$)'],
    df['Spending Score (1-100)'],
)

<matplotlib.collections.PathCollection at 0x7f078a127a50>

#finding optimal number of clusters using the elbow method
#
# Fits one K-means model for every candidate cluster count from 1 to 10 on the
# feature matrix `x` and records each model's `inertia_` (the within-cluster
# sum of squares, WCSS). Plotting WCSS against k shows where adding more
# clusters stops paying off (the "elbow").


from sklearn.cluster import KMeans

wcss_list = []  # WCSS of the fitted model for each candidate k
# Loop over candidate cluster counts (iterations from 1 to 10)
for i in range(1, 11):
    # random_state is fixed so the curve is reproducible between runs
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss_list)
plt.title('The Elbow Method Graph')
plt.xlabel('Number of clusters (k)')
plt.ylabel('wcss_list')
plt.show()

from sklearn. cluster import KMeans

#training the K-means model on a dataset
# Fit a 5-cluster K-means model on the feature matrix `x`
# (NOTE(review): k = 5 is presumably the elbow read off the WCSS plot; confirm).
# The fitted model's `labels_` holds the cluster index assigned to each row.


kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42).fit(x)
y_predict = kmeans.labels_

#centroid
# One row per cluster centre, in the same column order as `x`.
print(" Cluster centroids are \n", kmeans.cluster_centers_)
print(" \n\n predicated clusters for data points are :")
y_predict

Cluster centroids are


[[55.2962963 49.51851852]
[88.2 17.11428571]
[26.30434783 20.91304348]
[25.72727273 79.36363636]
[86.53846154 82.12820513]]

predicated clusters for data points are :


array([2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 0,
2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 4, 0, 4, 1, 4, 1, 4,
0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
1, 4], dtype=int32)
# visualizing the clusters
#
# Draws the rows of `x` belonging to each predicted cluster in their own colour,
# then overlays the fitted cluster centres as larger markers.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    # Boolean mask picks the rows of x assigned to this cluster index
    members = x[y_predict == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_id + 1}')

# The original line was cut off after "c='y"; completed as yellow centroids
# with a legend label so the call is syntactically whole.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='yellow', label='Centroid')


plt.title('Clusters of Customers')
plt.xlabel('Annual Income(k$)')
plt.ylabel('Spending Score(1-100)')
plt.legend()
plt.show()

# display to which cluster customer belongs
# Stores each row's predicted cluster index (`y_predict`) in a new 'cluster'
# column of `df`; the trailing bare `df` makes the notebook render the frame.


df ['cluster' ]=y_predict
df
CustomerID Genre Age Annual Income (k$) Spending Score (1-100) cluster clust

0 1 Male 19 15 39 2

1 2 Male 21 15 81 3

2 3 Female 20 16 6 2

3 4 Female 23 16 77 3

4 5 Female 31 17 40 2

... ... ... ... ... ... ...

195 196 Female 35 120 79 4

196 197 Female 45 126 28 1

197 198 Male 32 126 74 4

198 199 Male 32 137 18 1

199 200 Male 30 137 83 4

200 rows × 7 columns

Colab paid products - Cancel contracts here

You might also like