0% found this document useful (0 votes)
13 views9 pages

DACLUSTER

The document outlines a data analysis process using a dataset of 2000 customers, including their demographics and spending behavior. It includes data cleaning, exploratory data analysis with visualizations, and clustering techniques using KMeans and hierarchical clustering. The analysis aims to identify patterns in customer data to inform business decisions.

Uploaded by

iameverywhere792
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
13 views9 pages

DACLUSTER

The document outlines a data analysis process using a dataset of 2000 customers, including their demographics and spending behavior. It includes data cleaning, exploratory data analysis with visualizations, and clustering techniques using KMeans and hierarchical clustering. The analysis aims to identify patterns in customer data to inform business decisions.

Uploaded by

iameverywhere792
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import seaborn as sns

# Load the customer dataset from a CSV file in the working directory.
data = pd.read_csv('Customers.csv')

# Preview the first rows to confirm the columns were parsed as expected.
data.head()

{"summary":"{\n \"name\": \"data\",\n \"rows\": 2000,\n \"fields\":


[\n {\n \"column\": \"CustomerID\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 577,\n \"min\": 1,\n
\"max\": 2000,\n \"num_unique_values\": 2000,\n
\"samples\": [\n 1861,\n 354,\n 1334\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Gender\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Female\",\n \"Male\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 28,\n \"min\": 0,\n \"max\": 99,\n
\"num_unique_values\": 100,\n \"samples\": [\n 90,\n
62\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Annual Income ($)\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 45739,\n \"min\": 0,\n
\"max\": 189974,\n \"num_unique_values\": 1786,\n
\"samples\": [\n 162465,\n 124256\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Spending Score (1-100)\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
27,\n \"min\": 0,\n \"max\": 100,\n
\"num_unique_values\": 101,\n \"samples\": [\n 0,\n
11\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Profession\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 9,\n \"samples\":
[\n \"Homemaker\",\n \"Engineer\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Work Experience\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
3,\n \"min\": 0,\n \"max\": 17,\n
\"num_unique_values\": 18,\n \"samples\": [\n 1,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Family Size\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 1,\n
\"max\": 9,\n \"num_unique_values\": 9,\n \"samples\":
[\n 7,\n 3\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"data"}

# Print column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerID 2000 non-null int64
1 Gender 2000 non-null object
2 Age 2000 non-null int64
3 Annual Income ($) 2000 non-null int64
4 Spending Score (1-100) 2000 non-null int64
5 Profession 1965 non-null object
6 Work Experience 2000 non-null int64
7 Family Size 2000 non-null int64
dtypes: int64(6), object(2)
memory usage: 125.1+ KB

# Summary statistics for every column; include='all' also covers the
# non-numeric (object) columns.
data.describe(include='all')

{"summary":"{\n \"name\": \"data\",\n \"rows\": 11,\n \"fields\":


[\n {\n \"column\": \"CustomerID\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 720.5049024934813,\n
\"min\": 1.0,\n \"max\": 2000.0,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2000.0,\n
1000.5,\n 1500.25\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"Gender\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 4,\n \"samples\":
[\n 2,\n \"1186\",\n \"2000\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Age\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 691.4867589780589,\n
\"min\": 0.0,\n \"max\": 2000.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n 48.96,\n
48.0,\n 2000.0\n ],\n \"semantic_type\": \"\",\
n \"description\": \"\"\n }\n },\n {\n
\"column\": \"Annual Income ($)\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 67802.59849983425,\n
\"min\": 0.0,\n \"max\": 189974.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
110731.8215,\n 110045.0,\n 2000.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Spending Score (1-100)\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
691.0208261988552,\n \"min\": 0.0,\n \"max\": 2000.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n 50.9625,\
n 50.0,\n 2000.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Profession\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 4,\n \"samples\": [\n 9,\n
\"612\",\n \"1965\"\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"Work Experience\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 705.3069888546329,\n
\"min\": 0.0,\n \"max\": 2000.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n 4.1025,\n
3.0,\n 2000.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Family Size\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 705.7605915163412,\n \"min\":
1.0,\n \"max\": 2000.0,\n \"num_unique_values\": 8,\n
\"samples\": [\n 3.7685,\n 4.0,\n 2000.0\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n }\n ]\n}","type":"dataframe"}

# Count missing values per column.
data.isnull().sum()

CustomerID 0
Gender 0
Age 0
Annual Income ($) 0
Spending Score (1-100) 0
Profession 35
Work Experience 0
Family Size 0
dtype: int64

# Impute missing Profession entries with the most frequent category.
# The assignment must sit on a single logical line: a bare line break after
# `=` is a SyntaxError.
most_common_profession = data['Profession'].mode()[0]
data['Profession'] = data['Profession'].fillna(most_common_profession)

# Re-check the per-column missing counts after imputation.
data.isnull().sum()

CustomerID 0
Gender 0
Age 0
Annual Income ($) 0
Spending Score (1-100) 0
Profession 0
Work Experience 0
Family Size 0
dtype: int64

# Box plots of the four numeric features on a 2x2 grid, one panel per
# (column, colour, title) entry.
boxplot_panels = [
    ('Annual Income ($)', 'lightgreen', 'Annual Income'),
    ('Spending Score (1-100)', 'orange', 'Spending Score'),
    ('Age', 'skyblue', 'Age'),
    ('Work Experience', 'salmon', 'Work Experience'),
]

plt.figure(figsize=(10, 5))
for slot, (column, colour, heading) in enumerate(boxplot_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.boxplot(data[column], color=colour)
    plt.title(heading)
plt.tight_layout()

plt.show()

# Histograms (30 bins, with a KDE overlay) of the four numeric features on a
# 2x2 grid, one panel per (column, colour, title) entry.
histogram_panels = [
    ('Annual Income ($)', 'lightgreen', 'Annual Income'),
    ('Spending Score (1-100)', 'orange', 'Spending Score'),
    ('Age', 'skyblue', 'Age'),
    ('Work Experience', 'salmon', 'Work Experience'),
]

plt.figure(figsize=(10, 5))
for slot, (column, colour, heading) in enumerate(histogram_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.histplot(data[column], color=colour, kde=True, bins=30)
    plt.title(heading)
plt.tight_layout()

plt.show()

# Skewness and kurtosis of annual income.
# The column name is kept on one line: a quoted string literal cannot span a
# raw line break, so the wrapped form did not parse.
annual_income = data['Annual Income ($)']
print(f"Annual Income Skewness: {annual_income.skew(): .4f}")
print(f"Annual Income Kurtosis: {annual_income.kurt(): .4f}")

Annual Income Skewness: -0.1165


Annual Income Kurtosis: -0.8431

# Skewness and kurtosis of the spending score.
# The column name is kept on one line: a quoted string literal cannot span a
# raw line break, so the wrapped form did not parse.
spending_score = data['Spending Score (1-100)']
print(f"Spending Score Skewness: {spending_score.skew(): .4f}")
print(f"Spending Score Kurtosis: {spending_score.kurt(): .4f}")

Spending Score Skewness: 0.0046


Spending Score Kurtosis: -1.1007

# Skewness and kurtosis of customer age.
age = data['Age']
age_skewness = age.skew()
age_kurtosis = age.kurt()
print(f"Age Skewness: {age_skewness: .4f}")
print(f"Age Kurtosis: {age_kurtosis: .4f}")

Age Skewness: 0.0492


Age Kurtosis: -1.1689
# Skewness and kurtosis of work experience.
# The column name is kept on one line: a quoted string literal cannot span a
# raw line break, so the wrapped form did not parse.
work_experience = data['Work Experience']
print(f"Work Experience Skewness: {work_experience.skew(): .4f}")
print(f"Work Experience Kurtosis: {work_experience.kurt(): .4f}")

Work Experience Skewness: 0.6837


Work Experience Kurtosis: -0.4666

# Standardize the three features used for clustering so that no single
# feature dominates the distance computations through its scale.
cluster_features = ['Age', 'Annual Income ($)', 'Spending Score (1-100)']
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[cluster_features])

# Elbow method: fit K-Means for k = 1..10 on the scaled features and record
# the within-cluster sum of squares (the model's inertia_) for each k.
# The loop body must be indented; the flattened form raised IndentationError.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters to inspect the elbow curve.
candidate_ks = range(1, 11)
plt.plot(candidate_ks, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Fit the final three-cluster K-Means model and store each customer's label.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(data_scaled)
data['Cluster_KMeans'] = kmeans_labels

# Hierarchical clustering of the scaled features using Ward linkage.
linkage_matrix = linkage(data_scaled, method='ward')

# Dendrogram of the full merge tree.
dendrogram_size = (10, 7)
plt.figure(figsize=dendrogram_size)
dendrogram(linkage_matrix)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Cut the tree into at most three flat clusters and store the labels.
hierarchical_labels = fcluster(linkage_matrix, 3, criterion='maxclust')
data['Cluster_Hierarchical'] = hierarchical_labels

# Scatter plots of annual income vs. spending score, coloured by the cluster
# label from each method. The y column name is kept on one line: the wrapped
# form split the quoted string literal and did not parse.
plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=data,
    x='Annual Income ($)',
    y='Spending Score (1-100)',
    hue='Cluster_KMeans',
    palette='Set2',
)
plt.title('K-Means Clustering Result')
plt.show()

plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=data,
    x='Annual Income ($)',
    y='Spending Score (1-100)',
    hue='Cluster_Hierarchical',
    palette='Set1',
)
plt.title('Hierarchical Clustering Result')
plt.show()

You might also like