DACLUSTER
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
data = pd.read_csv('Customers.csv')
data.head()
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerID 2000 non-null int64
1 Gender 2000 non-null object
2 Age 2000 non-null int64
3 Annual Income ($) 2000 non-null int64
4 Spending Score (1-100) 2000 non-null int64
5 Profession 1965 non-null object
6 Work Experience 2000 non-null int64
7 Family Size 2000 non-null int64
dtypes: int64(6), object(2)
memory usage: 125.1+ KB
data.describe(include='all')
data.isnull().sum()
CustomerID 0
Gender 0
Age 0
Annual Income ($) 0
Spending Score (1-100) 0
Profession 35
Work Experience 0
Family Size 0
dtype: int64
data['Profession'] = data['Profession'].fillna(data['Profession'].mode()[0])
data.isnull().sum()
CustomerID 0
Gender 0
Age 0
Annual Income ($) 0
Spending Score (1-100) 0
Profession 0
Work Experience 0
Family Size 0
dtype: int64
plt.figure(figsize=(10,5))
plt.subplot(2,2,1)
sns.boxplot(data['Annual Income ($)'],color='lightgreen')
plt.title('Annual Income')
plt.subplot(2,2,2)
sns.boxplot(data['Spending Score (1-100)'],color='orange')
plt.title('Spending Score')
plt.subplot(2,2,3)
sns.boxplot(data['Age'],color='skyblue')
plt.title('Age')
plt.subplot(2,2,4)
sns.boxplot(data['Work Experience'],color='salmon')
plt.title('Work Experience')
plt.tight_layout()
plt.show()
plt.figure(figsize=(10,5))
plt.subplot(2,2,1)
sns.histplot(data['Annual Income ($)'], color='lightgreen', kde=True, bins=30)
plt.title('Annual Income')
plt.subplot(2,2,2)
sns.histplot(data['Spending Score (1-100)'], color='orange', kde=True, bins=30)
plt.title('Spending Score')
plt.subplot(2,2,3)
sns.histplot(data['Age'], color='skyblue', kde=True, bins=30)
plt.title('Age')
plt.subplot(2,2,4)
sns.histplot(data['Work Experience'], color='salmon', kde=True, bins=30)
plt.title('Work Experience')
plt.tight_layout()
plt.show()
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[['Age', 'Annual Income ($)', 'Spending Score (1-100)']])
# Within-cluster sum of squares (inertia) for k = 1..10, used for the elbow method
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)
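The cells below reference linkage_matrix and Cluster_KMeans, which are never created above. A minimal sketch of the missing steps follows, assuming Ward linkage for the hierarchical step and k=5 for K-Means; both are assumptions, so pick k from your own elbow plot.
# Elbow plot: choose k where the curve bends
plt.figure(figsize=(8,5))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# K-Means labels used by the scatter plot below (k=5 is an assumption)
kmeans = KMeans(n_clusters=5, random_state=42)
data['Cluster_KMeans'] = kmeans.fit_predict(data_scaled)
# Ward linkage matrix used by the dendrogram and fcluster calls below (linkage method is an assumption)
linkage_matrix = linkage(data_scaled, method='ward')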
plt.figure(figsize=(10,7))
dendrogram(linkage_matrix)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()
data['Cluster_Hierarchical'] = fcluster(linkage_matrix, 3, criterion='maxclust')
plt.figure(figsize=(8,5))
sns.scatterplot(data=data, x='Annual Income ($)', y='Spending Score (1-100)', hue='Cluster_KMeans', palette='Set2')
plt.title('K-Means Clustering Result')
plt.show()
plt.figure(figsize=(8,5))
sns.scatterplot(data=data, x='Annual Income ($)', y='Spending Score (1-100)', hue='Cluster_Hierarchical', palette='Set1')
plt.title('Hierarchical Clustering Result')
plt.show()