Subspace Clustering

The document outlines the implementation of the PROCLUS subspace clustering algorithm using Python, including data preprocessing with StandardScaler and clustering on credit card customer data. It details the initialization of medoids, assignment of points to clusters, and calculation of cluster quality metrics such as the Davies-Bouldin and Dunn indices. The code also includes package installation and data loading steps, demonstrating a complete workflow for clustering analysis.
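
At a high level, PROCLUS alternates two steps: assign every point to its nearest medoid, then re-estimate which l dimensions matter for each cluster. A condensed sketch of that loop, using the class and method names defined later in this document (X stands for the scaled feature matrix; k=5 and l=3 match the run at the end):

proclus = PROCLUS(k=5, l=3)                            # k clusters, l relevant dimensions each
proclus.initialize_medoids(X)                          # random initial medoids
clusters = proclus.assign_points(X, first_iter=True)   # full-space Euclidean assignment
proclus.subspaces = proclus.compute_subspaces(X, clusters)
for _ in range(proclus.max_iter):
    clusters = proclus.assign_points(X)                # Manhattan distance in each cluster's subspace
    proclus.subspaces = proclus.compute_subspaces(X, clusters)
    proclus.medoids = proclus.update_medoids(X, clusters)

This is what the fit method below wraps up, plus an early exit when the medoids stop changing.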


import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
from scipy.spatial.distance import cdist
import pyclustering.cluster.center_initializer as pyci  # imported but not used below

pip install pyclustering

Requirement already satisfied: pyclustering in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (0.10.1.2)
Requirement already satisfied: scipy>=1.1.0 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from pyclustering) (1.11.3)
Requirement already satisfied: matplotlib>=3.0.0 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from pyclustering) (3.8.2)
Requirement already satisfied: numpy>=1.15.2 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from pyclustering) (1.26.4)
Requirement already satisfied: Pillow>=5.2.0 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from pyclustering) (9.5.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (4.47.2)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (1.4.5)
Requirement already satisfied: packaging>=20.0 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (23.2)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\manvi\appdata\local\programs\python\python311\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->pyclustering) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
data = pd.read_csv("Credit Card Customer Data.csv")
data.head()

   Sl_No  Customer Key  Avg_Credit_Limit  Total_Credit_Cards  Total_visits_bank  Total_visits_online  Total_calls_made
0      1         87073            100000                   2                  1                    1                 0
1      2         38414             50000                   3                  0                   10                 9
2      3         17341             50000                   7                  1                    3                 4
3      4         40496             30000                   5                  1                    1                 4
4      5         47437            100000                   6                  0                   12                 3
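
Note that Sl_No and Customer Key are row identifiers rather than behavioral features; the code below scales and clusters all seven columns as-is. An optional variant (not performed in this document) is to drop the identifier columns before scaling; a minimal sketch, using the column names from the preview above:

features = data.drop(columns=["Sl_No", "Customer Key"])  # keep behavioral columns only
X = StandardScaler().fit_transform(features)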

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
from scipy.spatial.distance import cdist

class PROCLUS:
    def __init__(self, k, l, max_iter=10):
        """
        PROCLUS Subspace Clustering Algorithm
        :param k: Number of clusters
        :param l: Number of dimensions per cluster
        :param max_iter: Maximum iterations for convergence
        """
        self.k = k
        self.l = l
        self.max_iter = max_iter
        self.medoids = None
        self.subspaces = None
        self.clusters = None

    def initialize_medoids(self, data):
        """Select initial medoids by uniform random sampling (a simplification of PROCLUS's greedy phase)."""
        np.random.seed(42)
        indices = np.random.choice(len(data), self.k, replace=False)
        self.medoids = data[indices]

    def compute_subspaces(self, data, clusters):
        """Determine the l most relevant dimensions for each cluster."""
        subspaces = []
        for cluster in clusters:
            if len(cluster) > 0:
                cluster_data = data[cluster]
                # Mean absolute deviation of each dimension within the cluster
                mean_dist = np.mean(np.abs(cluster_data - np.mean(cluster_data, axis=0)), axis=0)
                # Keep the l dimensions with the smallest spread
                top_dims = np.argsort(mean_dist)[:self.l]
                subspaces.append(top_dims)
            else:
                # Empty cluster: fall back to l random dimensions
                subspaces.append(np.random.choice(data.shape[1], self.l, replace=False))
        return subspaces

    def assign_points(self, data, first_iter=False):
        """Assign every point to its closest medoid."""
        clusters = [[] for _ in range(len(self.medoids))]
        for i, point in enumerate(data):
            min_dist = float('inf')
            best_cluster = -1
            for cluster_id, medoid in enumerate(self.medoids):
                if first_iter:
                    # Full-space Euclidean distance (no subspaces exist yet)
                    dist = np.linalg.norm(point - medoid)
                else:
                    # Manhattan distance restricted to the cluster's subspace
                    subspace_dims = self.subspaces[cluster_id]
                    dist = np.sum(np.abs(point[subspace_dims] - medoid[subspace_dims]))
                if dist < min_dist:
                    min_dist = dist
                    best_cluster = cluster_id
            clusters[best_cluster].append(i)
        return clusters

    def update_medoids(self, data, clusters):
        """Update medoids by selecting the most central point in each cluster."""
        new_medoids = []
        for cluster in clusters:
            if len(cluster) > 0:
                cluster_data = data[cluster]
                # Total Manhattan distance from each point to all others in the cluster
                distances = np.sum(cdist(cluster_data, cluster_data, metric='cityblock'), axis=1)
                new_medoids.append(cluster_data[np.argmin(distances)])
            else:
                # Re-seed an empty cluster with a random data point
                # (np.random.choice(data, 1) raises on a 2-D array)
                new_medoids.append(data[np.random.choice(len(data))])
        return np.array(new_medoids)

    def fit(self, data):
        self.initialize_medoids(data)

        # Initial assignment before any subspaces have been computed
        initial_clusters = self.assign_points(data, first_iter=True)
        self.subspaces = self.compute_subspaces(data, initial_clusters)

        for _ in range(self.max_iter):
            clusters = self.assign_points(data)
            self.subspaces = self.compute_subspaces(data, clusters)
            new_medoids = self.update_medoids(data, clusters)

            if np.all(new_medoids == self.medoids):
                break
            self.medoids = new_medoids

        self.clusters = clusters
        return clusters

    # Dunn Index Calculation
    def dunn_index(self, data, clusters):
        intra_dists = []
        inter_dists = []

        # Ignore clusters with fewer than two points
        valid_clusters = [cluster for cluster in clusters if len(cluster) > 1]

        # Intra-cluster diameter: largest pairwise distance within a cluster
        for cluster in valid_clusters:
            cluster_data = data[cluster]
            intra_dists.append(np.max(cdist(cluster_data, cluster_data, metric='euclidean')))

        # Inter-cluster separation: smallest pairwise distance between two clusters
        for i in range(len(valid_clusters)):
            for j in range(i + 1, len(valid_clusters)):
                inter_dists.append(np.min(cdist(data[valid_clusters[i]], data[valid_clusters[j]], metric='euclidean')))

        if len(inter_dists) == 0 or len(intra_dists) == 0:
            return float("inf")  # Calculation not possible

        return min(inter_dists) / max(intra_dists)
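
For reference, the quantity dunn_index computes is the standard Dunn index: the smallest distance between any two points in different clusters, divided by the largest diameter of any single cluster (higher is better). In LaTeX notation, for clusters $C_1, \dots, C_m$:

$$D = \frac{\min_{i \neq j} \; \min_{x \in C_i,\, y \in C_j} \lVert x - y \rVert}{\max_{r} \; \max_{x,\, y \in C_r} \lVert x - y \rVert}$$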

# Load Sample Credit Card Data
df = pd.read_csv("Credit Card Customer Data.csv")
df.head()

   Sl_No  Customer Key  Avg_Credit_Limit  Total_Credit_Cards  Total_visits_bank  Total_visits_online  Total_calls_made
0      1         87073            100000                   2                  1                    1                 0
1      2         38414             50000                   3                  0                   10                 9
2      3         17341             50000                   7                  1                    3                 4
3      4         40496             30000                   5                  1                    1                 4
4      5         47437            100000                   6                  0                   12                 3

scaler = StandardScaler()
data_scaled = scaler.fit_transform(df)
print(data_scaled)

[[-1.72942847  1.24691971  1.74018685 ... -0.86045063 -0.54748969 -1.25153737]
 [-1.72417983 -0.65320273  0.41029254 ... -1.47373077  2.5205186   1.89185881]
 [-1.71893118 -1.47609839  0.41029254 ... -0.86045063  0.13428993  0.1455276 ]
 ...
 [ 1.71893118 -0.05105538  2.93709172 ... -0.86045063  2.17962879 -0.90227113]
 [ 1.72417983  0.99629832  3.65523464 ... -0.86045063  4.22496765 -1.25153737]
 [ 1.72942847  0.97657819  3.52224521 ... -1.47373077  3.20229822 -0.55300488]]
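
As a quick sanity check (an optional snippet, not part of the original run), each scaled column should now have mean approximately 0 and standard deviation approximately 1:

print(data_scaled.mean(axis=0).round(3))  # ~0 for every column
print(data_scaled.std(axis=0).round(3))   # ~1 (StandardScaler uses the population std)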

# Run PROCLUS
k = 5 # Number of clusters
l = 3 # Subspace dimensionality
proclus = PROCLUS(k, l)
clusters = proclus.fit(data_scaled)

# Assign Labels
labels = np.zeros(len(data_scaled))
for cluster_id, cluster in enumerate(clusters):
for index in cluster:
labels[index] = cluster_id

# Compute Cluster Quality
db_index = davies_bouldin_score(data_scaled, labels)
dunn = proclus.dunn_index(data_scaled, clusters)

print(f"Davies-Bouldin Index: {db_index}")
print(f"Dunn Index: {dunn}")

Davies-Bouldin Index: 2.1216181090876955
Dunn Index: 0.05855920277922305
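
For interpretation (a general note, not specific to this dataset): davies_bouldin_score averages, over clusters, the worst-case ratio of within-cluster scatter to between-centroid separation, so lower values indicate better-separated clusters, while the Dunn index rewards large separation relative to cluster diameter, so higher is better. With centroids $c_i$ and mean intra-cluster distance $\sigma_i$, the Davies-Bouldin index is

$$DB = \frac{1}{k} \sum_{i=1}^{k} \max_{j \neq i} \frac{\sigma_i + \sigma_j}{d(c_i, c_j)}$$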
