Clustering

K-means Clustering
In [1]: from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# Sample data generation (replace with your data)
np.random.seed(0)
X = np.random.rand(100, 2)  # 100 points in 2 dimensions

# Initialize KMeans
kmeans = KMeans(n_clusters=3, random_state=0)

# Fit and predict clusters
clusters = kmeans.fit_predict(X)

# Visualize the clusters
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='x', c='red', s=200, label='Cluster Centers')
plt.title('K-means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
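The scatter plot alone does not say how good the partition is. Mirroring the evaluation used later in this notebook, a silhouette score can be computed on the same X and clusters from the cell above (a minimal sketch, not part of the original run):

from sklearn.metrics import silhouette_score

# Higher is better (range -1 to 1); on uniform random data the score is
# usually modest, since there is no real cluster structure to recover.
score = silhouette_score(X, clusters)
print(f"Silhouette Score: {score:.2f}")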

K-medoid Clustering
In [4]: import numpy as np
from sklearn.metrics import pairwise_distances
from random import sample

# Function to compute total cost (sum of distances) for a set of medoids
def compute_cost(X, medoids, clusters):
    cost = 0
    for medoid, cluster in zip(medoids, clusters):
        cost += np.sum(pairwise_distances(X[cluster], X[medoid].reshape(1, -1)))
    return cost

# K-medoids clustering using Partitioning Around Medoids (PAM)
def k_medoids(X, k, max_iter=300):
    m, n = X.shape
    # Randomly initialize medoids
    medoids = sample(range(m), k)

    for iteration in range(max_iter):
        clusters = [[] for _ in range(k)]
        # Assign each point to the nearest medoid
        for idx, point in enumerate(X):
            distances = [np.linalg.norm(point - X[medoid]) for medoid in medoids]
            closest_medoid = np.argmin(distances)
            clusters[closest_medoid].append(idx)

        new_medoids = []
        # Update medoids for each cluster
        for cluster in clusters:
            if len(cluster) == 0:
                continue
            distances_sum = np.sum(pairwise_distances(X[cluster], X[cluster]), axis=1)
            new_medoid = cluster[np.argmin(distances_sum)]
            new_medoids.append(new_medoid)

        # Check for convergence
        if set(medoids) == set(new_medoids):
            break

        medoids = new_medoids

    # Final cluster assignment
    final_clusters = [[] for _ in range(k)]
    for idx, point in enumerate(X):
        distances = [np.linalg.norm(point - X[medoid]) for medoid in medoids]
        closest_medoid = np.argmin(distances)
        final_clusters[closest_medoid].append(idx)

    # Compute final cost
    final_cost = compute_cost(X, medoids, final_clusters)

    return medoids, final_clusters, final_cost

# Example usage:
if __name__ == "__main__":
    from sklearn.datasets import make_blobs
    import matplotlib.pyplot as plt

    # Create sample data
    X, y = make_blobs(n_samples=300, centers=4, random_state=42)

    # Perform K-medoids clustering
    k = 4
    medoids, clusters, cost = k_medoids(X, k)

    # Plot the clusters and medoids
    for i, cluster in enumerate(clusters):
        plt.scatter(X[cluster, 0], X[cluster, 1], label=f'Cluster {i+1}')
    plt.scatter(X[medoids, 0], X[medoids, 1], s=200, c='red', label='Medoids', marker='x')
    plt.legend()
    plt.title('K-Medoids Clustering')
    plt.show()
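If an extra dependency is acceptable, the hand-rolled PAM loop above can also be replaced by a library estimator; a sketch assuming the optional scikit-learn-extra package (a third-party add-on, not part of scikit-learn itself) is installed:

# pip install scikit-learn-extra  (optional third-party package)
from sklearn_extra.cluster import KMedoids

km = KMedoids(n_clusters=4, metric='euclidean', random_state=42).fit(X)
print("Medoid indices:", km.medoid_indices_)
print("First ten labels:", km.labels_[:10])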

K-means++
In [4]: from sklearn.cluster import KMeans
import numpy as np

# Sample data: 2D points
X = np.array([
    [1, 2],
    [1, 4],
    [1, 0],
    [10, 2],
    [10, 4],
    [10, 0]
])

# Initialize KMeans with k-means++ initialization
k = 2
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
kmeans.fit(X)

print("Cluster Centers:")
print(kmeans.cluster_centers_)

print("Labels:")
print(kmeans.labels_)

Cluster Centers:
[[10. 2.]
[ 1. 2.]]
Labels:
[1 1 1 0 0 0]
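For intuition, init='k-means++' spreads the starting centers out: after one random center, each further center is drawn with probability proportional to the squared distance to the nearest center chosen so far. A minimal sketch of that seeding rule (an illustration only, not scikit-learn's actual implementation):

import numpy as np

def kmeans_pp_init(X, k, rng=None):
    """Pick k initial centers using the k-means++ rule."""
    rng = np.random.default_rng(0) if rng is None else rng
    centers = [X[rng.integers(len(X))]]          # first center: uniform at random
    for _ in range(k - 1):
        # Squared distance from every point to its nearest chosen center
        d2 = np.min([np.sum((X - c) ** 2, axis=1) for c in centers], axis=0)
        # Sample the next center with probability proportional to d^2
        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])
    return np.array(centers)

print(kmeans_pp_init(X, k=2))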

DBSCAN using scikit-learn


In [6]: # Import necessary libraries
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
import numpy as np

# Generate sample data


X, y = make_moons(n_samples=300, noise=0.05, random_state=42)

# Initialize DBSCAN with parameters


dbscan = DBSCAN(eps=0.2, min_samples=5)

# Fit the model


dbscan.fit(X)

# Extract labels (-1 indicates noise)


labels = dbscan.labels_

# Number of clusters (excluding noise)


n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(f'Number of clusters found: {n_clusters}')

# Plotting the results
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black color for noise
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title(f'DBSCAN Clustering (Number of clusters: {n_clusters})')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

Number of clusters found: 2
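eps=0.2 happens to suit this synthetic data. A common heuristic for choosing it on other data (not shown in the original notebook) is to plot sorted k-nearest-neighbour distances with k = min_samples and look for the knee; a hedged sketch:

from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt

# Distance to each point's 5th neighbour (kneighbors counts the point itself,
# so this approximates the min_samples-th neighbour distance)
nn = NearestNeighbors(n_neighbors=5).fit(X)
distances, _ = nn.kneighbors(X)
k_dist = np.sort(distances[:, -1])

plt.plot(k_dist)
plt.xlabel('Points sorted by distance')
plt.ylabel('5th nearest-neighbour distance')
plt.title('k-distance plot (the knee suggests a value for eps)')
plt.show()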

Hierarchical clustering
In [1]: import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_score, confusion_matrix

# Load Iris Dataset


iris = load_iris()
X = iris.data
y_true = iris.target

# Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform Agglomerative Hierarchical Clustering using Ward's Method


Z = linkage(X_scaled, method='ward')

# Plot Dendrogram
plt.figure(figsize=(10, 7))
dendrogram(Z, truncate_mode='level', p=3)
plt.title('Agglomerative Hierarchical Clustering Dendrogram (Iris Dataset)')
plt.xlabel('Sample Index or (Cluster Size)')
plt.ylabel('Distance')
plt.show()

# Determine Cluster Assignments (Assuming 3 Clusters)


clusters = fcluster(Z, t=3, criterion='maxclust')

# Evaluate Clustering Performance


silhouette_avg = silhouette_score(X_scaled, clusters)
print(f"Silhouette Score: {silhouette_avg:.2f}")

# Compare with True Labels (Optional)
# Note: fcluster labels start at 1 while the iris targets start at 0,
# so the confusion matrix below has one extra (empty) row and column.
cm = confusion_matrix(y_true, clusters)
print("Confusion Matrix:")
print(cm)

Silhouette Score: 0.45


Confusion Matrix:
[[ 0 49 1 0]
[ 0 0 27 23]
[ 0 0 2 48]
[ 0 0 0 0]]
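Because the cluster IDs here are arbitrary, a permutation-invariant score such as the adjusted Rand index can complement the confusion matrix; a small sketch (added here, not part of the original run):

from sklearn.metrics import adjusted_rand_score

# 1.0 means a perfect match with the true species labels, ~0 is chance level;
# the score ignores how the cluster IDs happen to be numbered.
print(f"Adjusted Rand Index: {adjusted_rand_score(y_true, clusters):.2f}")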

In [2]: import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, confusion_matrix

# 1. Load Iris Dataset


iris = load_iris()
X = iris.data
y_true = iris.target

# 2. Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Initialize Agglomerative Clustering with Single Linkage
# (metric= replaces the older affinity= argument, which newer scikit-learn versions no longer accept)
agglom = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='single')

# 4. Fit the Model


agglom.fit(X_scaled)

# 5. Cluster Labels
labels = agglom.labels_

# 6. Evaluate Clustering Performance


silhouette_avg = silhouette_score(X_scaled, labels)
print(f"Silhouette Score: {silhouette_avg:.2f}")

# Optional: Compare with True Labels


cm = confusion_matrix(y_true, labels)
print("Confusion Matrix:")
print(cm)

# 7. Generate the linkage matrix for dendrogram


Z = linkage(X_scaled, method='single')

# 8. Plot Dendrogram
plt.figure(figsize=(10, 7))
dendrogram(Z, truncate_mode='level', p=3)
plt.title('Agglomerative Hierarchical Clustering Dendrogram (Single Linkage)')
plt.xlabel('Sample Index or (Cluster Size)')
plt.ylabel('Distance')
plt.show()

# 9. Reduce Dimensions for Visualization


pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# 10. Plot Clusters


plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', label='Cluster')
plt.title('Agglomerative Clustering with Single Linkage (PCA Reduced)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.show()

Silhouette Score: 0.50


Confusion Matrix:
[[ 0 1 49]
[50 0 0]
[50 0 0]]
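Single linkage tends to chain nearby clusters together, which is why two of the three iris species end up in the same cluster above. A quick sketch (added for comparison, not in the original run) of trying the other linkage strategies on the same scaled data:

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Compare the standard linkage strategies on the scaled iris features.
for method in ['single', 'complete', 'average', 'ward']:
    labels_m = AgglomerativeClustering(n_clusters=3, linkage=method).fit_predict(X_scaled)
    print(f"{method:>8}: silhouette = {silhouette_score(X_scaled, labels_m):.2f}")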

Divisive Hierarchical Clustering (splitting clusters with KMeans)

In [3]: import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, confusion_matrix

def divisive_clustering(X, max_clusters=3):
    """
    Perform Divisive Hierarchical Clustering using KMeans for splitting.

    Parameters:
    - X: ndarray of shape (n_samples, n_features)
    - max_clusters: int, desired number of clusters

    Returns:
    - labels: ndarray of shape (n_samples,), cluster labels
    """
    # Initialize all points in one cluster
    clusters = {0: np.arange(X.shape[0])}
    next_cluster_id = 1

    while len(clusters) < max_clusters:
        # Find the cluster to split (e.g., the one with the largest variance)
        variances = {cid: X[indices].var(axis=0).mean() for cid, indices in clusters.items()}
        cluster_to_split = max(variances, key=variances.get)
        indices = clusters.pop(cluster_to_split)

        # If the cluster has only one point, it cannot be split further
        if len(indices) <= 1:
            continue

        # Split the cluster into two using KMeans
        kmeans = KMeans(n_clusters=2, random_state=42)
        kmeans.fit(X[indices])
        labels_split = kmeans.labels_

        # Assign new cluster IDs
        clusters[next_cluster_id] = indices[labels_split == 0]
        clusters[next_cluster_id + 1] = indices[labels_split == 1]
        next_cluster_id += 2

    # Assign labels
    labels_final = np.zeros(X.shape[0], dtype=int)
    for cluster_id, indices in clusters.items():
        labels_final[indices] = cluster_id

    return labels_final

# 1. Load Iris Dataset


iris = load_iris()
X = iris.data
y_true = iris.target

# 2. Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Desired number of clusters


desired_clusters = 3

# 4. Perform Divisive Clustering


labels = divisive_clustering(X_scaled, max_clusters=desired_clusters)

# 5. Evaluate Clustering Performance


silhouette_avg = silhouette_score(X_scaled, labels)
print(f"Silhouette Score: {silhouette_avg:.2f}")

# Optional: Compare with True Labels


cm = confusion_matrix(y_true, labels)
print("Confusion Matrix:")
print(cm)

# 6. Reduce Dimensions for Visualization


pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# 7. Plot Clusters
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', label='Cluster')
plt.title('Divisive Clustering (PCA Reduced)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.show()

Silhouette Score: 0.46


Confusion Matrix:
[[ 0 50 0 0 0]
[ 0 0 0 11 39]
[ 0 0 0 36 14]
[ 0 0 0 0 0]
[ 0 0 0 0 0]]
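The empty rows and columns above appear because the divisive routine hands out new IDs as it splits, so the surviving cluster IDs are not the contiguous 0, 1, 2. If compact labels are preferred for reporting, they can be remapped before evaluation; a small sketch:

import numpy as np

# Map whatever cluster IDs survived the splits onto 0..k-1 before comparing.
_, labels_compact = np.unique(labels, return_inverse=True)
print(confusion_matrix(y_true, labels_compact))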

In [5]: import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import networkx as nx
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, confusion_matrix

def divisive_clustering(X, max_clusters=3):
    """
    Perform Divisive Hierarchical Clustering using KMeans for splitting.

    Parameters:
    - X: ndarray of shape (n_samples, n_features)
    - max_clusters: int, desired number of clusters

    Returns:
    - labels: ndarray of shape (n_samples,), cluster labels
    - hierarchy: list of tuples, each representing a split in the format (parent, child1, child2, distance)
    """
    # Initialize all points in one cluster
    clusters = {0: np.arange(X.shape[0])}
    hierarchy = []
    next_cluster_id = 1

    while len(clusters) < max_clusters:
        # Select the cluster to split (e.g., the one with the largest variance)
        variances = {cid: X[indices].var(axis=0).mean() for cid, indices in clusters.items()}
        cluster_to_split = max(variances, key=variances.get)
        indices = clusters.pop(cluster_to_split)

        # If the cluster has only one point, it cannot be split further
        if len(indices) <= 1:
            continue

        # Split the cluster into two using KMeans
        kmeans = KMeans(n_clusters=2, random_state=42)
        kmeans.fit(X[indices])
        labels_split = kmeans.labels_

        # Assign new cluster IDs
        child1 = next_cluster_id
        child2 = next_cluster_id + 1
        clusters[child1] = indices[labels_split == 0]
        clusters[child2] = indices[labels_split == 1]
        next_cluster_id += 2

        # Calculate the distance (e.g., distance between cluster centroids)
        centroid_parent = X[indices].mean(axis=0)
        centroid_child1 = X[clusters[child1]].mean(axis=0)
        centroid_child2 = X[clusters[child2]].mean(axis=0)
        distance = np.linalg.norm(centroid_child1 - centroid_parent) + np.linalg.norm(centroid_child2 - centroid_parent)

        # Record the split
        hierarchy.append((cluster_to_split, child1, child2, distance))

    # Assign labels
    labels_final = np.zeros(X.shape[0], dtype=int)
    for cluster_id, indices in clusters.items():
        labels_final[indices] = cluster_id

    return labels_final, hierarchy

def build_dendrogram(hierarchy):
    """
    Build a dendrogram-like graph from the hierarchy of splits.

    Parameters:
    - hierarchy: list of tuples, each representing a split in the format (parent, child1, child2, distance)

    Returns:
    - G: NetworkX graph representing the dendrogram
    """
    G = nx.Graph()

    for parent, child1, child2, distance in hierarchy:
        G.add_edge(parent, child1, weight=distance)
        G.add_edge(parent, child2, weight=distance)

    return G

def hierarchy_pos(G, root, width=1., vert_gap=0.2, vert_loc=0, xcenter=0.5):
    """
    Assign positions to each node in the graph for hierarchical plotting.

    Parameters:
    - G: NetworkX graph
    - root: the root node of current branch
    - width: horizontal space allocated for this branch
    - vert_gap: gap between levels of hierarchy
    - vert_loc: vertical location of root
    - xcenter: horizontal location of root

    Returns:
    - pos: dict mapping nodes to positions
    """
    def _hierarchy_pos(G, root, left, right, vert_gap, vert_loc, pos, parent=None):
        children = list(G.neighbors(root))
        if parent is not None:
            children.remove(parent)
        if len(children) != 0:
            # Give each child an equal slice of this branch's horizontal span
            dx = (right - left) / len(children)
            nextx = left
            for child in children:
                nextx += dx
                pos = _hierarchy_pos(G, child, nextx - dx, nextx, vert_gap,
                                     vert_loc - vert_gap, pos, root)
        # Place this node in the middle of its span
        pos[root] = ((left + right) / 2, vert_loc)
        return pos

    return _hierarchy_pos(G, root, 0, width, vert_gap, vert_loc, {})

# 1. Load Iris Dataset


iris = load_iris()
X = iris.data
y_true = iris.target

# 2. Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Desired number of clusters


desired_clusters = 3

# 4. Perform Divisive Clustering


labels, hierarchy = divisive_clustering(X_scaled, max_clusters=desired_clusters)

# 5. Evaluate Clustering Performance


silhouette_avg = silhouette_score(X_scaled, labels)
print(f"Silhouette Score: {silhouette_avg:.2f}")

# Optional: Compare with True Labels


cm = confusion_matrix(y_true, labels)
print("Confusion Matrix:")
print(cm)

# 6. Build the Dendrogram Graph


G = build_dendrogram(hierarchy)

# 7. Reduce Dimensions for Visualization


pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# 8. Plot Clusters Using PCA


plt.figure(figsize=(14, 6))

# Subplot 1: Cluster Visualization


plt.subplot(1, 2, 1)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', label='Cluster')
plt.title('Divisive Clustering (PCA Reduced)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(*scatter.legend_elements(), title="Clusters")

# 9. Plot Dendrogram Using NetworkX
plt.subplot(1, 2, 2)
if G.number_of_nodes() > 0:
    root = 0  # Assuming the initial cluster ID is 0
    pos = hierarchy_pos(G, root)
    weights = [G[u][v]['weight'] for u, v in G.edges()]
    # Normalize weights for coloring
    weights_norm = (np.array(weights) - min(weights)) / (max(weights) - min(weights))
    nx.draw(G, pos, with_labels=True, node_size=500, node_color='lightblue',
            edge_color=weights_norm, width=2, edge_cmap=plt.cm.viridis)
    plt.title('Divisive Clustering Dendrogram')
else:
    plt.text(0.5, 0.5, 'No splits performed', horizontalalignment='center', verticalalignment='center')

plt.axis('off')
plt.tight_layout()
plt.show()

Silhouette Score: 0.46


Confusion Matrix:
[[ 0 50 0 0 0]
[ 0 0 0 11 39]
[ 0 0 0 36 14]
[ 0 0 0 0 0]
[ 0 0 0 0 0]]

Fuzzy C-Means Clustering


In [6]: !pip install scikit-fuzzy
import numpy as np
import skfuzzy as fuzz
from skfuzzy import control as ctrl

# Generate some example data


np.random.seed(0)
data = np.random.rand(100, 2)

# Define the number of clusters


n_clusters = 3

# Apply fuzzy c-means clustering


cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
data.T, n_clusters, 2, error=0.005, maxiter=1000, init=None
)

# Predict cluster membership for each data point


cluster_membership = np.argmax(u, axis=0)

# Print the cluster centers


print('Cluster Centers:', cntr)

# Print the cluster membership for each data point


print('Cluster Membership:', cluster_membership)

Collecting scikit-fuzzy
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
Installing collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0
Cluster Centers: [[0.22645397 0.71840176]
[0.52083891 0.18668653]
[0.76252289 0.60239021]]
Cluster Membership: [2 2 0 0 2 2 2 1 0 2 2 0 0 0 1 0 0 0 2 2 1 1 2 1 1 2 1 1 1 1 1 1 0 1 1 2 2
1 1 1 1 0 1 1 2 0 0 1 1 1 1 2 0 2 0 0 1 2 2 2 2 2 0 0 1 2 1 2 2 2 2 0 2 0
2 0 0 0 2 1 2 2 2 0 1 1 1 1 0 1 0 1 2 2 1 1 0 2 1 0]
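The cmeans call above also returns fpc, the fuzzy partition coefficient (closer to 1 means a crisper, better-separated partition), which this cell captures but never uses. A small sketch of using it to compare candidate cluster counts on the same data:

# Compare the fuzzy partition coefficient across candidate cluster counts.
for c in range(2, 6):
    _, _, _, _, _, _, fpc = fuzz.cluster.cmeans(
        data.T, c, 2, error=0.005, maxiter=1000, init=None
    )
    print(f"c = {c}: FPC = {fpc:.3f}")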

In [5]: import numpy as np
import matplotlib.pyplot as plt

def initialize_membership_matrix(n_samples, c):
    """
    Initialize the membership matrix with random values and normalize so that
    the sum of memberships for each data point is 1.
    """
    U = np.random.rand(n_samples, c)
    U = U / np.sum(U, axis=1, keepdims=True)
    return U

def calculate_cluster_centers(X, U, m):
    """
    Calculate the cluster centers as membership-weighted means of the data.
    """
    um = U ** m
    centers = (um.T @ X) / np.sum(um.T, axis=1, keepdims=True)
    return centers

def update_membership_matrix(X, centers, m):
    """
    Update the membership matrix based on the current cluster centers.
    """
    # distance[i, j] = distance from sample i to center j
    distance = np.linalg.norm(X[:, np.newaxis] - centers, axis=2) + 1e-10  # epsilon avoids division by zero
    power = 2 / (m - 1)
    # u_ij = 1 / sum_k (d_ij / d_ik) ** (2 / (m - 1))
    U_new = 1 / np.sum((distance[:, :, np.newaxis] / distance[:, np.newaxis, :]) ** power, axis=2)
    return U_new

def fuzzy_c_means(X, c, m=2, max_iter=100, error=1e-5):
    """
    Perform Fuzzy C-Means clustering on data X.

    Parameters:
    - X: ndarray of shape (n_samples, n_features)
    - c: int, number of clusters
    - m: float, fuzziness parameter
    - max_iter: int, maximum number of iterations
    - error: float, convergence threshold

    Returns:
    - centers: ndarray of shape (c, n_features)
    - U: ndarray of shape (n_samples, c), membership matrix
    """
    n_samples = X.shape[0]
    # Initialize membership matrix
    U = initialize_membership_matrix(n_samples, c)

    for iteration in range(max_iter):
        U_old = U.copy()
        # Calculate cluster centers from the current memberships
        centers = calculate_cluster_centers(X, U, m)
        # Update memberships from the new centers
        U = update_membership_matrix(X, centers, m)

        # Check for convergence
        if np.linalg.norm(U - U_old) < error:
            print(f"Converged at iteration {iteration}")
            break
    else:
        print("Reached maximum iterations without convergence.")

    return centers, U

# Example Usage
if __name__ == "__main__":
    # Generate synthetic data
    np.random.seed(42)
    from sklearn.datasets import make_blobs

    X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

    # Perform Fuzzy C-Means clustering
    c = 4  # Number of clusters
    m = 2  # Fuzziness parameter
    centers, U = fuzzy_c_means(X, c, m)

    # Assign data points to the cluster with the highest membership
    cluster_labels = np.argmax(U, axis=1)

    # Plot the results
    plt.figure(figsize=(8, 6))
    plt.scatter(X[:, 0], X[:, 1], c=cluster_labels, cmap='viridis', marker='o', edgecolor='k', s=50, alpha=0.6)
    plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, label='Centers')
    plt.title("Fuzzy C-Means Clustering")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.show()

Converged at iteration 15
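For reference, the loop above alternates the two standard fuzzy C-means updates, which minimize the objective below (in the code's notation, m is the fuzziness and d_{ij} the distance from sample i to center j):

J_m = \sum_{i=1}^{n} \sum_{j=1}^{c} u_{ij}^{m}\,\lVert x_i - c_j \rVert^2,
\qquad
c_j = \frac{\sum_{i} u_{ij}^{m}\, x_i}{\sum_{i} u_{ij}^{m}},
\qquad
u_{ij} = \frac{1}{\sum_{k=1}^{c} \left( d_{ij} / d_{ik} \right)^{2/(m-1)}}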
