Clustering
from sklearn.cluster import KMeans

# Initialize KMeans with 3 clusters and a fixed random seed
kmeans = KMeans(n_clusters=3, random_state=0)
K-medoids Clustering
In [4]: import numpy as np
from sklearn.metrics import pairwise_distances
from random import sample

# Medoid-update step: for each cluster, choose the member with the
# smallest total distance to all other members
new_medoids = []
for cluster in clusters:
    if len(cluster) == 0:
        continue
    # Sum of pairwise distances from each member to every other member
    distances_sum = np.sum(pairwise_distances(X[cluster], X[cluster]), axis=1)
    new_medoid = cluster[np.argmin(distances_sum)]
    new_medoids.append(new_medoid)
medoids = new_medoids
# Example usage:
if __name__ == "__main__":
    from sklearn.datasets import make_blobs
    import matplotlib.pyplot as plt
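The cell above shows only the medoid-update step. As a rough end-to-end sketch (the function name k_medoids and the stopping rule are my own choices, not from the notebook), the full alternating assign/update loop could look like this:

import numpy as np
from random import sample
from sklearn.metrics import pairwise_distances

def k_medoids(X, k, max_iter=100):
    # Start from k distinct data points chosen at random
    medoids = sample(range(X.shape[0]), k)
    labels = None
    for _ in range(max_iter):
        # Assignment step: attach each point to its nearest medoid
        labels = np.argmin(pairwise_distances(X, X[medoids]), axis=1)
        clusters = [np.where(labels == i)[0] for i in range(len(medoids))]
        # Update step: same rule as the cell above (empty clusters are dropped)
        new_medoids = []
        for cluster in clusters:
            if len(cluster) == 0:
                continue
            distances_sum = np.sum(pairwise_distances(X[cluster], X[cluster]), axis=1)
            new_medoids.append(int(cluster[np.argmin(distances_sum)]))
        if set(new_medoids) == set(medoids):  # medoids unchanged -> converged
            break
        medoids = new_medoids
    return np.array(medoids), labels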
K-means++
In [4]: from sklearn.cluster import KMeans
import numpy as np

# Data reconstructed to reproduce the output below
# (it matches the classic scikit-learn KMeans example)
X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])

# init='k-means++' is the default seeding strategy
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

print("Cluster Centers:")
print(kmeans.cluster_centers_)
print("Labels:")
print(kmeans.labels_)
Cluster Centers:
[[10.  2.]
 [ 1.  2.]]
Labels:
[1 1 1 0 0 0]
# Plot the members of cluster k in colour `col` (fragment from a larger plotting loop)
class_member_mask = (labels == k)
xy = X[class_member_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
         markeredgecolor='k', markersize=6)
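scikit-learn seeds KMeans with k-means++ by default: each new centre is sampled with probability proportional to its squared distance from the nearest centre chosen so far. The function below is my own illustration of that seeding step, not scikit-learn's implementation:

import numpy as np

def kmeans_pp_init(X, k, seed=0):
    rng = np.random.default_rng(seed)
    # First centre: a uniformly random data point
    centers = [X[rng.integers(X.shape[0])]]
    for _ in range(k - 1):
        # Squared distance from each point to its nearest chosen centre
        d2 = np.min(((X[:, None, :] - np.array(centers)[None, :, :]) ** 2).sum(axis=2), axis=1)
        # Sample the next centre with probability proportional to d^2
        centers.append(X[rng.choice(X.shape[0], p=d2 / d2.sum())])
    return np.array(centers)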
Hierarchical Clustering
In [1]: import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_score, confusion_matrix

# Load the Iris Data
X, y = load_iris(return_X_y=True)

# Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Compute the Linkage Matrix (Ward linkage assumed; the method is not shown in the source)
Z = linkage(X_scaled, method='ward')
# Plot Dendrogram
plt.figure(figsize=(10, 7))
dendrogram(Z, truncate_mode='level', p=3)
plt.title('Agglomerative Hierarchical Clustering Dendrogram (Iris Dataset)')
plt.xlabel('Sample Index or (Cluster Size)')
plt.ylabel('Distance')
plt.show()
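fcluster, silhouette_score, and confusion_matrix are imported above but never used in the cell. A natural follow-up (my own sketch, not shown in the notebook) is to cut the dendrogram into flat clusters and score them:

# Cut the tree into 3 flat clusters (Iris has 3 species)
labels = fcluster(Z, t=3, criterion='maxclust')

# Internal validation: silhouette ranges from -1 (poor) to 1 (good)
print("Silhouette:", silhouette_score(X_scaled, labels))

# External validation against the true species labels
# (fcluster labels start at 1, hence the shift)
print(confusion_matrix(y, labels - 1))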
# 2. Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Fit Agglomerative Clustering (reconstructed step; single linkage assumed,
# matching the dendrogram title below)
from sklearn.cluster import AgglomerativeClustering
agglom = AgglomerativeClustering(n_clusters=3, linkage='single')
agglom.fit(X_scaled)

# 5. Cluster Labels
labels = agglom.labels_

# Linkage Matrix (reconstructed step; single linkage, for the dendrogram below)
Z = linkage(X_scaled, method='single')

# 8. Plot Dendrogram
plt.figure(figsize=(10, 7))
dendrogram(Z, truncate_mode='level', p=3)
plt.title('Agglomerative Hierarchical Clustering Dendrogram (Single Linkage)')
plt.xlabel('Sample Index or (Cluster Size)')
plt.ylabel('Distance')
plt.show()
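Single linkage tends to produce chained, straggly clusters on data like Iris. A quick comparison of linkage strategies (my own sketch, reusing the imports above):

for method in ('single', 'complete', 'average', 'ward'):
    agg = AgglomerativeClustering(n_clusters=3, linkage=method)
    pred = agg.fit_predict(X_scaled)
    print(f"{method:>8}: silhouette = {silhouette_score(X_scaled, pred):.3f}")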
def divisive_clustering(X, max_clusters):  # function name reconstructed; the def line is missing in the source
    """
    Divisive (top-down) hierarchical clustering.

    Parameters:
    - X: ndarray of shape (n_samples, n_features)
    - max_clusters: int, desired number of clusters

    Returns:
    - labels: ndarray of shape (n_samples,), cluster labels
    """
    # Initialize all points in one cluster
    clusters = {0: np.arange(X.shape[0])}
    current_cluster = 0
    next_cluster_id = 1

    # (splitting loop elided in the source; see the sketch below)

    # Assign labels
    labels_final = np.zeros(X.shape[0], dtype=int)
    for cluster_id, indices in clusters.items():
        labels_final[indices] = cluster_id
    return labels_final
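One way to realise the missing splitting loop, consistent with the single-linkage variant plotted below (the split rule, repeatedly bisecting the largest cluster, is my own choice):

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

def divisive_single_linkage(X, max_clusters):
    clusters = {0: np.arange(X.shape[0])}
    next_id = 1
    while len(clusters) < max_clusters:
        # Heuristic: split the largest remaining cluster
        cid = max(clusters, key=lambda c: len(clusters[c]))
        idx = clusters[cid]
        if len(idx) < 2:
            break
        # Bisect it by cutting its single-linkage tree into two groups
        Z = linkage(X[idx], method='single')
        halves = fcluster(Z, t=2, criterion='maxclust')
        clusters[cid] = idx[halves == 1]
        clusters[next_id] = idx[halves == 2]
        next_id += 1
    labels = np.zeros(X.shape[0], dtype=int)
    for cid, idx in clusters.items():
        labels[idx] = cid
    return labels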
# 2. Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce to 2-D with PCA for plotting (reconstructed step; the source skips from step 2 to step 7)
from sklearn.decomposition import PCA
X_pca = PCA(n_components=2).fit_transform(X_scaled)

# 7. Plot Clusters
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', label='Cluster')
plt.title('Divisive Clustering with Single Linkage (PCA Reduced)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.show()
def divisive_clustering_with_hierarchy(X, max_clusters):  # name reconstructed; the def line is missing in the source
    """
    Divisive clustering that also records every split.

    Parameters:
    - X: ndarray of shape (n_samples, n_features)
    - max_clusters: int, desired number of clusters

    Returns:
    - labels: ndarray of shape (n_samples,), cluster labels
    - hierarchy: list of tuples, each representing a split in the format (parent, child1, child2, distance)
    """
    # Initialize all points in one cluster
    clusters = {0: np.arange(X.shape[0])}
    hierarchy = []
    current_cluster_id = 0
    next_cluster_id = 1

    # (splitting loop elided in the source; each split should append
    # (parent, child1, child2, distance) to `hierarchy`)

    # Assign labels
    labels_final = np.zeros(X.shape[0], dtype=int)
    for cluster_id, indices in clusters.items():
        labels_final[indices] = cluster_id
    return labels_final, hierarchy
import networkx as nx

def build_dendrogram(hierarchy):
    """
    Build a dendrogram-like graph from the hierarchy of splits.

    Parameters:
    - hierarchy: list of tuples, each representing a split in the format (parent, child1, child2, distance)

    Returns:
    - G: NetworkX graph representing the dendrogram
    """
    G = nx.Graph()
    # Connect each parent cluster to the two children it was split into
    for parent, child1, child2, distance in hierarchy:
        G.add_edge(parent, child1, weight=distance)
        G.add_edge(parent, child2, weight=distance)
    return G
def hierarchy_pos(G, root, width=1.0, vert_gap=0.2, vert_loc=0.0):
    """
    Compute node positions for drawing the dendrogram graph top-down.

    Parameters:
    - G: NetworkX graph
    - root: the root node of current branch
    - width: horizontal space allocated for this branch
    - vert_gap: gap between levels of hierarchy
    - vert_loc: vertical location of root

    Returns:
    - pos: dict mapping nodes to positions
    """
    def _hierarchy_pos(G, root, left, right, vert_gap, vert_loc, pos, parent=None):
        children = list(G.neighbors(root))
        if parent is not None:
            children.remove(parent)
        if len(children) != 0:
            # Give each child an equal share of this branch's horizontal span
            dx = (right - left) / len(children)
            nextx = left
            for child in children:
                nextx += dx
                pos = _hierarchy_pos(G, child, nextx - dx, nextx, vert_gap, vert_loc - vert_gap, pos, root)
        # Centre the root over its span
        pos[root] = ((left + right) / 2, vert_loc)
        return pos

    pos = {}
    pos = _hierarchy_pos(G, root, 0, width, vert_gap, vert_loc, pos)
    return pos
# 2. Standardize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
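# (reconstructed, assumed) The steps between standardising and plt.axis('off')
# are missing in the source; with the helpers defined above they would be:
labels, hierarchy = divisive_clustering_with_hierarchy(X_scaled, max_clusters=3)
G = build_dendrogram(hierarchy)
pos = hierarchy_pos(G, root=0)  # cluster 0 is the root of the split tree
nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=500)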
plt.axis('off')
plt.tight_layout()
plt.show()
Collecting scikit-fuzzy
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
Installing collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0
Cluster Centers: [[0.22645397 0.71840176]
[0.52083891 0.18668653]
[0.76252289 0.60239021]]
Cluster Membership: [2 2 0 0 2 2 2 1 0 2 2 0 0 0 1 0 0 0 2 2 1 1 2 1 1 2 1 1 1 1 1 1 0 1 1 2 2
1 1 1 1 0 1 1 2 0 0 1 1 1 1 2 0 2 0 0 1 2 2 2 2 2 0 0 1 2 1 2 2 2 2 0 2 0
2 0 0 0 2 1 2 2 2 0 1 1 1 1 0 1 0 1 2 2 1 1 0 2 1 0]
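Output like the above can be produced with scikit-fuzzy's cmeans routine. The cell below is a plausible reconstruction, not the notebook's exact code: the data generation is my assumption, and note that cmeans expects data of shape (n_features, n_samples):

import numpy as np
import skfuzzy as fuzz

np.random.seed(0)
data = np.random.rand(100, 2)  # assumed: 100 random 2-D points in [0, 1]

cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    data.T, c=3, m=2, error=0.005, maxiter=1000, init=None)

print("Cluster Centers:", cntr)
# Hard assignment: the cluster with the highest membership per sample
print("Cluster Membership:", np.argmax(u, axis=0))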
def fuzzy_c_means(X, c, m=2.0, max_iter=100, error=1e-5):  # name and defaults reconstructed; def line missing in source
    """
    Fuzzy C-Means clustering.

    Parameters:
    - X: ndarray of shape (n_samples, n_features)
    - c: int, number of clusters
    - m: float, fuzziness parameter
    - max_iter: int, maximum number of iterations
    - error: float, convergence threshold

    Returns:
    - centers: ndarray of shape (c, n_features)
    - U: ndarray of shape (n_samples, c), membership matrix
    """
    n_samples = X.shape[0]
    # Initialize membership matrix
    U = initialize_membership_matrix(n_samples, c)
    # (iteration loop reconstructed: standard FCM centre/membership updates)
    for iteration in range(max_iter):
        Um = U ** m
        # Centers are membership-weighted means of the data
        centers = (Um.T @ X) / Um.sum(axis=0)[:, None]
        # Memberships from inverse relative distances to the new centers
        dist = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        dist = np.fmax(dist, 1e-10)  # avoid division by zero
        U_new = 1.0 / np.sum((dist[:, :, None] / dist[:, None, :]) ** (2 / (m - 1)), axis=2)
        delta = np.linalg.norm(U_new - U)
        U = U_new
        if delta < error:
            print(f"Converged at iteration {iteration}")
            break
    return centers, U
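The helper initialize_membership_matrix is called above but not shown. A standard choice (my own sketch) is a random row-stochastic matrix:

def initialize_membership_matrix(n_samples, c):
    # Random memberships, normalised so each sample's memberships sum to 1
    U = np.random.rand(n_samples, c)
    return U / U.sum(axis=1, keepdims=True)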
# Example Usage
if __name__ == "__main__":
    # Generate synthetic data
    np.random.seed(42)
    from sklearn.datasets import make_blobs
    # (reconstructed, assumed) the actual calls are missing in the source
    X, _ = make_blobs(n_samples=300, centers=3, random_state=42)
    centers, U = fuzzy_c_means(X, c=3)
Converged at iteration 15