0% found this document useful (0 votes)
3 views12 pages

Data Science Exercise Hard

The document outlines a K-means clustering implementation in Python, including functions to update assignments and centroids, visualize clustering iterations, and calculate intra-cluster distances. It runs K-means multiple times to find the best clustering results and generates a scree plot to identify the optimal number of clusters. The analysis concludes that the optimal number of clusters for the dataset is 3, as indicated by the elbow point in the scree plot.

Uploaded by

kevinliangisfat
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
3 views12 pages

Data Science Exercise Hard

The document outlines a K-means clustering implementation in Python, including functions to update assignments and centroids, visualize clustering iterations, and calculate intra-cluster distances. It runs K-means multiple times to find the best clustering results and generates a scree plot to identify the optimal number of clusters. The analysis concludes that the optimal number of clusters for the dataset is 3, as indicated by the elbow point in the scree plot.

Uploaded by

kevinliangisfat
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 12

HW4 Exercise 5

DATASCI 3ML3 Winter 2025

Mithun Manivannan: 400309374

5.1

import numpy as np
import matplotlib.pyplot as plt
import copy

def update_assignments(data, centroids):
    """
    Assign each data point to the closest centroid.

    Parameters:
    data (numpy.ndarray): Data matrix of shape (n_features, n_samples)
    centroids (numpy.ndarray): Centroid matrix of shape (n_features, n_clusters)

    Returns:
    numpy.ndarray: Integer array of length n_samples; entry p is the column
        index of the centroid closest (in squared Euclidean distance) to
        data[:, p]. Ties go to the lowest centroid index.
    """
    data = np.asarray(data)
    centroids = np.asarray(centroids)

    # Broadcast points against centroids in one shot instead of looping over
    # points: the difference tensor has shape
    # (n_features, n_clusters, n_samples).
    diffs = data[:, np.newaxis, :] - centroids[:, :, np.newaxis]

    # Squared distances, shape (n_clusters, n_samples). The square root is
    # skipped because it does not change which centroid is closest.
    sq_dists = np.sum(diffs ** 2, axis=0)

    # argmin returns the first minimum, so ties resolve to the lowest index.
    return np.argmin(sq_dists, axis=0)

def update_centroids(data, old_centroids, assignments):
    """
    Update centroid locations based on assigned points.

    Parameters:
    data (numpy.ndarray): Data matrix of shape (n_features, n_samples)
    old_centroids (numpy.ndarray): Current centroid matrix of shape
        (n_features, n_clusters)
    assignments (array-like): Cluster index for each point, length n_samples

    Returns:
    numpy.ndarray: Updated centroids of shape (n_features, n_clusters). A
        cluster with no assigned points keeps its old centroid. The input
        arrays are not modified.
    """
    # Coerce to an array so that `assignments == k` is always element-wise;
    # with a plain list the comparison collapses to a single False and every
    # cluster would silently look empty.
    assignments = np.asarray(assignments).ravel()
    K = old_centroids.shape[1]  # Number of clusters

    columns = []
    for k in range(K):
        members = assignments == k  # Boolean mask of points in cluster k

        if members.any():
            # New centroid is the mean of the points assigned to cluster k
            columns.append(np.mean(data[:, members], axis=1))
        else:
            # If no points in cluster, keep old centroid
            columns.append(old_centroids[:, k])

    if not columns:
        # No clusters at all: nothing to update, return an independent copy
        return np.array(old_centroids, copy=True)

    # Stack the per-cluster vectors as columns -> (n_features, n_clusters)
    return np.stack(columns, axis=1)

# Load the blobs dataset. The rest of the script treats rows as features and
# columns as samples (data[0, :] and data[1, :] are plotted as the two
# features, data.shape[1] is used as the number of points).
data = np.loadtxt('blobs.dat')

# Visualize the raw, unclustered dataset
plt.figure(figsize=(8, 6))
plt.scatter(data[0, :], data[1, :], c='black')
plt.title('Blobs Dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid(True)
plt.show()

# Initialize K centroids by picking K distinct random data points
np.random.seed(42)  # For reproducibility
K = 3
indices = np.random.choice(data.shape[1], K, replace=False)
init_centroids = data[:, indices]

# Run K-means for 5 iterations
centroids = init_centroids.copy()
max_its = 5

# Store assignments and centroids at each iteration for visualization.
# all_centroids starts with the initial centroids, so it ends up one entry
# longer than all_assignments.
all_assignments = []
all_centroids = [centroids.copy()]

for iteration in range(max_its):
    # Update assignments
    assignments = update_assignments(data, centroids)
    all_assignments.append(assignments)

    # Update centroids
    centroids = update_centroids(data, centroids, assignments)
    all_centroids.append(centroids.copy())

    # Visualize current iteration: points colored by cluster, centroids as
    # red stars at their freshly updated positions
    plt.figure(figsize=(8, 6))
    plt.scatter(data[0, :], data[1, :], c=assignments, cmap='viridis')
    plt.scatter(centroids[0, :], centroids[1, :], c='red', marker='*', s=200)
    plt.title(f'K-means Clustering - Iteration {iteration+1}')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.grid(True)
    plt.show()

# Final assignments: re-assign once more so the labels match the centroids
# produced by the last update step
final_assignments = update_assignments(data, centroids)

# Visualize final clustering
plt.figure(figsize=(10, 8))
plt.scatter(data[0, :], data[1, :], c=final_assignments, cmap='viridis')
plt.scatter(centroids[0, :], centroids[1, :], c='red', marker='*', s=300)
plt.title(f'Final K-means Clustering (K={K})')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid(True)
plt.show()

4
5
6
7
8
9
5.2

def calculate_intra_cluster_distance(data, centroids, assignments):
    """
    Calculate average distance from points to their assigned centroids.

    Parameters:
    data (numpy.ndarray): Data matrix of shape (n_features, n_samples)
    centroids (numpy.ndarray): Centroid matrix of shape (n_features, n_clusters)
    assignments (array-like): Cluster index for each point, length n_samples

    Returns:
    float: Mean Euclidean distance between each point and the centroid it is
        assigned to.

    Raises:
    ValueError: If data contains no samples (the average is undefined).
    """
    P = data.shape[1]
    if P == 0:
        raise ValueError("data must contain at least one sample")

    # Column p of `assigned` is the centroid that point p belongs to, so the
    # subtraction below lines every point up with its own centroid.
    assigned = centroids[:, np.asarray(assignments)]

    # Euclidean distance per point (reduce over the feature axis)
    dists = np.sqrt(np.sum((data - assigned) ** 2, axis=0))

    return float(np.mean(dists))

def run_kmeans(data, K, max_its=5, n_runs=3):
    """
    Run K-means multiple times and return best result.

    Each run starts from K distinct data points chosen at random (using the
    global NumPy random state) and performs at most max_its update steps.
    The run with the smallest average intra-cluster distance wins.

    Parameters:
    data (numpy.ndarray): Data matrix of shape (n_features, n_samples)
    K (int): Number of clusters
    max_its (int): Maximum number of assignment/centroid updates per run
    n_runs (int): Number of random restarts

    Returns:
    tuple: (best_centroids, best_assignments, best_distance). If n_runs is 0
        this is (None, None, inf).
    """
    best_distance = float('inf')
    best_centroids = None
    best_assignments = None

    for _run in range(n_runs):
        # Initialize centroids from K distinct random data points
        indices = np.random.choice(data.shape[1], K, replace=False)
        centroids = data[:, indices]

        # Run K-means
        previous = None
        for _it in range(max_its):
            assignments = update_assignments(data, centroids)

            # Converged: identical assignments would reproduce the same
            # centroids, so every remaining iteration would be a no-op.
            if previous is not None and np.array_equal(assignments, previous):
                break

            centroids = update_centroids(data, centroids, assignments)
            previous = assignments

        # Final assignments, consistent with the last centroid update (also
        # covers max_its == 0, where the loop above never runs)
        assignments = update_assignments(data, centroids)

        # Calculate intra-cluster distance
        distance = calculate_intra_cluster_distance(data, centroids, assignments)

        # Keep best result
        if distance < best_distance:
            best_distance = distance
            best_centroids = centroids
            best_assignments = assignments

    return best_centroids, best_assignments, best_distance

# Generate scree plot: best average intra-cluster distance for K = 1..10
k_range = range(1, 11)
distances = []

for k in k_range:
    # Only the distance of the best of the 3 runs is needed here
    _, _, distance = run_kmeans(data, k, max_its=5, n_runs=3)
    distances.append(distance)

# Plot scree plot
plt.figure(figsize=(10, 6))
plt.plot(k_range, distances, 'bo-')
plt.title('Scree Plot: Intra-cluster Distance vs. K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Average Intra-cluster Distance')
plt.grid(True)
plt.xticks(k_range)
plt.show()

Looking at the scree plot, we can identify the optimal number of clusters by finding the “elbow
point” - the point where adding more clusters doesn’t significantly decrease the intra-cluster
distance.
For this dataset with 3 visible blobs, we would expect to see a significant decrease in distance
when moving from K=1 to K=3, and then a much smaller decrease afterward. This creates
an “elbow” in the plot at K=3, indicating that this is the optimal number of
clusters (i.e., the correct value for K).

12

You might also like