Data Science Exercise Hard
5.1
import numpy as np
import matplotlib.pyplot as plt
import copy
def update_assignments(data, centroids):
    """
    Assign each data point to its nearest centroid (K-means E-step).

    NOTE(review): the original `def` line and the nearest-centroid
    computation were lost in PDF extraction; this body reconstructs the
    standard computation implied by the surrounding code and call sites.

    Parameters:
    data (numpy.ndarray): Data matrix of shape (n_features, n_samples)
    centroids (numpy.ndarray): Centroid matrix of shape (n_features, n_clusters)

    Returns:
    numpy.ndarray: Cluster assignments for each data point
    """
    P = data.shape[1]  # Number of data points
    assignments = []
    for p in range(P):
        # Get pth point
        x_p = data[:, p][:, np.newaxis]
        # Squared Euclidean distance from x_p to every centroid
        # (broadcasting x_p across the n_clusters columns).
        dists = np.sum((centroids - x_p) ** 2, axis=0)
        # Index of the nearest centroid
        assignments.append(int(np.argmin(dists)))
    return np.array(assignments)
def update_centroids(data, old_centroids, assignments):
    """
    Recompute each centroid as the mean of its assigned points (K-means M-step).

    NOTE(review): the original `def` line, mean computation, and return
    statement were lost in PDF extraction; this body reconstructs them.
    An empty cluster keeps its previous centroid (presumably why
    `old_centroids` is passed in — confirm against the original).

    Parameters:
    data (numpy.ndarray): Data matrix of shape (n_features, n_samples)
    old_centroids (numpy.ndarray): Current centroid matrix
    assignments (numpy.ndarray): Cluster assignments for each point

    Returns:
    numpy.ndarray: Updated centroids
    """
    K = old_centroids.shape[1]  # Number of clusters
    centroids = []
    for k in range(K):
        # Get indices of points assigned to cluster k
        S_k = np.argwhere(assignments == k)
        if S_k.size > 0:
            # Mean of the cluster's points, kept as a column vector
            c_k = np.mean(data[:, S_k.flatten()], axis=1)[:, np.newaxis]
        else:
            # Empty cluster: retain the previous centroid
            c_k = old_centroids[:, k][:, np.newaxis]
        centroids.append(c_k)
    # Stack the K column vectors back into an (n_features, K) matrix
    return np.hstack(centroids)
2
# Fragment: labelling/display tail of the raw "blobs" dataset scatter plot
# (the preceding plt.figure/plt.scatter calls were lost in extraction).
plt.title('Blobs Dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid(True)
plt.show()
# Fragment: tail of the main K-means iteration loop. The loop header and the
# per-iteration update_assignments call were lost in extraction; each pass
# updates the centroids and snapshots them into all_centroids for later
# visualisation of the trajectory.
# Update centroids
centroids = update_centroids(data, centroids, assignments)
all_centroids.append(centroids.copy())
# Final assignments
final_assignments = update_assignments(data, centroids)
3
# Visualize final clustering
plt.figure(figsize=(10, 8))
# Data points coloured by their final cluster assignment
plt.scatter(data[0, :], data[1, :], c=final_assignments, cmap='viridis')
# Learned centroids drawn as large red stars on top of the points
plt.scatter(centroids[0, :], centroids[1, :], c='red', marker='*', s=300)
plt.title('Final K-means Clustering (K=3)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid(True)
plt.show()
4
5
6
7
8
9
5.2
# Fragment: body of a helper that averages the Euclidean distance from each
# point to its assigned centroid (the intra-cluster objective used for the
# scree plot). The enclosing `def` line and the initialisation of P and
# total_distance were lost in extraction -- TODO recover from the original.
for p in range(P):
# Get the point and its assigned centroid
x_p = data[:, p][:, np.newaxis]
c_k = centroids[:, assignments[p]][:, np.newaxis]
# stray page number left over from PDF extraction
10
# Calculate distance
dist = np.sqrt(np.sum((x_p - c_k)**2))
total_distance += dist
return total_distance / P
# Fragment of run_kmeans (called below as run_kmeans(data, k, max_its, n_runs)):
# restarts K-means n_runs times from random initialisations, alternating the
# assignment and centroid-update steps max_its times per run. The best-run
# bookkeeping and the 3-tuple return were lost in extraction.
for _ in range(n_runs):
# Initialize centroids by sampling K distinct data points
indices = np.random.choice(data.shape[1], K, replace=False)
centroids = data[:, indices]
# Run K-means
for _ in range(max_its):
assignments = update_assignments(data, centroids)
centroids = update_centroids(data, centroids, assignments)
# Final assignments
assignments = update_assignments(data, centroids)
# Scree-plot loop: for each candidate number of clusters k, run K-means
# (3 random restarts, 5 iterations each) and record the resulting mean
# intra-cluster distance for the elbow analysis below.
for k in k_range:
_, _, distance = run_kmeans(data, k, max_its=5, n_runs=3)
# stray page number left over from PDF extraction
11
distances.append(distance)
Looking at the scree plot, we can identify the optimal number of clusters by finding the “elbow
point” - the point where adding more clusters doesn’t significantly decrease the intra-cluster
distance.
For this dataset with 3 visible blobs, we would expect to see a significant decrease in distance
when moving from K=1 to K=3, and then a much smaller decrease afterward. This creates
an “elbow” in the plot at K=3, indicating that this is the optimal number of
clusters (the correct value for K).
12