
introclusterproblem

April 27, 2024

Intradata Face Clustering: an introduction to "unsupervised" machine learning problems,
using this approach to solve the problem at hand:
1. How to group similar people.
[ ]: import pandas as pd
import numpy as np
import seaborn as sns

from pickle import load


from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs

from sklearn import cluster


from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer

from warnings import filterwarnings

filterwarnings('ignore')

#plt.rcParams['figure.figsize'] = [13, 6]
#plt.rcParams['font.size'] = 13

[ ]: from umap.umap_ import UMAP


from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
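The dimensionality-reduction imports above (UMAP, TSNE, PCA, LinearDiscriminantAnalysis) are not used in the cells that follow. A minimal sketch, assuming a hypothetical `embeddings` array standing in for high-dimensional face embeddings, of how UMAP or PCA could project such data to 2-D before clustering:

[ ]: # Hypothetical placeholder for face embeddings (e.g. 128-D vectors from a face encoder).
embeddings = np.random.rand(500, 128)

# Linear projection to 2-D with PCA.
emb_pca = PCA(n_components=2).fit_transform(embeddings)

# Non-linear projection to 2-D with UMAP.
emb_umap = UMAP(n_components=2, random_state=42).fit_transform(embeddings)

print(emb_pca.shape, emb_umap.shape)  # (500, 2) (500, 2)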

0.0.1 1.0. Artificial Datasets


1.1. Make Dataset
[ ]: #np.random.seed(7)
# Random sample with overlapping and grouped points

X, Y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=None,
    return_centers=False,
)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))


sns.scatterplot(x=X[:,0], y=X[:,1], hue=Y, ax=ax[0], palette=['red', 'blue'])
ax[1].hist(X[:,0], label='0', color='red');
ax[1].hist(X[:,1], label='1', color='blue');
ax[0].set_title('Clusters for 2-D');
ax[1].set_title('Hists for 2-D');
ax[1].legend();

0.0.2 1.2. Apply Cluster Models


1.2.1. KMeans + Metrics
[ ]: clusters = [2, 3, 4, 5, 6]

kmeans = KElbowVisualizer(cluster.KMeans(), k=clusters, metric='silhouette')


kmeans.fit(X)
ax1 = kmeans.show();
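As a cross-check on the elbow visualizer, the average silhouette score for each k can also be computed directly; a minimal sketch using scikit-learn's `silhouette_score`:

[ ]: from sklearn.metrics import silhouette_score

for k in clusters:
    labels = cluster.KMeans(n_clusters=k, n_init=10).fit_predict(X)
    print(f'k={k}: silhouette = {silhouette_score(X, labels):.3f}')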

[ ]: fig, ax = plt.subplots(2, 2, figsize=(8, 6))
ax = ax.flatten()

# Note: clusters holds five values but the 2x2 grid has four axes, so zip stops at k=5.
for k, i in zip(clusters, ax):

    kmeans = cluster.KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=10)
    kmeans.fit(X)

    i.scatter(x=X[:,0], y=X[:,1], c=kmeans.labels_, cmap='Dark2')
    i.set_title(f'Number of Clusters: {k}')

plt.tight_layout()
[ ]: # Predict closest Cluster for new User Image.
kmeans.predict(X)

[ ]: array([3, 4, 2, 4, 4, 1, 1, 3, 4, 0, 0, 0, 2, 0, 0, 1, 3, 1, 2, 0, 2, 0,
1, 2, 4, 1, 4, 4, 0, 1, 0, 2, 0, 0, 2, 3, 4, 4, 0, 3, 4, 1, 1, 4,
3, 1, 0, 0, 3, 3, 2, 2, 3, 4, 4, 1, 4, 4, 0, 0, 2, 4, 2, 4, 4, 0,
1, 0, 2, 2, 4, 0, 0, 0, 3, 1, 2, 2, 3, 2, 3, 1, 3, 0, 0, 0, 0, 1,
2, 0, 3, 1, 4, 1, 4, 4, 3, 2, 3, 2], dtype=int32)
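The cell above re-assigns the original training points; `predict` works the same way on genuinely unseen samples. A minimal sketch with two hypothetical new points in the same 2-D feature space (the fitted `kmeans` here is the last model from the loop above):

[ ]: # Hypothetical new samples standing in for embeddings of new user images.
new_points = np.array([[0.0, 0.0], [5.0, 5.0]])
kmeans.predict(new_points)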

[ ]: fig, ax = plt.subplots(2, 2, figsize=(10, 10))
ax = ax.flatten()

for k, i in zip(clusters, ax):

    kmeans = cluster.KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=10)

    viz = SilhouetteVisualizer(kmeans, ax=i)
    viz.fit(X)
    viz.finalize()

plt.tight_layout()

1.2.2. Shared Nearest Neighbors (SNN)


[ ]: from SharedNearestNeighbors.shared_nearest_neighbors import SNN

eps = [3, 4, 5, 6]

fig, ax = plt.subplots(2, 2)
ax = ax.flatten()

for ep, i in zip(eps, range(len(eps))):

    snn = SNN(
        n_neighbors=7,
        eps=ep,
        min_samples=2,
        algorithm="auto",
        leaf_size=30,
        metric="euclidean",
        p=None,
        metric_params=None,
    ).fit(X)

    sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn.labels_, palette='inferno',
                    size=snn.labels_, ax=ax[i])
    ax[i].set_title(f"Eps: {ep}")
    print(f'For {ep}, size is: {np.unique(snn.labels_).shape}')

For 3, size is: (1,)
For 4, size is: (5,)
For 5, size is: (15,)
For 6, size is: (25,)

[ ]: snn = SNN(
    n_neighbors=8,
    eps=7,
    min_samples=1,
    algorithm="auto",
    leaf_size=30,
    metric="euclidean",
    p=None,
    metric_params=None,
).fit(X)

sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn.labels_, palette='inferno',
                size=snn.labels_)

print(f'Unique Clusters: {np.unique(snn.labels_).shape}')

Unique Clusters: (54,)

1.2.3. Shared Nearest Neighbors 2


[ ]: # Note: this call matches the manually defined SNN class in section 1.2.6
# (neighbor_num / min_shared_neighbor_proportion), not the package SNN used above.
snn_model = SNN(neighbor_num=1, min_shared_neighbor_proportion=1).fit(X)

sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn_model.labels_, palette='inferno',
                size=snn_model.labels_)

[ ]: <AxesSubplot: >

[ ]: np.unique(snn_model.labels_).shape

[ ]: (71,)

1.2.4. OPTICS
[ ]: opt = cluster.OPTICS(
    min_samples=2,
    max_eps=np.inf,
    metric="euclidean",
    p=2,
    metric_params=None,
    cluster_method="xi",
    eps=1,
    xi=0.05,
    predecessor_correction=True,
    min_cluster_size=None,
    algorithm="auto",
    leaf_size=30,
    memory=None,
).fit(X)

sns.scatterplot(x=X[:,0], y=X[:,1], hue=opt.labels_, palette='inferno',
                size=opt.labels_)

np.unique(opt.labels_).shape

[ ]: (28,)
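The xi extraction above produces many small clusters. scikit-learn's `cluster_optics_dbscan` can cut the same fitted reachability plot at a DBSCAN-style eps without refitting; a minimal sketch with an illustrative, untuned eps:

[ ]: from sklearn.cluster import cluster_optics_dbscan

labels_eps = cluster_optics_dbscan(
    reachability=opt.reachability_,
    core_distances=opt.core_distances_,
    ordering=opt.ordering_,
    eps=2.0,  # illustrative value, not tuned
)
np.unique(labels_eps).shape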

1.2.5. DBSCAN
[ ]: dbs = cluster.DBSCAN(
    eps=0.001,
    min_samples=1,
    metric="canberra",
    metric_params=None,
    algorithm="auto",
    leaf_size=30,
    p=None,
    n_jobs=None,
).fit(X)

sns.scatterplot(x=X[:,0], y=X[:,1], hue=dbs.labels_, palette='inferno',
                size=dbs.labels_)

np.unique(dbs.labels_).shape

[ ]: (100,)
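With eps=0.001 every one of the 100 points ends up in its own cluster. A common heuristic for picking eps is to look for a knee in the sorted k-nearest-neighbor distances; a minimal sketch using the Euclidean metric (the Canberra distances used above would sit on a different scale):

[ ]: from sklearn.neighbors import NearestNeighbors

# Distance from each point to its 4th nearest neighbor (column 0 is the point itself), sorted.
dists, _ = NearestNeighbors(n_neighbors=5).fit(X).kneighbors(X)
plt.plot(np.sort(dists[:, -1]))
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to 4th nearest neighbor');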

1.2.6. Manual Configs (reference SNN implementation)


[ ]: # Manual config for the Shared Nearest Neighbors (SNN) clustering class.
#from sklearn.cluster import DBSCAN
#from sklearn.neighbors import kneighbors_graph
#
#import numpy as np
#from sklearn.base import BaseEstimator, ClusterMixin
#
#def get_snn_similarity(x0, x1):
#    """Calculate the shared-neighbor similarity of two sets of nearest neighbors,
#    normalized by the maximum number of shared neighbors."""
#    return len(x0.intersection(x1)) / len(x0)
#
#
#def get_snn_distance(x0, x1):
#    """Calculate the shared-neighbor distance of two sets of nearest neighbors,
#    normalized by the maximum number of shared neighbors."""
#    return 1 - get_snn_similarity(x0, x1)
#
#
#def snn(X, neighbor_num, min_shared_neighbor_num):
#    """Perform Shared Nearest Neighbor (SNN) clustering.
#
#    Parameters
#    ----------
#    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or
#        array of shape (n_samples, n_samples)
#        A feature array
#    neighbor_num : int
#        K number of neighbors to consider for shared nearest neighbor similarity
#    min_shared_neighbor_num : int
#        Number of nearest neighbors that two data points need to share to be
#        considered part of the same cluster
#    """
#    # For each data point, find its set of K nearest neighbors.
#    knn_graph = kneighbors_graph(X, n_neighbors=neighbor_num, include_self=False)
#    neighbors = np.array([set(knn_graph[i].nonzero()[1]) for i in range(len(X))])
#
#    # The distance matrix is the complement of the proportion of shared
#    # neighbors between each pair of data points.
#    snn_distance_matrix = np.asarray(
#        [[get_snn_distance(neighbors[i], neighbors[j]) for j in range(len(neighbors))]
#         for i in range(len(neighbors))])
#
#    # Perform DBSCAN with the shared-neighbor distance criterion for density estimation.
#    dbscan = DBSCAN(min_samples=min_shared_neighbor_num, metric="precomputed")
#    dbscan = dbscan.fit(snn_distance_matrix)
#    return dbscan.core_sample_indices_, dbscan.labels_
#
#
#class SNN(BaseEstimator, ClusterMixin):
#    """Class for performing the Shared Nearest Neighbor (SNN) clustering algorithm.
#
#    Parameters
#    ----------
#    neighbor_num : int
#        K number of neighbors to consider for shared nearest neighbor similarity
#    min_shared_neighbor_proportion : float [0, 1]
#        Proportion of the K nearest neighbors that two data points need to share
#        to be considered part of the same cluster
#
#    Note: naming conventions for attributes are based on the analogous ones of DBSCAN.
#    """
#
#    def __init__(self, neighbor_num, min_shared_neighbor_proportion):
#        """Constructor"""
#        self.neighbor_num = neighbor_num
#        self.min_shared_neighbor_num = round(neighbor_num * min_shared_neighbor_proportion)
#
#    def fit(self, X):
#        """Perform SNN clustering from features or distance matrix.
#
#        Parameters
#        ----------
#        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or
#            array of shape (n_samples, n_samples)
#            A feature array
#        """
#        clusters = snn(X, neighbor_num=self.neighbor_num,
#                       min_shared_neighbor_num=self.min_shared_neighbor_num)
#        self.core_sample_indices_, self.labels_ = clusters
#
#        if len(self.core_sample_indices_):
#            # Fix for scipy sparse indexing issue.
#            self.components_ = X[self.core_sample_indices_].copy()
#        else:
#            # No core samples.
#            self.components_ = np.empty((0, X.shape[1]))
#        return self
#
#    def fit_predict(self, X, y=None, sample_weight=None):
#        """Perform clustering on X and return cluster labels.
#
#        Parameters
#        ----------
#        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or
#            array of shape (n_samples, n_samples)
#            A feature array, or array of distances between samples if
#            ``metric='precomputed'``.
#        sample_weight : array, shape (n_samples,), optional
#            Weight of each sample, such that a sample with a weight of at least
#            ``min_samples`` is by itself a core sample; a sample with negative
#            weight may inhibit its eps-neighbor from being core.
#            Note that weights are absolute, and default to 1.
#        y : Ignored
#
#        Returns
#        -------
#        y : ndarray, shape (n_samples,)
#            Cluster labels
#        """
#        self.fit(X)
#        return self.labels_

#snn_model = SNN(neighbor_num=1, min_shared_neighbor_proportion=0.5).fit(X[:, :337].sample(10_000))
