Intro Cluster Problem Python
from warnings import filterwarnings
filterwarnings('ignore')

#plt.rcParams['figure.figsize'] = [13, 6]
#plt.rcParams['font.size'] = 13
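None of the notebook's import cell survives in this extract, but the later cells clearly rely on numpy, matplotlib, seaborn, scikit-learn's cluster module, make_blobs and a custom SNN estimator. A hedged reconstruction of the likely imports (the SNN import path is not shown and is left out):

[ ]: # Assumed import cell, inferred from the names used in later cells.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import cluster
from sklearn.datasets import make_blobs
# The SNN estimator fitted below is a custom class (see the commented-out
# implementation at the end of the notebook); its import path is not preserved here.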
X, Y = make_blobs(
n_samples=100,
n_features=2,
centers=2,
cluster_std=1.0,
center_box=(-10.0, 10.0),
shuffle=True,
random_state=None,
return_centers=False,
)
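A quick look at the generated data helps when judging the clusterings below; a minimal sketch, colouring the points by the ground-truth blob label Y returned by make_blobs:

[ ]: # Sketch: scatter of the synthetic blobs, coloured by ground-truth label.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=Y, palette='inferno')
plt.show()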
[ ]: fig, ax = plt.subplots(2, 2, figsize=(8,6))
ax = ax.flatten()
# NOTE: kmeans is never instantiated in the surviving cells; the five distinct
# labels in the predict output below suggest something like
# cluster.KMeans(n_clusters=5) was created in a cell lost during extraction.
kmeans.fit(X)
plt.tight_layout()
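The cell above allocates a 2x2 grid of axes but fits only one model, so the original presumably looped over several values of n_clusters. A hedged sketch of such a loop (the k values are illustrative, not taken from the notebook):

[ ]: # Sketch: fit KMeans for several k and plot each labelling on its own axis.
fig, ax = plt.subplots(2, 2, figsize=(8, 6))
ax = ax.flatten()
for i, k in enumerate([2, 3, 4, 5]):  # assumed k values
    km = cluster.KMeans(n_clusters=k).fit(X)
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=km.labels_, ax=ax[i])
    ax[i].set_title(f"k = {k}")
plt.tight_layout()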
[ ]: # Predict the closest cluster for each sample (here the training data itself).
kmeans.predict(X)
[ ]: array([3, 4, 2, 4, 4, 1, 1, 3, 4, 0, 0, 0, 2, 0, 0, 1, 3, 1, 2, 0, 2, 0,
1, 2, 4, 1, 4, 4, 0, 1, 0, 2, 0, 0, 2, 3, 4, 4, 0, 3, 4, 1, 1, 4,
3, 1, 0, 0, 3, 3, 2, 2, 3, 4, 4, 1, 4, 4, 0, 0, 2, 4, 2, 4, 4, 0,
1, 0, 2, 2, 4, 0, 0, 0, 3, 1, 2, 2, 3, 2, 3, 1, 3, 0, 0, 0, 0, 1,
2, 0, 3, 1, 4, 1, 4, 4, 3, 2, 3, 2], dtype=int32)
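predict simply assigns each point to its nearest centroid, so it also works for data the model has never seen; a small sketch with made-up query points, assuming the fitted kmeans object above:

[ ]: # Sketch: assign two hypothetical new points to their nearest centroids.
new_points = np.array([[0.0, 0.0], [5.0, -5.0]])  # made-up coordinates
kmeans.predict(new_points)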
# NOTE: viz is not defined in the surviving cells; finalize() suggests a
# yellowbrick-style visualizer created in a cell lost during extraction.
viz.finalize()
plt.tight_layout()
# NOTE: the loop header and the plotting call did not survive extraction; the
# structure below (one DBSCAN fit per eps value, drawn on one of the four axes)
# is an assumption based on the eps list and the 2x2 grid.
eps = [3, 4, 5, 6]
fig, ax = plt.subplots(2, 2)
ax = ax.flatten()
for i, ep in enumerate(eps):
    dbs = cluster.DBSCAN(
        eps=ep,
        min_samples=2,
        algorithm="auto",
        leaf_size=30,
        metric="euclidean",
        p=None,
        metric_params=None,
    ).fit(X)
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=dbs.labels_, ax=ax[i])
plt.tight_layout()
[ ]: snn = SNN(
n_neighbors=8,
eps=7,
min_samples=1,
algorithm="auto",
leaf_size=30,
metric="euclidean",
p=None,
metric_params=None,
).fit(X)
[ ]: np.unique(snn.labels_).shape  # number of distinct SNN cluster labels
[ ]: (71,)
1.2.4. OPTICS
[ ]: opt = cluster.OPTICS(
min_samples=2,
max_eps=np.inf,
metric="euclidean",
p=2,
metric_params=None,
cluster_method="xi",
eps=1,
xi=0.05,
predecessor_correction=True,
min_cluster_size=None,
algorithm="auto",
leaf_size=30,
memory=None,
).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=opt.labels_, palette='inferno', size=opt.labels_)
np.unique(opt.labels_).shape
[ ]: (28,)
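Beyond the labels, a fitted OPTICS model exposes the reachability distances and the point ordering it computed, which usually say more about the density structure than the label count. A minimal sketch of the standard reachability plot, using the opt object fitted above:

[ ]: # Sketch: reachability plot in cluster order; valleys correspond to dense regions.
reachability = opt.reachability_[opt.ordering_]
plt.plot(reachability)
plt.xlabel("points (cluster order)")
plt.ylabel("reachability distance")
plt.show()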
1.2.5. DBSCAN
[ ]: dbs = cluster.DBSCAN(
eps=0.001,
min_samples=1,
metric="canberra",
metric_params=None,
algorithm="auto",
leaf_size=30,
p=None,
n_jobs=None,
).fit(X)
np.unique(dbs.labels_).shape
[ ]: (100,)
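With eps=0.001 and the canberra metric, every point ends up as its own cluster (100 unique labels). A common way to choose a more workable eps is the sorted k-distance plot; a minimal sketch with scikit-learn's NearestNeighbors (the choice k=5 is an assumption, not taken from the notebook):

[ ]: # Sketch: sorted distance to the k-th nearest neighbour; the elbow hints at eps.
from sklearn.neighbors import NearestNeighbors
k = 5  # assumed neighbourhood size
nn = NearestNeighbors(n_neighbors=k, metric="canberra").fit(X)
dist, _ = nn.kneighbors(X)
plt.plot(np.sort(dist[:, -1]))
plt.ylabel(f"canberra distance to {k}-th neighbour")
plt.show()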
#
#     return len(x0.intersection(x1)) / len(x0)
#
#
# def get_snn_distance(x0, x1):
#     """Calculate the shared-neighbor distance of two sets of nearest
#     neighbors, normalized by the maximum number of shared neighbors"""
#
#     return 1 - get_snn_similarity(x0, x1)
#
# def snn(X, neighbor_num, min_shared_neighbor_num):
#     """Perform Shared Nearest Neighbor (SNN) clustering.
#
#     Parameters
#     ----------
#     X : array or sparse (CSR) matrix of shape (n_samples, n_features), or
#         array of shape (n_samples, n_samples)
#         A feature array
#     neighbor_num : int
#         K, the number of neighbors to consider for shared nearest neighbor similarity
#     min_shared_neighbor_num : int
#         Number of nearest neighbors that two data points must share to be
#         considered part of the same cluster
#     """
#
#     # for each data point, find their set of K nearest neighbors
#     knn_graph = kneighbors_graph(X, n_neighbors=neighbor_num, include_self=False)
#     # NOTE (assumed, missing in the original fragment): turn the k-NN graph
#     # into per-point neighbor sets before computing shared-neighbor distances
#     neighbors = [set(knn_graph[i].nonzero()[1]) for i in range(knn_graph.shape[0])]
#
#     # the distance matrix is the complement of the proportion of shared
#     # neighbors between each pair of data points
#     snn_distance_matrix = np.asarray([[get_snn_distance(neighbors[i], neighbors[j])
#                                        for j in range(len(neighbors))]
#                                       for i in range(len(neighbors))])
#
#     # perform DBSCAN with the shared-neighbor distance criterion for density estimation
#     Parameters
#     ----------
#     neighbor_num : int
#         K number of neighbors to consider for shared nearest neighbor similarity
#     Note: Naming conventions for attributes are based on the analogous ones of DBSCAN
#     """
#
#     def __init__(self, neighbor_num, min_shared_neighbor_proportion):
#
#         """Constructor"""
#
#         self.neighbor_num = neighbor_num
#         self.min_shared_neighbor_num = round(neighbor_num * min_shared_neighbor_proportion)
#
#     def fit(self, X):
#
#         """Perform SNN clustering from features or distance matrix.
#
#         Parameters
#         ----------
#         X : array or sparse (CSR) matrix of shape (n_samples, n_features), or
#             array of shape (n_samples, n_samples)
#             A feature array
#         """
#
#         clusters = snn(X, neighbor_num=self.neighbor_num,
#                        min_shared_neighbor_num=self.min_shared_neighbor_num)
#         ``metric='precomputed'``.
#         sample_weight : array, shape (n_samples,), optional
#             Weight of each sample, such that a sample with a weight of at least
#             ``min_samples`` is by itself a core sample; a sample with negative
#             weight may inhibit its eps-neighbor from being core.
#             Note that weights are absolute, and default to 1.
#         y : Ignored
#
#         Returns
#         -------
#         y : ndarray, shape (n_samples,)
#             cluster labels
#         """
#         self.fit(X)
#         return self.labels_
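The commented implementation above builds a shared-neighbour distance matrix and hands it to DBSCAN with metric='precomputed'. A compact, runnable sketch of that idea (the function name and parameter defaults are illustrative, not the author's):

[ ]: # Sketch: shared-nearest-neighbour clustering via a precomputed distance matrix.
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import DBSCAN

def snn_labels(X, n_neighbors=8, min_shared=3):
    # boolean k-NN membership matrix: row i marks the k nearest neighbours of point i
    knn = kneighbors_graph(X, n_neighbors=n_neighbors, include_self=False).toarray().astype(bool)
    # number of shared neighbours for every pair of points
    shared = knn.astype(int) @ knn.astype(int).T
    # SNN distance = 1 - proportion of shared neighbours (zero on the diagonal)
    dist = 1.0 - shared / n_neighbors
    # DBSCAN on the precomputed distances; eps encodes the min_shared threshold
    eps = 1.0 - min_shared / n_neighbors
    return DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit(dist).labels_

snn_labels(X)[:10]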