forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster.py
104 lines (79 loc) · 2.86 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from sklearn.cluster import KMeans, MiniBatchKMeans
from .common import Benchmark, Estimator, Predictor, Transformer
from .datasets import _20newsgroups_highdim_dataset, _blobs_dataset
from .utils import neg_mean_inertia
class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
    """
    Benchmarks for KMeans.

    Each benchmark is parametrized over the data representation
    ("dense" or "sparse"), the KMeans ``algorithm`` and the ``init``
    strategy; see ``param_names`` / ``params`` below.
    """

    # One name per entry of ``params``, in the same order.
    param_names = ["representation", "algorithm", "init"]
    params = (["dense", "sparse"], ["lloyd", "elkan"], ["random", "k-means++"])

    def setup_cache(self):
        """Delegate to the parent ``setup_cache`` without returning its result."""
        # NOTE(review): this override adds nothing beyond the parent call;
        # presumably it exists so the benchmark runner sees a ``setup_cache``
        # defined on this class -- confirm before removing it.
        super().setup_cache()

    def make_data(self, params):
        """Return the dataset matching the ``representation`` parameter.

        ``params`` is unpacked as ``(representation, algorithm, init)``;
        only ``representation`` influences the choice of dataset.
        """
        representation, _algorithm, _init = params

        if representation == "sparse":
            return _20newsgroups_highdim_dataset(n_samples=8000)
        return _blobs_dataset(n_clusters=20)

    def make_estimator(self, params):
        """Build the KMeans estimator for one parameter combination.

        ``params`` is unpacked as ``(representation, algorithm, init)``.
        The iteration cap is 30 for the sparse representation and 100
        otherwise.
        """
        representation, algorithm, init = params

        if representation == "sparse":
            iteration_cap = 30
        else:
            iteration_cap = 100

        # NOTE(review): tol=0 together with a fixed max_iter presumably
        # keeps the amount of work per run stable -- confirm.
        return KMeans(
            n_clusters=20,
            algorithm=algorithm,
            init=init,
            n_init=1,
            max_iter=iteration_cap,
            tol=0,
            random_state=0,
        )

    def make_scorers(self):
        """Install ``train_scorer`` and ``test_scorer`` on the instance.

        Both scorers ignore their two positional arguments and evaluate
        ``neg_mean_inertia`` on ``self.X`` (train) or ``self.X_val`` (test)
        using the labels predicted by ``self.estimator`` and its
        ``cluster_centers_``.
        """

        def score_on(samples):
            # Attributes are looked up at call time, so the scorers always
            # use whatever estimator/data the instance currently holds.
            labels = self.estimator.predict(samples)
            return neg_mean_inertia(samples, labels, self.estimator.cluster_centers_)

        self.train_scorer = lambda _, __: score_on(self.X)
        self.test_scorer = lambda _, __: score_on(self.X_val)
class MiniBatchKMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
    """
    Benchmarks for MiniBatchKMeans.

    Each benchmark is parametrized over the data representation
    ("dense" or "sparse") and the ``init`` strategy; see
    ``param_names`` / ``params`` below.
    """

    # One name per entry of ``params``, in the same order.
    param_names = ["representation", "init"]
    params = (["dense", "sparse"], ["random", "k-means++"])

    def setup_cache(self):
        """Delegate to the parent ``setup_cache`` without returning its result."""
        # NOTE(review): this override adds nothing beyond the parent call;
        # presumably it exists so the benchmark runner sees a ``setup_cache``
        # defined on this class -- confirm before removing it.
        super().setup_cache()

    def make_data(self, params):
        """Return the dataset matching the ``representation`` parameter.

        ``params`` is unpacked as ``(representation, init)``; only
        ``representation`` influences the choice of dataset.
        """
        representation, _init = params

        if representation == "sparse":
            return _20newsgroups_highdim_dataset()
        return _blobs_dataset(n_clusters=20)

    def make_estimator(self, params):
        """Build the MiniBatchKMeans estimator for one parameter combination.

        ``params`` is unpacked as ``(representation, init)``. The iteration
        cap is 5 for the sparse representation and 2 otherwise.
        """
        representation, init = params

        if representation == "sparse":
            iteration_cap = 5
        else:
            iteration_cap = 2

        # Labels are not computed during fit (compute_labels=False); the
        # scorers obtain labels through ``predict`` instead.
        return MiniBatchKMeans(
            n_clusters=20,
            init=init,
            n_init=1,
            max_iter=iteration_cap,
            batch_size=1000,
            max_no_improvement=None,
            compute_labels=False,
            random_state=0,
        )

    def make_scorers(self):
        """Install ``train_scorer`` and ``test_scorer`` on the instance.

        Both scorers ignore their two positional arguments and evaluate
        ``neg_mean_inertia`` on ``self.X`` (train) or ``self.X_val`` (test)
        using the labels predicted by ``self.estimator`` and its
        ``cluster_centers_``.
        """

        def score_on(samples):
            # Attributes are looked up at call time, so the scorers always
            # use whatever estimator/data the instance currently holds.
            labels = self.estimator.predict(samples)
            return neg_mean_inertia(samples, labels, self.estimator.cluster_centers_)

        self.train_scorer = lambda _, __: score_on(self.X)
        self.test_scorer = lambda _, __: score_on(self.X_val)