@@ -64,7 +64,8 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
64
64
method : {'alternate', 'pam'}, default: 'alternate'
65
65
Which algorithm to use. 'alternate' is faster while 'pam' is more accurate.
66
66
67
- init : {'random', 'heuristic', 'k-medoids++', 'build'}, optional, default: 'heuristic'
67
+ init : {'random', 'heuristic', 'k-medoids++', 'build'}, or array-like of shape
68
+ (n_clusters, n_features), optional, default: 'heuristic'
68
69
Specify medoid initialization method. 'random' selects n_clusters
69
70
elements from the dataset. 'heuristic' picks the n_clusters points
70
71
with the smallest sum distance to every other point. 'k-medoids++'
@@ -74,6 +75,8 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
74
75
algorithm. Often 'build' is more efficient but slower than other
75
76
initializations on big datasets. It is also very non-robust:
76
77
if there are outliers in the dataset, use another initialization.
78
+ If an array is passed, it should be of shape (n_clusters, n_features)
79
+ and gives the initial centers; each row must exactly match a sample in X.
77
80
78
81
.. _k-means++: https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf
79
82
@@ -181,13 +184,29 @@ def _check_init_args(self):
181
184
182
185
# Check init
183
186
init_methods = ["random" , "heuristic" , "k-medoids++" , "build" ]
184
- if self .init not in init_methods :
187
+ if not (
188
+ hasattr (self .init , "__array__" )
189
+ or (isinstance (self .init , str ) and self .init in init_methods )
190
+ ):
185
191
raise ValueError (
186
192
"init needs to be one of "
187
193
+ "the following: "
188
- + "%s" % init_methods
194
+ + "%s" % ( init_methods + [ "array-like" ])
189
195
)
190
196
197
+ # Check n_clusters
198
+ if (
199
+ hasattr (self .init , "__array__" )
200
+ and self .n_clusters != self .init .shape [0 ]
201
+ ):
202
+ warnings .warn (
203
+ "n_clusters should be equal to size of array-like if init "
204
+ "is array-like setting n_clusters to {}." .format (
205
+ self .init .shape [0 ]
206
+ )
207
+ )
208
+ self .n_clusters = self .init .shape [0 ]
209
+
191
210
def fit (self , X , y = None ):
192
211
"""Fit K-Medoids to the provided data.
193
212
@@ -219,7 +238,7 @@ def fit(self, X, y=None):
219
238
D = pairwise_distances (X , metric = self .metric )
220
239
221
240
medoid_idxs = self ._initialize_medoids (
222
- D , self .n_clusters , random_state_
241
+ D , self .n_clusters , random_state_ , X
223
242
)
224
243
labels = None
225
244
@@ -407,10 +426,14 @@ def predict(self, X):
407
426
408
427
return pd_argmin
409
428
410
- def _initialize_medoids (self , D , n_clusters , random_state_ ):
429
+ def _initialize_medoids (self , D , n_clusters , random_state_ , X = None ):
411
430
"""Select initial mediods when beginning clustering."""
412
431
413
- if self .init == "random" : # Random initialization
432
+ if hasattr (self .init , "__array__" ): # Pre assign cluster
433
+ medoids = np .hstack (
434
+ [np .where ((X == c ).all (axis = 1 )) for c in self .init ]
435
+ ).ravel ()
436
+ elif self .init == "random" : # Random initialization
414
437
# Pick random k medoids as the initial ones.
415
438
medoids = random_state_ .choice (len (D ), n_clusters , replace = False )
416
439
elif self .init == "k-medoids++" :
0 commit comments