Robust Classification, Regression and Clustering. #70

Merged (14 commits, Nov 8, 2020)
27 changes: 10 additions & 17 deletions doc/modules/robust.rst
@@ -1,8 +1,8 @@
.. _robust:

===================================================
Robust algorithms for Regression and Classification
===================================================
===============================================================
Robust algorithms for Regression, Classification and Clustering
===============================================================

.. currentmodule:: sklearn_extra.robust

@@ -59,7 +59,7 @@ that minimizes an estimation of the risk.

\widehat{f} = \text{argmin}_{f\in F}\frac{1}{n}\sum_{i=1}^n\ell(f(X_i),y_i),

where the :math:`ell` is a loss function (e.g. the squared distance in
where the :math:`\ell` is a loss function (e.g. the squared distance in
regression problems). In other words, we are minimizing an estimate of the
expected risk, and this estimate is an empirical mean. However, it is well
known that the empirical mean is not robust to
@@ -90,10 +90,7 @@ The algorithm
-------------

The approach is implemented as a meta algorithm that takes as input a base
estimator (e.g., SGDClassifier or SGDRegressor). To be compatible, the
base estimator must support partial_fit and sample_weight. Refer to the
KMeans example for a template to adapt the method to other estimators.
estimator (e.g., SGDClassifier, SGDRegressor or MiniBatchKMeans).
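
A minimal usage sketch of the new estimators (an editor's illustration, not
part of the diff; it assumes only the class names and parameters that appear
elsewhere in this pull request)::

    from sklearn.datasets import make_blobs, make_regression
    from sklearn_extra.robust import (
        RobustWeightedClassifier,
        RobustWeightedKMeans,
        RobustWeightedRegressor,
    )

    # Classification and clustering on blob data with default settings.
    X, y = make_blobs(n_samples=300, centers=2, random_state=0)
    clf = RobustWeightedClassifier(weighting="huber").fit(X, y)
    km = RobustWeightedKMeans(n_clusters=2, weighting="mom", k=5).fit(X)

    # Regression with median-of-means weighting.
    Xr, yr = make_regression(n_samples=300, n_features=2, random_state=0)
    reg = RobustWeightedRegressor(weighting="mom", k=5).fit(Xr, yr)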

At each step, the algorithm estimates sample weights that are meant to be small
for outliers and large for inliers, and then performs one optimization step using
@@ -155,25 +152,21 @@ Hence, we will not talk about classification algorithms in this comparison.

As such, we compare our method only to TheilSenRegressor and RANSACRegressor,
as they both deal with outliers in X and in Y and are closer to
RobustWeightedEstimator.
RobustWeightedRegressor.

**Warning:** Huber weights used in our algorithm should not be confused with
HuberRegressor or other regressions with "robust losses". Those types of
regression are robust only to outliers in the label Y but not in X.

Pro: RANSACRegressor and TheilSenRegressor both use a hard rejection of
outliers. This can be interpreted as though there was an outlier detection
step and then a regression step whereas RobustWeightedEstimator is directly
step and then a regression step whereas RobustWeightedRegressor is directly
robust to outliers. This often increases the performance on moderately
corrupted datasets.

Con: In general, this algorithm is slower than both TheilSenRegressor and
RANSACRegressor.
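
To make the Pro/Con concrete, here is a hedged sketch (an editor's
illustration, assuming RobustWeightedRegressor follows the usual scikit-learn
fit/score API) fitting all three estimators on data corrupted in both X and Y::

    import numpy as np
    from sklearn.linear_model import RANSACRegressor, TheilSenRegressor
    from sklearn_extra.robust import RobustWeightedRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(-1, 1, size=(200, 1))
    y = 3 * X.ravel() + rng.normal(scale=0.1, size=200)
    X[:10] = rng.uniform(5, 10, size=(10, 1))  # corrupt 5% of X
    y[:10] = rng.uniform(-50, -40, size=10)    # and the matching Y

    for est in (
        RANSACRegressor(random_state=0),
        TheilSenRegressor(random_state=0),
        RobustWeightedRegressor(weighting="mom", k=10, random_state=0),
    ):
        est.fit(X, y)
        # Score on the uncorrupted points only.
        print(type(est).__name__, est.score(X[10:], y[10:]))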

One other advantage of RobustWeightedEstimator is that it can be used for a
broad range of algorithms. For example, one can do robust unsupervised
learning with RobustWeightedEstimator, see the example using KMeans algorithm.

Speed and limits of the algorithm
---------------------------------

@@ -188,9 +181,9 @@ Complexity and limitation:

* weighting="huber": the complexity is larger than that of base_estimator but
  it is still of the same order of magnitude.
* weighting="mom": the larger k is, the faster the algorithm will perform if
  sample_size is large. This weighting scheme is advised only with a
  sufficiently large dataset (rule of thumb: sample_size > 500; the specifics
  depend on the dataset). See the sketch below for the idea behind this
  weighting.
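
For intuition, here is a sketch of the median-of-means idea behind
weighting="mom" (an editor's illustration of the concept, not the library's
internal code): the sample is split into k blocks, and the median of the block
means is insensitive to a few gross outliers as long as the blocks are large
enough::

    import numpy as np

    def median_of_means(x, k, rng):
        # Shuffle, split into k blocks, take the median of the block means.
        idx = rng.permutation(len(x))
        blocks = np.array_split(x[idx], k)
        return np.median([b.mean() for b in blocks])

    rng = np.random.RandomState(0)
    x = np.concatenate([rng.normal(0, 1, 500), [1000.0] * 5])
    print(x.mean())                           # ~9.9, ruined by 5 outliers
    print(median_of_means(x, k=20, rng=rng))  # close to 0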

**Warning:** On a real dataset, one should be aware that there can be outliers
33 changes: 9 additions & 24 deletions examples/plot_clustering.py
@@ -3,15 +3,15 @@
===================================================================
A demo of several clustering algorithms on a corrupted dataset
===================================================================
In this example we exhibit the results of various
scikit-learn and scikit-learn-extra clustering algorithms on
a dataset with outliers.
KMedoids is the most stable and efficient
algorithm for this application (change the seed to
see different behavior for SpectralClustering and
the robust kmeans).
The mean-shift algorithm, once correctly
parameterized, detects the outliers as a class of
their own.
"""
print(__doc__)
@@ -22,11 +22,11 @@
import matplotlib.pyplot as plt

from sklearn import cluster, mixture
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle

from sklearn_extra.robust import RobustWeightedEstimator
from sklearn_extra.robust import RobustWeightedKMeans
from sklearn_extra.cluster import KMedoids

rng = np.random.RandomState(42)
@@ -37,16 +37,6 @@
kmeans = KMeans(n_clusters=n_clusters, random_state=rng)
kmedoid = KMedoids(n_clusters=n_clusters, random_state=rng)


def kmeans_loss(X, pred):
    # Per-sample k-means loss used by the old RobustWeightedEstimator API;
    # this helper is removed by the PR.
    return np.array(
        [
            np.linalg.norm(X[pred[i]] - np.mean(X[pred == pred[i]])) ** 2
            for i in range(len(X))
        ]
    )


two_means = cluster.MiniBatchKMeans(n_clusters=n_clusters, random_state=rng)
spectral = cluster.SpectralClustering(
    n_clusters=n_clusters,
@@ -78,15 +68,10 @@ def kmeans_loss(X, pred):
X = shuffle(X, random_state=rng)

# Define two other clustering algorithms
kmeans_rob = RobustWeightedEstimator(
    MiniBatchKMeans(
        n_clusters, batch_size=len(X), init="random", random_state=rng
    ),
    # in theory, init=kmeans++ is very non-robust
    burn_in=0,
kmeans_rob = RobustWeightedKMeans(
    n_clusters,
    eta0=0.01,
    weighting="mom",
    loss=kmeans_loss,
    max_iter=100,
    k=int(n_samples / 50),
    random_state=rng,
25 changes: 18 additions & 7 deletions examples/plot_robust_classification_diabete.py
@@ -3,22 +3,21 @@
======================================================================
A demo of Robust Classification on real dataset "diabetes" from OpenML
======================================================================
In this example we compare the RobustWeightedEstimator using SGDClassifier
In this example we compare the RobustWeightedClassifier
for classification on the real dataset "diabetes".
WARNING: running this example can take some time (<1 hour).
We only compare the estimator with SGDClassifier as there is no robust
classification estimator in scikit-learn.
"""
import matplotlib.pyplot as plt
import numpy as np
from sklearn_extra.robust import RobustWeightedEstimator
from sklearn_extra.robust import RobustWeightedClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import fetch_openml
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler


X, y = fetch_openml(name="diabetes", return_X_y=True)

# replace the label names with 0 or 1
@@ -36,8 +35,7 @@
# Using GridSearchCV, we tuned the parameters c and eta0, with the
# choice of "huber" weighting because the sample_size is not very large.
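
# Editor's sketch (not part of the original example): assuming
# RobustWeightedClassifier follows the scikit-learn estimator API, the tuning
# mentioned above could look like
#
#     from sklearn.model_selection import GridSearchCV
#     search = GridSearchCV(
#         RobustWeightedClassifier(weighting="huber", loss="hinge"),
#         param_grid={"c": [0.7, 1.35, 2.0], "eta0": [1e-4, 1e-3, 1e-2]},
#         scoring=make_scorer(roc_auc_score),
#         cv=5,
#     ).fit(X, y)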

clf_rob = RobustWeightedEstimator(
    SGDClassifier(average=10, learning_rate="optimal", loss="hinge"),
clf_rob = RobustWeightedClassifier(
    weighting="huber",
    loss="hinge",
    c=1.35,
@@ -50,8 +48,19 @@
M = 10
res = []
for f in range(M):
    rng = np.random.RandomState(f)
    print("\r Progress: %s / %s" % (f + 1, M), end="")
    clf = SGDClassifier(average=10, learning_rate="optimal", loss="hinge")
    clf = SGDClassifier(
        average=10, learning_rate="optimal", loss="hinge", random_state=rng
    )
    clf_rob = RobustWeightedClassifier(
        weighting="huber",
        loss="hinge",
        c=1.35,
        eta0=1e-3,
        max_iter=300,
        random_state=rng,
    )

    cv_not_rob = cross_val_score(
        clf_not_rob, X, y, cv=10, scoring=make_scorer(roc_auc_score)
@@ -64,7 +73,9 @@
    res += [[np.mean(cv_rob), np.mean(cv_not_rob)]]


plt.boxplot(np.array(res), labels=["RobustWeightedEstimator", "SGDClassifier"])
plt.boxplot(
    np.array(res), labels=["RobustWeightedClassifier", "SGDClassifier"]
)
plt.ylabel("AUC")

plt.show()
10 changes: 4 additions & 6 deletions examples/plot_robust_classification_toy.py
@@ -3,12 +3,12 @@
=================================================================
A demo of Robust Classification on a simulated corrupted dataset
=================================================================
In this example we compare the RobustWeightedEstimator using SGDClassifier
In this example we compare the RobustWeightedClassifier
for classification with the vanilla SGDClassifier using various losses.
"""
import matplotlib.pyplot as plt
import numpy as np
from sklearn_extra.robust import RobustWeightedEstimator
from sklearn_extra.robust import RobustWeightedClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
@@ -40,10 +40,8 @@
        SGDClassifier(loss="modified_huber", random_state=rng),
    ),
    (
        "RobustWeightedEstimator",
        RobustWeightedEstimator(
            base_estimator=SGDClassifier(),
            loss="log",
        "RobustWeightedClassifier",
        RobustWeightedClassifier(
            max_iter=100,
            weighting="mom",
            k=6,
41 changes: 21 additions & 20 deletions examples/plot_robust_regression_california_houses.py
@@ -3,26 +3,23 @@
================================================================
A demo of Robust Regression on real dataset "california housing"
================================================================
In this example we compare the RobustWeightedEstimator using SGDRegressor
for regression on the real dataset california housing.
WARNING: running this example can take some time (<1hour).

We also compare with robust estimators from scikit-learn: TheilSenRegressor
and RANSACRegressor
In this example we compare the RobustWeightedRegressor to other scikit-learn
regressors on the real dataset california housing.
WARNING: running this example can take some time (<1 hour on a recent computer).

One of the main points of this example is the importance of taking into
account outliers in the test dataset when dealing with real datasets.

For this example, we took a parameter so that RobustWeightedEstimator is better
For this example, we took a parameter so that RobustWeightedRegressor is better
than RANSAC and TheilSen when talking about the mean squared error and it
is better than the SGDRegressor when talking about the median squared error.
Depending on what criterion one wants to optimize, the parameter measuring
robustness in RobustWeightedEstimator can change and this is not so
robustness in RobustWeightedRegressor can change and this is not so
straightforward when using RANSAC and TheilSenRegressor.
"""
import matplotlib.pyplot as plt
import numpy as np
from sklearn_extra.robust import RobustWeightedEstimator
from sklearn_extra.robust import RobustWeightedRegressor
from sklearn.linear_model import (
SGDRegressor,
TheilSenRegressor,
@@ -57,19 +54,18 @@ def quadratic_loss(est, X, y, X_test, y_test):
        ),
    ),
    (
        "RWE, Huber weights",
        RobustWeightedEstimator(
            SGDRegressor(
                learning_rate="adaptive",
                eta0=1e-6,
                max_iter=1000,
                n_iter_no_change=100,
            ),
            loss="squared_loss",
        "RobustWeightedRegressor",
        RobustWeightedRegressor(
            weighting="huber",
            c=0.5,
            eta0=1e-6,
            max_iter=500,
            sgd_args={
                "max_iter": 1000,
                "n_iter_no_change": 100,
                "learning_rate": "adaptive",
                "eta0": 1e-6,
            },
        ),
    ),
("RANSAC", RANSACRegressor()),
@@ -82,14 +78,19 @@
for f in range(M):
    print("\r Progress: %s / %s" % (f + 1, M), end="")

    rng = np.random.RandomState(f)

    # Split into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rng
    )

    for i, (name, est) in enumerate(estimators):
        cv = quadratic_loss(est, X_train, y_train, X_test, y_test)

        # It is preferable to use the median of the validation losses
        # because it is possible that some outliers are present in the test set.
        # because it is possible that some outliers are present in the
        # test set.
        # We compute both for comparison.
        res[i, f, 0] = np.mean(cv)
        res[i, f, 1] = np.median(cv)
16 changes: 7 additions & 9 deletions examples/plot_robust_regression_toy.py
@@ -3,13 +3,13 @@
=============================================================
Robust regression on simulated corrupted dataset
=============================================================
In this example we compare the RobustWeightedEstimator using SGDRegressor
for regression with various robust regression algorithms from scikit-learn.
In this example we compare the RobustWeightedRegressor
with various robust regression algorithms from scikit-learn.
"""
import matplotlib.pyplot as plt
import numpy as np

from sklearn_extra.robust import RobustWeightedEstimator
from sklearn_extra.robust import RobustWeightedRegressor
from sklearn.utils import shuffle
from sklearn.linear_model import (
SGDRegressor,
@@ -41,10 +41,8 @@
SGDRegressor(loss="epsilon_insensitive", random_state=rng),
),
(
"RobustWeightedEstimator",
RobustWeightedEstimator(
loss="squared_loss", weighting="mom", k=7, random_state=rng
),
"RobustWeightedRegressor",
RobustWeightedRegressor(weighting="mom", k=7, random_state=rng),
# The parameter k is set larger to the number of outliers
# because here we know it.
),
@@ -56,7 +54,7 @@
"Theil-Sen": "gold",
"RANSAC": "lightgreen",
"HuberRegressor": "black",
"RobustWeightedEstimator": "magenta",
"RobustWeightedRegressor": "magenta",
"SGD epsilon loss": "purple",
}
linestyle = {
@@ -65,7 +63,7 @@
"Theil-Sen": "-.",
"RANSAC": "--",
"HuberRegressor": "--",
"RobustWeightedEstimator": "--",
"RobustWeightedRegressor": "--",
}
lw = 3

Expand Down
10 changes: 8 additions & 2 deletions sklearn_extra/robust/__init__.py
@@ -1,5 +1,11 @@
from sklearn_extra.robust.robust_weighted_estimator import (
    RobustWeightedEstimator,
    RobustWeightedClassifier,
    RobustWeightedKMeans,
    RobustWeightedRegressor,
)

__all__ = ["RobustWeightedEstimator"]
__all__ = [
    "RobustWeightedClassifier",
    "RobustWeightedKMeans",
    "RobustWeightedRegressor",
]