MAINT|API Clean up deprecations for 1.6: SAMME.R in AdaBoost and deprecate algorithm (scikit-learn#29997)

jeremiedbb · glemaitre · web-flow · commit 6de55b3488f2 · 2024-10-08T22:15:43.000+02:00
Co-authored-by: Guillaume Lemaitre <guillaume@probabl.ai>
diff --git a/benchmarks/bench_20newsgroups.py b/benchmarks/bench_20newsgroups.py
@@ -21,7 +21,7 @@
     "extra_trees": ExtraTreesClassifier(max_features="sqrt", min_samples_split=10),
     "logistic_regression": LogisticRegression(),
     "naive_bayes": MultinomialNB(),
-    "adaboost": AdaBoostClassifier(n_estimators=10, algorithm="SAMME"),
+    "adaboost": AdaBoostClassifier(n_estimators=10),
 }
 
 
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
@@ -1709,7 +1709,7 @@ learners::
     >>> from sklearn.ensemble import AdaBoostClassifier
 
     >>> X, y = load_iris(return_X_y=True)
-    >>> clf = AdaBoostClassifier(n_estimators=100, algorithm="SAMME",)
+    >>> clf = AdaBoostClassifier(n_estimators=100)
     >>> scores = cross_val_score(clf, X, y, cv=5)
     >>> scores.mean()
     0.9...
diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
@@ -251,6 +251,10 @@ Changelog
   right child node as the tree is traversed.
   :pr:`28268` by :user:`Adam Li <adam2392>`.
 
+- |API| The parameter `algorithm` of :class:`ensemble.AdaBoostClassifier` is deprecated
+  and will be removed in 1.8.
+  :pr:`29997` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.impute`
 .....................
 
diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py
@@ -64,7 +64,7 @@
         max_depth=5, n_estimators=10, max_features=1, random_state=42
     ),
     MLPClassifier(alpha=1, max_iter=1000, random_state=42),
-    AdaBoostClassifier(algorithm="SAMME", random_state=42),
+    AdaBoostClassifier(random_state=42),
     GaussianNB(),
     QuadraticDiscriminantAnalysis(),
 ]
diff --git a/examples/ensemble/plot_adaboost_multiclass.py b/examples/ensemble/plot_adaboost_multiclass.py
@@ -80,7 +80,6 @@
 adaboost_clf = AdaBoostClassifier(
     estimator=weak_learner,
     n_estimators=n_estimators,
-    algorithm="SAMME",
     random_state=42,
 ).fit(X_train, y_train)
 
diff --git a/examples/ensemble/plot_adaboost_twoclass.py b/examples/ensemble/plot_adaboost_twoclass.py
@@ -39,10 +39,7 @@
 y = np.concatenate((y1, -y2 + 1))
 
 # Create and fit an AdaBoosted decision tree
-bdt = AdaBoostClassifier(
-    DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200
-)
-
+bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200)
 bdt.fit(X, y)
 
 plot_colors = "br"
diff --git a/examples/ensemble/plot_forest_iris.py b/examples/ensemble/plot_forest_iris.py
@@ -74,11 +74,7 @@
     DecisionTreeClassifier(max_depth=None),
     RandomForestClassifier(n_estimators=n_estimators),
     ExtraTreesClassifier(n_estimators=n_estimators),
-    AdaBoostClassifier(
-        DecisionTreeClassifier(max_depth=3),
-        n_estimators=n_estimators,
-        algorithm="SAMME",
-    ),
+    AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators),
 ]
 
 for pair in ([0, 1], [0, 2], [2, 3]):
diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py
@@ -24,7 +24,6 @@
 from numbers import Integral, Real
 
 import numpy as np
-from scipy.special import xlogy
 
 from ..base import (
     ClassifierMixin,
@@ -36,7 +35,7 @@
 from ..metrics import accuracy_score, r2_score
 from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
 from ..utils import _safe_indexing, check_random_state
-from ..utils._param_validation import HasMethods, Interval, StrOptions
+from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions
 from ..utils.extmath import softmax, stable_cumsum
 from ..utils.metadata_routing import (
     _raise_for_unsupported_routing,
@@ -375,16 +374,12 @@ class AdaBoostClassifier(
         a trade-off between the `learning_rate` and `n_estimators` parameters.
         Values must be in the range `(0.0, inf)`.
 
-    algorithm : {'SAMME', 'SAMME.R'}, default='SAMME.R'
-        If 'SAMME.R' then use the SAMME.R real boosting algorithm.
-        ``estimator`` must support calculation of class probabilities.
-        If 'SAMME' then use the SAMME discrete boosting algorithm.
-        The SAMME.R algorithm typically converges faster than SAMME,
-        achieving a lower test error with fewer boosting iterations.
+    algorithm : {'SAMME'}, default='SAMME'
+        Use the SAMME discrete boosting algorithm.
 
-        .. deprecated:: 1.4
-            `"SAMME.R"` is deprecated and will be removed in version 1.6.
-            '"SAMME"' will become the default.
+        .. deprecated:: 1.6
+            `algorithm` is deprecated and will be removed in version 1.8. This
+            estimator only implements the 'SAMME' algorithm.
 
     random_state : int, RandomState instance or None, default=None
         Controls the random seed given at each `estimator` at each
@@ -470,9 +465,9 @@ class AdaBoostClassifier(
     >>> X, y = make_classification(n_samples=1000, n_features=4,
     ...                            n_informative=2, n_redundant=0,
     ...                            random_state=0, shuffle=False)
-    >>> clf = AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=0)
+    >>> clf = AdaBoostClassifier(n_estimators=100, random_state=0)
     >>> clf.fit(X, y)
-    AdaBoostClassifier(algorithm='SAMME', n_estimators=100, random_state=0)
+    AdaBoostClassifier(n_estimators=100, random_state=0)
     >>> clf.predict([[0, 0, 0, 0]])
     array([1])
     >>> clf.score(X, y)
@@ -487,23 +482,19 @@ class AdaBoostClassifier(
     refer to :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_twoclass.py`.
     """
 
-    # TODO(1.6): Modify _parameter_constraints for "algorithm" to only check
-    # for "SAMME"
+    # TODO(1.8): remove "algorithm" entry
     _parameter_constraints: dict = {
         **BaseWeightBoosting._parameter_constraints,
-        "algorithm": [
-            StrOptions({"SAMME", "SAMME.R"}),
-        ],
+        "algorithm": [StrOptions({"SAMME"}), Hidden(StrOptions({"deprecated"}))],
     }
 
-    # TODO(1.6): Change default "algorithm" value to "SAMME"
     def __init__(
         self,
         estimator=None,
         *,
         n_estimators=50,
         learning_rate=1.0,
-        algorithm="SAMME.R",
+        algorithm="deprecated",
         random_state=None,
     ):
         super().__init__(
@@ -519,43 +510,23 @@ def _validate_estimator(self):
         """Check the estimator and set the estimator_ attribute."""
         super()._validate_estimator(default=DecisionTreeClassifier(max_depth=1))
 
-        # TODO(1.6): Remove, as "SAMME.R" value for "algorithm" param will be
-        # removed in 1.6
-        # SAMME-R requires predict_proba-enabled base estimators
-        if self.algorithm != "SAMME":
+        if self.algorithm != "deprecated":
             warnings.warn(
-                (
-                    "The SAMME.R algorithm (the default) is deprecated and will be"
-                    " removed in 1.6. Use the SAMME algorithm to circumvent this"
-                    " warning."
-                ),
+                "The parameter 'algorithm' is deprecated in 1.6 and has no effect. "
+                "It will be removed in version 1.8.",
                 FutureWarning,
             )
-            if not hasattr(self.estimator_, "predict_proba"):
-                raise TypeError(
-                    "AdaBoostClassifier with algorithm='SAMME.R' requires "
-                    "that the weak learner supports the calculation of class "
-                    "probabilities with a predict_proba method.\n"
-                    "Please change the base estimator or set "
-                    "algorithm='SAMME' instead."
-                )
 
         if not has_fit_parameter(self.estimator_, "sample_weight"):
             raise ValueError(
                 f"{self.estimator.__class__.__name__} doesn't support sample_weight."
             )
 
-    # TODO(1.6): Redefine the scope of the `_boost` and `_boost_discrete`
-    # functions to be the same since SAMME will be the default value for the
-    # "algorithm" parameter in version 1.6. Thus, a distinguishing function is
-    # no longer needed. (Or adjust code here, if another algorithm, shall be
-    # used instead of SAMME.R.)
     def _boost(self, iboost, X, y, sample_weight, random_state):
         """Implement a single boost.
 
-        Perform a single boost according to the real multi-class SAMME.R
-        algorithm or to the discrete SAMME algorithm and return the updated
-        sample weights.
+        Perform a single boost according to the discrete SAMME algorithm and return the
+        updated sample weights.
 
         Parameters
         ----------
@@ -589,75 +560,6 @@ def _boost(self, iboost, X, y, sample_weight, random_state):
             The classification error for the current boost.
             If None then boosting has terminated early.
         """
-        if self.algorithm == "SAMME.R":
-            return self._boost_real(iboost, X, y, sample_weight, random_state)
-
-        else:  # elif self.algorithm == "SAMME":
-            return self._boost_discrete(iboost, X, y, sample_weight, random_state)
-
-    # TODO(1.6): Remove function. The `_boost_real` function won't be used any
-    # longer, because the SAMME.R algorithm will be deprecated in 1.6.
-    def _boost_real(self, iboost, X, y, sample_weight, random_state):
-        """Implement a single boost using the SAMME.R real algorithm."""
-        estimator = self._make_estimator(random_state=random_state)
-
-        estimator.fit(X, y, sample_weight=sample_weight)
-
-        y_predict_proba = estimator.predict_proba(X)
-
-        if iboost == 0:
-            self.classes_ = getattr(estimator, "classes_", None)
-            self.n_classes_ = len(self.classes_)
-
-        y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1), axis=0)
-
-        # Instances incorrectly classified
-        incorrect = y_predict != y
-
-        # Error fraction
-        estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0))
-
-        # Stop if classification is perfect
-        if estimator_error <= 0:
-            return sample_weight, 1.0, 0.0
-
-        # Construct y coding as described in Zhu et al [2]:
-        #
-        #    y_k = 1 if c == k else -1 / (K - 1)
-        #
-        # where K == n_classes_ and c, k in [0, K) are indices along the second
-        # axis of the y coding with c being the index corresponding to the true
-        # class label.
-        n_classes = self.n_classes_
-        classes = self.classes_
-        y_codes = np.array([-1.0 / (n_classes - 1), 1.0])
-        y_coding = y_codes.take(classes == y[:, np.newaxis])
-
-        # Displace zero probabilities so the log is defined.
-        # Also fix negative elements which may occur with
-        # negative sample weights.
-        proba = y_predict_proba  # alias for readability
-        np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)
-
-        # Boost weight using multi-class AdaBoost SAMME.R alg
-        estimator_weight = (
-            -1.0
-            * self.learning_rate
-            * ((n_classes - 1.0) / n_classes)
-            * xlogy(y_coding, y_predict_proba).sum(axis=1)
-        )
-
-        # Only boost the weights if it will fit again
-        if not iboost == self.n_estimators - 1:
-            # Only boost positive weights
-            sample_weight *= np.exp(
-                estimator_weight * ((sample_weight > 0) | (estimator_weight < 0))
-            )
-
-        return sample_weight, 1.0, estimator_error
-
-    def _boost_discrete(self, iboost, X, y, sample_weight, random_state):
-        """Implement a single boost using the SAMME discrete algorithm."""
         estimator = self._make_estimator(random_state=random_state)
 
         estimator.fit(X, y, sample_weight=sample_weight)
@@ -789,21 +691,17 @@ class in ``classes_``, respectively.
         n_classes = self.n_classes_
         classes = self.classes_[:, np.newaxis]
 
-        # TODO(1.6): Remove, because "algorithm" param will be deprecated in 1.6
-        if self.algorithm == "SAMME.R":
-            # The weights are all 1. for SAMME.R
-            pred = sum(
-                _samme_proba(estimator, n_classes, X) for estimator in self.estimators_
-            )
-        else:  # self.algorithm == "SAMME"
-            pred = sum(
-                np.where(
-                    (estimator.predict(X) == classes).T,
-                    w,
-                    -1 / (n_classes - 1) * w,
-                )
-                for estimator, w in zip(self.estimators_, self.estimator_weights_)
+        if n_classes == 1:
+            return np.zeros_like(X, shape=(X.shape[0], 1))
+
+        pred = sum(
+            np.where(
+                (estimator.predict(X) == classes).T,
+                w,
+                -1 / (n_classes - 1) * w,
             )
+            for estimator, w in zip(self.estimators_, self.estimator_weights_)
+        )
 
         pred /= self.estimator_weights_.sum()
         if n_classes == 2:
@@ -844,17 +742,11 @@ class in ``classes_``, respectively.
         for weight, estimator in zip(self.estimator_weights_, self.estimators_):
             norm += weight
 
-            # TODO(1.6): Remove, because "algorithm" param will be deprecated in
-            # 1.6
-            if self.algorithm == "SAMME.R":
-                # The weights are all 1. for SAMME.R
-                current_pred = _samme_proba(estimator, n_classes, X)
-            else:  # elif self.algorithm == "SAMME":
-                current_pred = np.where(
-                    (estimator.predict(X) == classes).T,
-                    weight,
-                    -1 / (n_classes - 1) * weight,
-                )
+            current_pred = np.where(
+                (estimator.predict(X) == classes).T,
+                weight,
+                -1 / (n_classes - 1) * weight,
+            )
 
             if pred is None:
                 pred = current_pred
diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py
@@ -965,7 +965,7 @@ def test_bagging_with_metadata_routing(model):
     "model",
     [
         BaggingClassifier(
-            estimator=AdaBoostClassifier(n_estimators=1, algorithm="SAMME"),
+            estimator=AdaBoostClassifier(n_estimators=1),
             n_estimators=1,
         ),
         BaggingRegressor(estimator=AdaBoostRegressor(n_estimators=1), n_estimators=1),
diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@`
`21`	`21`	`"extra_trees": ExtraTreesClassifier(max_features="sqrt", min_samples_split=10),`
`22`	`22`	`"logistic_regression": LogisticRegression(),`
`23`	`23`	`"naive_bayes": MultinomialNB(),`
`24`		`- "adaboost": AdaBoostClassifier(n_estimators=10, algorithm="SAMME"),`
	`24`	`+ "adaboost": AdaBoostClassifier(n_estimators=10),`
`25`	`25`	`}`
`26`	`26`
`27`	`27`
Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@`
`64`	`64`	`max_depth=5, n_estimators=10, max_features=1, random_state=42`
`65`	`65`	`),`
`66`	`66`	`MLPClassifier(alpha=1, max_iter=1000, random_state=42),`
`67`		`- AdaBoostClassifier(algorithm="SAMME", random_state=42),`
	`67`	`+ AdaBoostClassifier(random_state=42),`
`68`	`68`	`GaussianNB(),`
`69`	`69`	`QuadraticDiscriminantAnalysis(),`
`70`	`70`	`]`
Original file line number	Diff line number	Diff line change
`@@ -965,7 +965,7 @@ def test_bagging_with_metadata_routing(model):`
`965`	`965`	`"model",`
`966`	`966`	`[`
`967`	`967`	`BaggingClassifier(`
`968`		`- estimator=AdaBoostClassifier(n_estimators=1, algorithm="SAMME"),`
	`968`	`+ estimator=AdaBoostClassifier(n_estimators=1),`
`969`	`969`	`n_estimators=1,`
`970`	`970`	`),`
`971`	`971`	`BaggingRegressor(estimator=AdaBoostRegressor(n_estimators=1), n_estimators=1),`