
Commit a725102

Pushing the docs to 1.1/ for branch: 1.1.X, commit 6cb2c52375a812ff509c00f4eed1da232e7a8932
1 parent 61834c9 commit a725102

1,509 files changed: +27,523 / -24,759 lines


Diff for: 1.1/.buildinfo

+1 -1

@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 0eaf86448214f705bb4019609544b7b3
+config: 8c6ff21e847d280e934fd16d253894de
 tags: 645f666f9bcd5a90fca523b33c5a78b7

Diff for: 1.1/_downloads/02a1306a494b46cc56c930ceec6e8c4a/plot_species_kde.py

+1 -1

@@ -19,7 +19,7 @@
 The two species are:

  - `"Bradypus variegatus"
-   <http://www.iucnredlist.org/apps/redlist/details/3038/0>`_ ,
+   <https://www.iucnredlist.org/species/3038/47437046>`_ ,
    the Brown-throated Sloth.

  - `"Microryzomys minutus"

Diff for: 1.1/_downloads/06cfc926acb27652fb2aa5bfc583e7cb/plot_hashing_vs_dict_vectorizer.ipynb

+290 -2
Large diffs are not rendered by default.

Diff for: 1.1/_downloads/08fc4f471ae40388eb535678346dc9d1/plot_gpc_xor.py

+1 -1

@@ -29,7 +29,7 @@

 # fit the model
 plt.figure(figsize=(10, 5))
-kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0) ** 2]
+kernels = [1.0 * RBF(length_scale=1.15), 1.0 * DotProduct(sigma_0=1.0) ** 2]
 for i, kernel in enumerate(kernels):
     clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)

Diff for: 1.1/_downloads/14f620cd922ca2c9a39ae5784034dd0d/plot_lda.py

+3

@@ -71,20 +71,23 @@ def generate_data(n_samples, n_features):
     linewidth=2,
     label="Linear Discriminant Analysis with Ledoit Wolf",
     color="navy",
+    linestyle="dashed",
 )
 plt.plot(
     features_samples_ratio,
     acc_clf2,
     linewidth=2,
     label="Linear Discriminant Analysis",
     color="gold",
+    linestyle="solid",
 )
 plt.plot(
     features_samples_ratio,
     acc_clf3,
     linewidth=2,
     label="Linear Discriminant Analysis with OAS",
     color="red",
+    linestyle="dotted",
 )

 plt.xlabel("n_features / n_samples")
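
The three labelled curves above compare plain LDA with its Ledoit-Wolf and OAS shrinkage variants. As a hedged, minimal sketch of how such classifiers are typically built (synthetic data and settings, not the example's exact code):

# Minimal sketch, assuming the standard scikit-learn shrinkage options;
# the example's own data generation and plotting loop are not reproduced here.
import numpy as np
from sklearn.covariance import OAS
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(0)
X = rng.randn(60, 20)
y = np.array([0] * 30 + [1] * 30)

clf_lw = LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto")             # Ledoit-Wolf
clf_plain = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=None)            # no shrinkage
clf_oas = LinearDiscriminantAnalysis(solver="lsqr", covariance_estimator=OAS())  # OAS estimate
for clf in (clf_lw, clf_plain, clf_oas):
    print(clf.fit(X, y).score(X, y))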

Diff for: 1.1/_downloads/1bcb2039afa126da41f1cea42b4a5866/plot_gpr_prior_posterior.py

+6 -6

@@ -158,7 +158,7 @@ def plot_gpr_samples(gpr_model, n_samples, ax):
 )

 # %%
-# Periodic kernel
+# Exp-Sine-Squared kernel
 # ...............
 from sklearn.gaussian_process.kernels import ExpSineSquared

@@ -183,7 +183,7 @@ def plot_gpr_samples(gpr_model, n_samples, ax):
 axs[1].legend(bbox_to_anchor=(1.05, 1.5), loc="upper left")
 axs[1].set_title("Samples from posterior distribution")

-fig.suptitle("Periodic kernel", fontsize=18)
+fig.suptitle("Exp-Sine-Squared kernel", fontsize=18)
 plt.tight_layout()

 # %%
@@ -194,7 +194,7 @@ def plot_gpr_samples(gpr_model, n_samples, ax):
 )

 # %%
-# Dot product kernel
+# Dot-product kernel
 # ..................
 from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct

@@ -216,7 +216,7 @@ def plot_gpr_samples(gpr_model, n_samples, ax):
 axs[1].legend(bbox_to_anchor=(1.05, 1.5), loc="upper left")
 axs[1].set_title("Samples from posterior distribution")

-fig.suptitle("Dot product kernel", fontsize=18)
+fig.suptitle("Dot-product kernel", fontsize=18)
 plt.tight_layout()

 # %%
@@ -227,7 +227,7 @@ def plot_gpr_samples(gpr_model, n_samples, ax):
 )

 # %%
-# Mattern kernel
+# Matérn kernel
 # ..............
 from sklearn.gaussian_process.kernels import Matern

@@ -247,7 +247,7 @@ def plot_gpr_samples(gpr_model, n_samples, ax):
 axs[1].legend(bbox_to_anchor=(1.05, 1.5), loc="upper left")
 axs[1].set_title("Samples from posterior distribution")

-fig.suptitle("Mattern kernel", fontsize=18)
+fig.suptitle("Matérn kernel", fontsize=18)
 plt.tight_layout()

 # %%
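
The renamed section titles above refer to specific kernel classes. A minimal, hedged sketch of how those kernels are typically constructed for a Gaussian process (hyperparameter values here are illustrative, not the example's settings):

# Illustrative kernel construction only; hyperparameter values are assumptions.
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (
    ConstantKernel,
    DotProduct,
    ExpSineSquared,
    Matern,
)

kernels = {
    "Exp-Sine-Squared kernel": ExpSineSquared(length_scale=1.0, periodicity=3.0),
    "Dot-product kernel": ConstantKernel(0.1) + DotProduct(sigma_0=1.0) ** 2,
    "Matérn kernel": Matern(length_scale=1.0, nu=1.5),
}
for name, kernel in kernels.items():
    gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
    print(name, "->", gpr.kernel)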

Diff for: 1.1/_downloads/1c4a422dfa5bd721501d19a2b7e2499b/plot_species_kde.ipynb

+1 -1

@@ -15,7 +15,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "\n# Kernel Density Estimate of Species Distributions\nThis shows an example of a neighbors-based query (in particular a kernel\ndensity estimate) on geospatial data, using a Ball Tree built upon the\nHaversine distance metric -- i.e. distances over points in latitude/longitude.\nThe dataset is provided by Phillips et. al. (2006).\nIf available, the example uses\n`basemap <https://matplotlib.org/basemap/>`_\nto plot the coast lines and national boundaries of South America.\n\nThis example does not perform any learning over the data\n(see `sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` for\nan example of classification based on the attributes in this dataset). It\nsimply shows the kernel density estimate of observed data points in\ngeospatial coordinates.\n\nThe two species are:\n\n - `\"Bradypus variegatus\"\n <http://www.iucnredlist.org/apps/redlist/details/3038/0>`_ ,\n the Brown-throated Sloth.\n\n - `\"Microryzomys minutus\"\n <http://www.iucnredlist.org/details/13408/0>`_ ,\n also known as the Forest Small Rice Rat, a rodent that lives in Peru,\n Colombia, Ecuador, Peru, and Venezuela.\n\n## References\n\n * `\"Maximum entropy modeling of species geographic distributions\"\n <http://rob.schapire.net/papers/ecolmod.pdf>`_\n S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,\n 190:231-259, 2006.\n"
+    "\n# Kernel Density Estimate of Species Distributions\nThis shows an example of a neighbors-based query (in particular a kernel\ndensity estimate) on geospatial data, using a Ball Tree built upon the\nHaversine distance metric -- i.e. distances over points in latitude/longitude.\nThe dataset is provided by Phillips et. al. (2006).\nIf available, the example uses\n`basemap <https://matplotlib.org/basemap/>`_\nto plot the coast lines and national boundaries of South America.\n\nThis example does not perform any learning over the data\n(see `sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` for\nan example of classification based on the attributes in this dataset). It\nsimply shows the kernel density estimate of observed data points in\ngeospatial coordinates.\n\nThe two species are:\n\n - `\"Bradypus variegatus\"\n <https://www.iucnredlist.org/species/3038/47437046>`_ ,\n the Brown-throated Sloth.\n\n - `\"Microryzomys minutus\"\n <http://www.iucnredlist.org/details/13408/0>`_ ,\n also known as the Forest Small Rice Rat, a rodent that lives in Peru,\n Colombia, Ecuador, Peru, and Venezuela.\n\n## References\n\n * `\"Maximum entropy modeling of species geographic distributions\"\n <http://rob.schapire.net/papers/ecolmod.pdf>`_\n S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,\n 190:231-259, 2006.\n"
    ]
   },
   {

Diff for: 1.1/_downloads/24475810034a0d0d190a9de0f87d72b5/plot_all_scaling.py

+1 -1

@@ -324,7 +324,7 @@ def make_plot(item_idx):
 #
 # Unlike the previous scalers, the centering and scaling statistics of
 # :class:`~sklearn.preprocessing.RobustScaler`
-# is based on percentiles and are therefore not influenced by a few
+# are based on percentiles and are therefore not influenced by a small
 # number of very large marginal outliers. Consequently, the resulting range of
 # the transformed feature values is larger than for the previous scalers and,
 # more importantly, are approximately similar: for both features most of the
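
The corrected sentence above explains that RobustScaler derives its centering and scaling statistics from percentiles (median and IQR by default), so a handful of extreme values barely moves them. A minimal sketch of that behavior on synthetic data (not the example's dataset):

# Minimal sketch: one extreme outlier barely affects RobustScaler's output,
# while StandardScaler's mean/std are dragged by it.
import numpy as np
from sklearn.preprocessing import RobustScaler, StandardScaler

X = np.array([[1.0], [2.0], [3.0], [4.0], [1000.0]])  # one extreme outlier

print(RobustScaler().fit_transform(X).ravel())
print(StandardScaler().fit_transform(X).ravel())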

Diff for: 1.1/_downloads/2f3ef774a6d7e52e1e6b7ccbb75d25f0/plot_gradient_boosting_quantile.py

+5

@@ -72,6 +72,11 @@ def f(x):
     all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)

 # %%
+# Notice that :class:`~sklearn.ensemble.HistGradientBoostingRegressor` is much
+# faster than :class:`~sklearn.ensemble.GradientBoostingRegressor` starting with
+# intermediate datasets (`n_samples >= 10_000`), which is not the case of the
+# present example.
+#
 # For the sake of comparison, we also fit a baseline model trained with the
 # usual (mean) squared error (MSE).
 gbr_ls = GradientBoostingRegressor(loss="squared_error", **common_params)
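
The note added above recommends HistGradientBoostingRegressor once datasets reach roughly 10,000 samples. A hedged sketch of that drop-in on a larger synthetic dataset (it assumes quantile loss is available for this estimator in the installed scikit-learn version; settings are illustrative, not the example's):

# Hedged sketch: histogram-based quantile regressors on a larger dataset.
# The `loss="quantile"` / `quantile=` parameters are an assumption about the
# installed version; values are illustrative.
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(42)
X = rng.uniform(0, 10, size=(50_000, 1))
y = np.sin(X).ravel() + rng.normal(scale=0.5, size=X.shape[0])

hist_models = {}
for alpha in (0.05, 0.5, 0.95):
    hgbr = HistGradientBoostingRegressor(loss="quantile", quantile=alpha, random_state=0)
    hist_models["q %1.2f" % alpha] = hgbr.fit(X, y)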

Diff for: 1.1/_downloads/3a10dcfbc1a4bf1349c7101a429aa47b/plot_feature_transformation.py

+6 -1

@@ -39,7 +39,7 @@
 from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split

-X, y = make_classification(n_samples=80000, random_state=10)
+X, y = make_classification(n_samples=80_000, random_state=10)

 X_full_train, X_test, y_full_train, y_test = train_test_split(
     X, y, test_size=0.5, random_state=10
@@ -72,6 +72,11 @@
 _ = gradient_boosting.fit(X_train_ensemble, y_train_ensemble)

 # %%
+# Notice that :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is much
+# faster than :class:`~sklearn.ensemble.GradientBoostingClassifier` starting
+# with intermediate datasets (`n_samples >= 10_000`), which is not the case of
+# the present example.
+#
 # The :class:`~sklearn.ensemble.RandomTreesEmbedding` is an unsupervised method
 # and thus does not required to be trained independently.
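
The same caveat is added here for classification. A hedged sketch of swapping in the histogram-based classifier on data of the size created by the updated make_classification call (illustrative settings, not the example's ensemble pipeline):

# Hedged sketch: HistGradientBoostingClassifier as the faster alternative on
# ~80k samples; hyperparameters are illustrative.
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=80_000, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=10)

hgbc = HistGradientBoostingClassifier(random_state=10)
print(hgbc.fit(X_train, y_train).score(X_test, y_test))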

Diff for: 1.1/_downloads/3c3c738275484acc54821615bf72894a/plot_permutation_importance.py

+1 -1

@@ -58,7 +58,7 @@
 # We define a predictive model based on a random forest. Therefore, we will make
 # the following preprocessing steps:
 #
-# - use :class:`~sklearn.preprocessing.OrdinaleEcnoder` to encode the
+# - use :class:`~sklearn.preprocessing.OrdinalEncoder` to encode the
 #   categorical features;
 # - use :class:`~sklearn.impute.SimpleImputer` to fill missing values for
 #   numerical features using a mean strategy.
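
The corrected bullet names OrdinalEncoder. A minimal, hedged sketch of the two preprocessing steps the comment describes, wired into a random forest (column names and hyperparameters are assumptions, not the example's exact pipeline):

# Hedged sketch of the described preprocessing; the column lists below are
# assumed names for illustration only.
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = ["pclass", "sex", "embarked"]     # assumed
numerical_columns = ["age", "sibsp", "parch", "fare"]   # assumed

preprocessing = ColumnTransformer(
    [
        (
            "cat",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            categorical_columns,
        ),
        ("num", SimpleImputer(strategy="mean"), numerical_columns),
    ]
)
model = make_pipeline(preprocessing, RandomForestClassifier(random_state=42))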

Diff for: 1.1/_downloads/3ed102fa8211c8d36f2331f0c5e1dcef/plot_model_complexity_influence.ipynb

+1 -1

@@ -15,7 +15,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "\n# Model Complexity Influence\n\nDemonstrate how model complexity influences both prediction accuracy and\ncomputational performance.\n\nWe will be using two datasets:\n - `diabetes_dataset` for regression.\n This dataset consists of 10 measurements taken from diabetes patients.\n The task is to predict disease progression;\n - `20newsgroups_dataset` for classification. This dataset consists of\n newsgroup posts. The task is to predict on which topic (out of 20 topics)\n the post is written about.\n\nWe will model the complexity influence on three different estimators:\n - :class:`~sklearn.linear_model.SGDClassifier` (for classification data)\n which implements stochastic gradient descent learning;\n\n - :class:`~sklearn.svm.NuSVR` (for regression data) which implements\n Nu support vector regression;\n\n - :class:`~sklearn.ensemble.GradientBoostingRegressor` (for regression\n data) which builds an additive model in a forward stage-wise fashion.\n\n\nWe make the model complexity vary through the choice of relevant model\nparameters in each of our selected models. Next, we will measure the influence\non both computational performance (latency) and predictive power (MSE or\nHamming Loss).\n"
+    "\n# Model Complexity Influence\n\nDemonstrate how model complexity influences both prediction accuracy and\ncomputational performance.\n\nWe will be using two datasets:\n - `diabetes_dataset` for regression.\n This dataset consists of 10 measurements taken from diabetes patients.\n The task is to predict disease progression;\n - `20newsgroups_dataset` for classification. This dataset consists of\n newsgroup posts. The task is to predict on which topic (out of 20 topics)\n the post is written about.\n\nWe will model the complexity influence on three different estimators:\n - :class:`~sklearn.linear_model.SGDClassifier` (for classification data)\n which implements stochastic gradient descent learning;\n\n - :class:`~sklearn.svm.NuSVR` (for regression data) which implements\n Nu support vector regression;\n\n - :class:`~sklearn.ensemble.GradientBoostingRegressor` builds an additive\n model in a forward stage-wise fashion. Notice that\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor` is much faster\n than :class:`~sklearn.ensemble.GradientBoostingRegressor` starting with\n intermediate datasets (`n_samples >= 10_000`), which is not the case for\n this example.\n\n\nWe make the model complexity vary through the choice of relevant model\nparameters in each of our selected models. Next, we will measure the influence\non both computational performance (latency) and predictive power (MSE or\nHamming Loss).\n"
    ]
   },
   {

Diff for: 1.1/_downloads/473e94775f7181f54536fbb1f45b9e42/plot_agglomerative_clustering.py

+1 -1

@@ -6,7 +6,7 @@
 local structure in the data. The graph is simply the graph of 20 nearest
 neighbors.

-Two consequences of imposing a connectivity can be seen. First, clustering
+There are two advantages of imposing a connectivity. First, clustering
 without a connectivity matrix is much faster.

 Second, when using a connectivity matrix, single, average and complete
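
The reworded sentence concerns imposing a connectivity graph on agglomerative clustering. A minimal, hedged sketch of the construction it alludes to, a k-nearest-neighbors graph passed as the connectivity constraint (synthetic data, not the example's):

# Hedged sketch: structured vs. unstructured agglomerative clustering
# (synthetic data; 20-neighbors graph as in the surrounding description).
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

rng = np.random.RandomState(0)
X = rng.randn(1500, 2)

connectivity = kneighbors_graph(X, n_neighbors=20, include_self=False)

unstructured = AgglomerativeClustering(n_clusters=3, linkage="average").fit(X)
structured = AgglomerativeClustering(
    n_clusters=3, linkage="average", connectivity=connectivity
).fit(X)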

Diff for: 1.1/_downloads/4825fc8223d1af0f3b61080c3dea3a62/plot_faces_decomposition.py

+1 -1

@@ -79,7 +79,7 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):


 # %%
-# Lets take a look at our data. Gray color indicates negative values,
+# Let's take a look at our data. Gray color indicates negative values,
 # white indicates positive values.

 plot_gallery("Faces from dataset", faces_centered[:n_components])

Diff for: 1.1/_downloads/57163227aeb4c19ca4c69b87a8d1949c/plot_learning_curve.py

+16 -1

@@ -35,6 +35,7 @@ def plot_learning_curve(
     ylim=None,
     cv=None,
     n_jobs=None,
+    scoring=None,
     train_sizes=np.linspace(0.1, 1.0, 5),
 ):
     """
@@ -86,6 +87,11 @@ def plot_learning_curve(
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.

+    scoring : str or callable, default=None
+        A str (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
     train_sizes : array-like of shape (n_ticks,)
         Relative or absolute numbers of training examples that will be used to
         generate the learning curve. If the ``dtype`` is float, it is regarded
@@ -109,6 +115,7 @@ def plot_learning_curve(
         estimator,
         X,
         y,
+        scoring=scoring,
         cv=cv,
         n_jobs=n_jobs,
         train_sizes=train_sizes,
@@ -189,7 +196,15 @@ def plot_learning_curve(

 estimator = GaussianNB()
 plot_learning_curve(
-    estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), cv=cv, n_jobs=4
+    estimator,
+    title,
+    X,
+    y,
+    axes=axes[:, 0],
+    ylim=(0.7, 1.01),
+    cv=cv,
+    n_jobs=4,
+    scoring="accuracy",
 )

 title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
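
The scoring argument added to the helper above is forwarded to sklearn.model_selection.learning_curve. A minimal, hedged sketch of that underlying call with an explicit scorer (data and CV settings are illustrative):

# Minimal sketch: learning_curve with an explicit `scoring` string, the
# parameter the updated helper now forwards. Data and CV are illustrative.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.naive_bayes import GaussianNB

X, y = load_digits(return_X_y=True)
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

train_sizes, train_scores, test_scores = learning_curve(
    GaussianNB(),
    X,
    y,
    cv=cv,
    n_jobs=4,
    scoring="accuracy",
    train_sizes=np.linspace(0.1, 1.0, 5),
)
print(test_scores.mean(axis=1))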
