
Commit 0b66a9b

Pushing the docs to 1.1/ for branch: 1.1.X, commit c987b5ca84610bf5251ea8fa33b48c5826942a0d
1 parent 86f3ce5 commit 0b66a9b


1,384 files changed: +11,827 -9,468 lines changed


Diff for: 1.1/.buildinfo

+1 -1

@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 44afcf8dd215cc5d065a44ea3a818dd0
+config: d0bdad28d397ffd9f93c5892709f416d
 tags: 645f666f9bcd5a90fca523b33c5a78b7

Diff for: 1.1/_downloads/4cf0456267ced0f869a458ef4776d4c5/plot_release_highlights_1_1_0.py

+63 -1

@@ -50,7 +50,7 @@
 ax.plot(X_1d, y, "o", alpha=0.5, markersize=1)
 for quantile, hist in hist_quantiles.items():
     ax.plot(X_1d, hist.predict(X), label=quantile)
-ax.legend(loc="lower left")
+_ = ax.legend(loc="lower left")
 
 
 # %%
@@ -96,6 +96,7 @@
 
 log_reg_input_features = log_reg[:-1].get_feature_names_out()
 pd.Series(log_reg[-1].coef_.ravel(), index=log_reg_input_features).plot.bar()
+plt.tight_layout()
 
 
 # %%
@@ -161,3 +162,64 @@
 # - :class:`linear_model.GammaRegressor`
 # - :class:`linear_model.PoissonRegressor`
 # - :class:`linear_model.TweedieRegressor`
+
+# %%
+# MiniBatchNMF: an online version of NMF
+# --------------------------------------
+# The new class :class:`decomposition.MiniBatchNMF` implements a faster but less
+# accurate version of non-negative matrix factorization (:class:`decomposition.NMF`).
+# :class:`MiniBatchNMF` divides the data into mini-batches and optimizes the NMF model
+# in an online manner by cycling over the mini-batches, making it better suited for
+# large datasets. In particular, it implements `partial_fit`, which can be used for
+# online learning when the data is not readily available from the start, or when the
+# data does not fit into memory.
+import numpy as np
+from sklearn.decomposition import MiniBatchNMF
+
+rng = np.random.RandomState(0)
+n_samples, n_features, n_components = 10, 10, 5
+true_W = rng.uniform(size=(n_samples, n_components))
+true_H = rng.uniform(size=(n_components, n_features))
+X = true_W @ true_H
+
+nmf = MiniBatchNMF(n_components=n_components, random_state=0)
+
+for _ in range(10):
+    nmf.partial_fit(X)
+
+W = nmf.transform(X)
+H = nmf.components_
+X_reconstructed = W @ H
+
+print(
+    f"relative reconstruction error: ",
+    f"{np.sum((X - X_reconstructed) ** 2) / np.sum(X**2):.5f}",
+)
+
+# %%
+# BisectingKMeans: divide and cluster
+# -----------------------------------
+# The new class :class:`cluster.BisectingKMeans` is a variant of :class:`KMeans`, using
+# divisive hierarchical clustering. Instead of creating all centroids at once, centroids
+# are picked progressively based on a previous clustering: a cluster is split into two
+# new clusters repeatedly until the target number of clusters is reached, giving a
+# hierarchical structure to the clustering.
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans, BisectingKMeans
+import matplotlib.pyplot as plt
+
+X, _ = make_blobs(n_samples=1000, centers=2, random_state=0)
+
+km = KMeans(n_clusters=5, random_state=0).fit(X)
+bisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)
+
+fig, ax = plt.subplots(1, 2, figsize=(10, 5))
+ax[0].scatter(X[:, 0], X[:, 1], s=10, c=km.labels_)
+ax[0].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=20, c="r")
+ax[0].set_title("KMeans")
+
+ax[1].scatter(X[:, 0], X[:, 1], s=10, c=bisect_km.labels_)
+ax[1].scatter(
+    bisect_km.cluster_centers_[:, 0], bisect_km.cluster_centers_[:, 1], s=20, c="r"
+)
+_ = ax[1].set_title("BisectingKMeans")
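The MiniBatchNMF example added above calls partial_fit repeatedly on the same small matrix to illustrate the API. For the out-of-core scenario the added comment describes (data that does not fit into memory), a minimal sketch of the intended usage could look like the following; the synthetic data and the chunk size are illustrative assumptions, not part of this commit:

import numpy as np
from sklearn.decomposition import MiniBatchNMF

rng = np.random.RandomState(0)
X_big = rng.uniform(size=(10_000, 10))  # stand-in for a matrix streamed from disk

nmf = MiniBatchNMF(n_components=5, random_state=0)

# Feed the estimator one chunk of rows at a time instead of the full matrix.
chunk_size = 500  # illustrative value, not taken from the commit
for start in range(0, X_big.shape[0], chunk_size):
    nmf.partial_fit(X_big[start:start + chunk_size])

# After fitting, any chunk can be projected onto the learned components.
W_chunk = nmf.transform(X_big[:chunk_size])
H = nmf.components_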

Diff for: 1.1/_downloads/68fdea23e50d165632d4bd4e36453cd5/plot_release_highlights_1_1_0.ipynb

+38 -2

@@ -33,7 +33,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.datasets import make_regression\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Simple regression function for X * cos(X)\nrng = np.random.RandomState(42)\nX_1d = np.linspace(0, 10, num=2000)\nX = X_1d.reshape(-1, 1)\ny = X_1d * np.cos(X_1d) + rng.normal(scale=X_1d / 3)\n\nquantiles = [0.95, 0.5, 0.05]\nparameters = dict(loss=\"quantile\", max_bins=32, max_iter=50)\nhist_quantiles = {\n    f\"quantile={quantile:.2f}\": HistGradientBoostingRegressor(\n        **parameters, quantile=quantile\n    ).fit(X, y)\n    for quantile in quantiles\n}\n\nfig, ax = plt.subplots()\nax.plot(X_1d, y, \"o\", alpha=0.5, markersize=1)\nfor quantile, hist in hist_quantiles.items():\n    ax.plot(X_1d, hist.predict(X), label=quantile)\nax.legend(loc=\"lower left\")"
+"from sklearn.datasets import make_regression\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Simple regression function for X * cos(X)\nrng = np.random.RandomState(42)\nX_1d = np.linspace(0, 10, num=2000)\nX = X_1d.reshape(-1, 1)\ny = X_1d * np.cos(X_1d) + rng.normal(scale=X_1d / 3)\n\nquantiles = [0.95, 0.5, 0.05]\nparameters = dict(loss=\"quantile\", max_bins=32, max_iter=50)\nhist_quantiles = {\n    f\"quantile={quantile:.2f}\": HistGradientBoostingRegressor(\n        **parameters, quantile=quantile\n    ).fit(X, y)\n    for quantile in quantiles\n}\n\nfig, ax = plt.subplots()\nax.plot(X_1d, y, \"o\", alpha=0.5, markersize=1)\nfor quantile, hist in hist_quantiles.items():\n    ax.plot(X_1d, hist.predict(X), label=quantile)\n_ = ax.legend(loc=\"lower left\")"
 ]
 },
 {
@@ -69,7 +69,7 @@
 },
 "outputs": [],
 "source": [
-"import pandas as pd\n\nlog_reg_input_features = log_reg[:-1].get_feature_names_out()\npd.Series(log_reg[-1].coef_.ravel(), index=log_reg_input_features).plot.bar()"
+"import pandas as pd\n\nlog_reg_input_features = log_reg[:-1].get_feature_names_out()\npd.Series(log_reg[-1].coef_.ravel(), index=log_reg_input_features).plot.bar()\nplt.tight_layout()"
 ]
 },
 {
@@ -114,6 +114,42 @@
 "source": [
 "## Performance improvements\nReductions on pairwise distances for dense float64 datasets has been refactored\nto better take advantage of non-blocking thread parallelism. For example,\n:meth:`neighbors.NearestNeighbors.kneighbors` and\n:meth:`neighbors.NearestNeighbors.radius_neighbors` can respectively be up to \u00d720 and\n\u00d75 faster than previously. In summary, the following functions and estimators\nnow benefit from improved performance:\n\n- :func:`metrics.pairwise_distances_argmin`\n- :func:`metrics.pairwise_distances_argmin_min`\n- :class:`cluster.AffinityPropagation`\n- :class:`cluster.Birch`\n- :class:`cluster.MeanShift`\n- :class:`cluster.OPTICS`\n- :class:`cluster.SpectralClustering`\n- :func:`feature_selection.mutual_info_regression`\n- :class:`neighbors.KNeighborsClassifier`\n- :class:`neighbors.KNeighborsRegressor`\n- :class:`neighbors.RadiusNeighborsClassifier`\n- :class:`neighbors.RadiusNeighborsRegressor`\n- :class:`neighbors.LocalOutlierFactor`\n- :class:`neighbors.NearestNeighbors`\n- :class:`manifold.Isomap`\n- :class:`manifold.LocallyLinearEmbedding`\n- :class:`manifold.TSNE`\n- :func:`manifold.trustworthiness`\n- :class:`semi_supervised.LabelPropagation`\n- :class:`semi_supervised.LabelSpreading`\n\nTo know more about the technical details of this work, you can read\n`this suite of blog posts <https://blog.scikit-learn.org/technical/performances/>`_.\n\nMoreover, the computation of loss functions has been refactored using\nCython resulting in performance improvements for the following estimators:\n\n- :class:`linear_model.LogisticRegression`\n- :class:`linear_model.GammaRegressor`\n- :class:`linear_model.PoissonRegressor`\n- :class:`linear_model.TweedieRegressor`\n\n"
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## MiniBatchNMF: an online version of NMF\nThe new class :class:`decomposition.MiniBatchNMF` implements a faster but less\naccurate version of non-negative matrix factorization (:class:`decomposition.NMF`).\n:class:`MiniBatchNMF` divides the data into mini-batches and optimizes the NMF model\nin an online manner by cycling over the mini-batches, making it better suited for\nlarge datasets. In particular, it implements `partial_fit`, which can be used for\nonline learning when the data is not readily available from the start, or when the\ndata does not fit into memory.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\nfrom sklearn.decomposition import MiniBatchNMF\n\nrng = np.random.RandomState(0)\nn_samples, n_features, n_components = 10, 10, 5\ntrue_W = rng.uniform(size=(n_samples, n_components))\ntrue_H = rng.uniform(size=(n_components, n_features))\nX = true_W @ true_H\n\nnmf = MiniBatchNMF(n_components=n_components, random_state=0)\n\nfor _ in range(10):\n    nmf.partial_fit(X)\n\nW = nmf.transform(X)\nH = nmf.components_\nX_reconstructed = W @ H\n\nprint(\n    f\"relative reconstruction error: \",\n    f\"{np.sum((X - X_reconstructed) ** 2) / np.sum(X**2):.5f}\",\n)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## BisectingKMeans: divide and cluster\nThe new class :class:`cluster.BisectingKMeans` is a variant of :class:`KMeans`, using\ndivisive hierarchical clustering. Instead of creating all centroids at once, centroids\nare picked progressively based on a previous clustering: a cluster is split into two\nnew clusters repeatedly until the target number of clusters is reached, giving a\nhierarchical structure to the clustering.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.datasets import make_blobs\nfrom sklearn.cluster import KMeans, BisectingKMeans\nimport matplotlib.pyplot as plt\n\nX, _ = make_blobs(n_samples=1000, centers=2, random_state=0)\n\nkm = KMeans(n_clusters=5, random_state=0).fit(X)\nbisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)\n\nfig, ax = plt.subplots(1, 2, figsize=(10, 5))\nax[0].scatter(X[:, 0], X[:, 1], s=10, c=km.labels_)\nax[0].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=20, c=\"r\")\nax[0].set_title(\"KMeans\")\n\nax[1].scatter(X[:, 0], X[:, 1], s=10, c=bisect_km.labels_)\nax[1].scatter(\n    bisect_km.cluster_centers_[:, 0], bisect_km.cluster_centers_[:, 1], s=20, c=\"r\"\n)\n_ = ax[1].set_title(\"BisectingKMeans\")"
+]
 }
 ],
 "metadata": {
