Commit c677d88

Pushing the docs to 1.2/ for branch: 1.2.X, commit c1cfc4d4f36f9c00413e20d0ef85bed208a502ca
1 parent 93880d9 commit c677d88

File tree: 3,970 files changed (+866797 -0 lines changed)

Diff for: 1.2/.buildinfo (+4 lines)

@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 6bd680f9b86afba8d49c158b84a862d3
tags: 645f666f9bcd5a90fca523b33c5a78b7

@@ -0,0 +1,72 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Segmenting the picture of Greek coins in regions\n\nThis example uses `spectral_clustering` on a graph created from\nvoxel-to-voxel difference on an image to break this image into multiple\npartly-homogeneous regions.\n\nThis procedure (spectral clustering on an image) is an efficient\napproximate solution for finding normalized graph cuts.\n\nThere are three options to assign labels:\n\n* 'kmeans' spectral clustering clusters samples in the embedding space\n  using a kmeans algorithm\n* 'discretize' iteratively searches for the closest partition\n  space to the embedding space of spectral clustering.\n* 'cluster_qr' assigns labels using the QR factorization with pivoting\n  that directly determines the partition in the embedding space.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Gael Varoquaux <[email protected]>\n# Brian Cheung\n# Andrew Knyazev <[email protected]>\n# License: BSD 3 clause\n\nimport time\n\nimport numpy as np\nfrom scipy.ndimage import gaussian_filter\nimport matplotlib.pyplot as plt\nfrom skimage.data import coins\nfrom skimage.transform import rescale\n\nfrom sklearn.feature_extraction import image\nfrom sklearn.cluster import spectral_clustering\n\n\n# Load the coins as a numpy array\norig_coins = coins()\n\n# Resize it to 20% of the original size to speed up the processing.\n# Applying a Gaussian filter for smoothing prior to down-scaling\n# reduces aliasing artifacts.\nsmoothened_coins = gaussian_filter(orig_coins, sigma=2)\nrescaled_coins = rescale(smoothened_coins, 0.2, mode=\"reflect\", anti_aliasing=False)\n\n# Convert the image into a graph with the value of the gradient on the\n# edges.\ngraph = image.img_to_graph(rescaled_coins)\n\n# Take a decreasing function of the gradient: an exponential.\n# The smaller beta is, the more independent the segmentation is of the\n# actual image. For beta=1, the segmentation is close to a Voronoi tessellation.\nbeta = 10\neps = 1e-6\ngraph.data = np.exp(-beta * graph.data / graph.data.std()) + eps\n\n# The number of segmented regions to display needs to be chosen manually.\n# The current version of 'spectral_clustering' does not support determining\n# the number of good quality clusters automatically.\nn_regions = 26"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Compute and visualize the resulting regions\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Computing a few extra eigenvectors may speed up the eigen_solver.\n# The spectral clustering quality may also benefit from requesting\n# extra regions for segmentation.\nn_regions_plus = 3\n\n# Apply spectral clustering using the default eigen_solver='arpack'.\n# Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'.\n# Choosing eigen_solver='amg' requires an extra package called 'pyamg'.\n# The quality of segmentation and the speed of calculations is mostly determined\n# by the choice of the solver and the value of the tolerance 'eigen_tol'.\n# TODO: varying eigen_tol seems to have no effect for 'lobpcg' and 'amg' #21243.\nfor assign_labels in (\"kmeans\", \"discretize\", \"cluster_qr\"):\n    t0 = time.time()\n    labels = spectral_clustering(\n        graph,\n        n_clusters=(n_regions + n_regions_plus),\n        eigen_tol=1e-7,\n        assign_labels=assign_labels,\n        random_state=42,\n    )\n\n    t1 = time.time()\n    labels = labels.reshape(rescaled_coins.shape)\n    plt.figure(figsize=(5, 5))\n    plt.imshow(rescaled_coins, cmap=plt.cm.gray)\n\n    plt.xticks(())\n    plt.yticks(())\n    title = \"Spectral clustering: %s, %.2fs\" % (assign_labels, (t1 - t0))\n    print(title)\n    plt.title(title)\n    for l in range(n_regions):\n        colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))]\n        plt.contour(labels == l, colors=colors)\n        # To view individual segments as they appear, add plt.pause(0.5) here.\nplt.show()\n\n# TODO: After #21194 is merged and #21243 is fixed, check which eigen_solver\n# is the best and set eigen_solver='arpack', 'lobpcg', or 'amg' and eigen_tol\n# explicitly in this example."
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.15"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
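
The markdown cell above lists three label-assignment strategies for `spectral_clustering`. The following is a minimal standalone sketch, not part of the committed notebook, that compares the three strategies on a small synthetic image so the idea can be tried without scikit-image; the 40x40 blob image and n_clusters=4 are assumptions made purely for illustration.

# Sketch only: compare assign_labels strategies on a tiny synthetic image.
import numpy as np

from sklearn.cluster import spectral_clustering
from sklearn.feature_extraction import image

# Build a small image with four bright square blobs on a dark background.
img = np.zeros((40, 40))
for r, c in [(5, 5), (5, 25), (25, 5), (25, 25)]:
    img[r : r + 10, c : c + 10] = 1.0
img += 0.05 * np.random.default_rng(0).standard_normal(img.shape)

# Same graph construction as in the notebook: pixel gradients on the edges,
# turned into affinities with a decreasing exponential.
graph = image.img_to_graph(img)
beta = 10
eps = 1e-6
graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps

for assign_labels in ("kmeans", "discretize", "cluster_qr"):
    labels = spectral_clustering(
        graph, n_clusters=4, assign_labels=assign_labels, random_state=42
    )
    labels = labels.reshape(img.shape)
    # All three strategies should recover the four blobs (up to label permutation).
    print(assign_labels, np.unique(labels))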

@@ -0,0 +1,87 @@
"""
=======================================================================
Plot the decision surface of decision trees trained on the iris dataset
=======================================================================

Plot the decision surface of a decision tree trained on pairs
of features of the iris dataset.

See :ref:`decision tree <tree>` for more information on the estimator.

For each pair of iris features, the decision tree learns decision
boundaries made of combinations of simple thresholding rules inferred from
the training samples.

We also show the tree structure of a model built on all of the features.
"""

# %%
# First load the copy of the Iris dataset shipped with scikit-learn:
from sklearn.datasets import load_iris

iris = load_iris()


# %%
# Display the decision functions of trees trained on all pairs of features.
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay


# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02


for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    ax = plt.subplot(2, 3, pairidx + 1)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=plt.cm.RdYlBu,
        response_method="predict",
        ax=ax,
        xlabel=iris.feature_names[pair[0]],
        ylabel=iris.feature_names[pair[1]],
    )

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(
            X[idx, 0],
            X[idx, 1],
            c=color,
            label=iris.target_names[i],
            cmap=plt.cm.RdYlBu,
            edgecolor="black",
            s=15,
        )

plt.suptitle("Decision surface of decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")

# %%
# Display the structure of a single decision tree trained on all the features
# together.
from sklearn.tree import plot_tree

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title("Decision tree trained on all the iris features")
plt.show()
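
The docstring above describes the learned decision boundaries as "combinations of simple thresholding rules inferred from the training samples". The small standalone sketch below, not part of the committed file, makes those rules visible as text with sklearn.tree.export_text; the choice of the first feature pair and max_depth=3 are assumptions made only to keep the printout short.

# Sketch only: print the threshold rules learned by a shallow tree.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

iris = load_iris()
# Use the first feature pair (sepal length, sepal width), as in the first panel above.
X, y = iris.data[:, [0, 1]], iris.target

clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

# Each line is one threshold test of the form "feature <= value";
# the leaves report the predicted class index.
print(export_text(clf, feature_names=[iris.feature_names[0], iris.feature_names[1]]))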

@@ -0,0 +1,132 @@
"""
=============================================================
Receiver Operating Characteristic (ROC) with cross validation
=============================================================

This example shows how to estimate and visualize the variance of the Receiver
Operating Characteristic (ROC) metric using cross-validation.

ROC curves typically feature true positive rate (TPR) on the Y axis, and false
positive rate (FPR) on the X axis. This means that the top left corner of the
plot is the "ideal" point - a FPR of zero, and a TPR of one. This is not very
realistic, but it does mean that a larger Area Under the Curve (AUC) is usually
better. The "steepness" of ROC curves is also important, since it is ideal to
maximize the TPR while minimizing the FPR.

This example shows the ROC response of different datasets, created from K-fold
cross-validation. Taking all of these curves, it is possible to calculate the
mean AUC, and see the variance of the curve when the
training set is split into different subsets. This roughly shows how the
classifier output is affected by changes in the training data, and how different
the splits generated by K-fold cross-validation are from one another.

.. note::

    See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for a
    complement of the present example explaining the averaging strategies to
    generalize the metrics for multiclass classifiers.
"""

# %%
# Load and prepare data
# =====================
#
# We import the :ref:`iris_dataset` which contains 3 classes, each one
# corresponding to a type of iris plant. One class is linearly separable from
# the other 2; the latter are **not** linearly separable from each other.
#
# In the following we binarize the dataset by dropping the "virginica" class
# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
# regarded as the positive class and "setosa" as the negative class
# (`class_id=0`).

import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
target_names = iris.target_names
X, y = iris.data, iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# %%
# We also add noisy features to make the problem harder.
random_state = np.random.RandomState(0)
X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)

# %%
# Classification and ROC analysis
# -------------------------------
#
# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
# plot the ROC curves fold-wise. Notice that the baseline to define the chance
# level (dashed ROC curve) is a classifier that would always predict the most
# frequent class.

import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X[test],
        y[test],
        name=f"ROC fold {fold}",
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()
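
The script above builds the mean AUC and its spread by hand from per-fold curves interpolated onto a common FPR grid. As a rough cross-check, not part of the committed file, the per-fold AUCs can also be obtained directly with cross_val_score and the "roc_auc" scorer; the sketch below assumes the same X, y, classifier and cv objects defined in the script and only reports the AUC summary, not the averaged curve or the shaded band.

# Sketch only: reuses X, y, classifier and cv from the script above.
import numpy as np
from sklearn.model_selection import cross_val_score

fold_aucs = cross_val_score(classifier, X, y, cv=cv, scoring="roc_auc")
print("AUC per fold:", np.round(fold_aucs, 3))
print("Mean AUC = %0.2f +/- %0.2f" % (fold_aucs.mean(), fold_aucs.std()))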

@@ -0,0 +1,62 @@
"""
=================================================
Concatenating multiple feature extraction methods
=================================================

In many real-world examples, there are many ways to extract features from a
dataset. Often it is beneficial to combine several methods to obtain good
performance. This example shows how to use ``FeatureUnion`` to combine
features obtained by PCA and univariate selection.

Combining features using this transformer has the benefit that it allows
cross validation and grid searches over the whole process.

The combination used in this example is not particularly helpful on this
dataset and is only used to illustrate the usage of FeatureUnion.

"""

# Author: Andreas Mueller <[email protected]>
#
# License: BSD 3 clause

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

iris = load_iris()

X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:
pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(
    features__pca__n_components=[1, 2, 3],
    features__univ_select__k=[1, 2],
    svm__C=[0.1, 1, 10],
)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
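
The docstring notes that combining the transformers in a ``FeatureUnion`` allows cross validation and grid searches over the whole process. The short sketch below, not part of the committed file, continues from the fitted grid_search above and shows how the same nested ``step__substep__param`` names used in param_grid also address best_params_ and set_params; the parameter values passed to set_params are illustrative assumptions, not the actual best parameters of any particular run.

# Sketch only: continues from the grid search above.
print(grid_search.best_params_)

# The same dotted names used in param_grid work with set_params, so the
# composite pipeline can be reconfigured without rebuilding it:
pipeline.set_params(features__pca__n_components=3, svm__C=10).fit(X, y)
print(pipeline.score(X, y))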
