Commit c677d88

Pushing the docs to 1.2/ for branch: 1.2.X, commit c1cfc4d4f36f9c00413e20d0ef85bed208a502ca
1 parent 93880d9 commit c677d88

File tree: 3,970 files changed (+866797 -0 lines changed)

Diff for: 1.2/.buildinfo (+4 lines)

@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 6bd680f9b86afba8d49c158b84a862d3
tags: 645f666f9bcd5a90fca523b33c5a78b7

@@ -0,0 +1,72 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Segmenting the picture of Greek coins in regions\n\nThis example uses `spectral_clustering` on a graph created from\nvoxel-to-voxel difference on an image to break this image into multiple\npartly-homogeneous regions.\n\nThis procedure (spectral clustering on an image) is an efficient\napproximate solution for finding normalized graph cuts.\n\nThere are three options to assign labels:\n\n* 'kmeans' spectral clustering clusters samples in the embedding space\n  using a kmeans algorithm\n* 'discretize' iteratively searches for the closest partition\n  space to the embedding space of spectral clustering.\n* 'cluster_qr' assigns labels using the QR factorization with pivoting\n  that directly determines the partition in the embedding space.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Gael Varoquaux <[email protected]>\n# Brian Cheung\n# Andrew Knyazev <[email protected]>\n# License: BSD 3 clause\n\nimport time\n\nimport numpy as np\nfrom scipy.ndimage import gaussian_filter\nimport matplotlib.pyplot as plt\nfrom skimage.data import coins\nfrom skimage.transform import rescale\n\nfrom sklearn.feature_extraction import image\nfrom sklearn.cluster import spectral_clustering\n\n\n# Load the coins as a numpy array\norig_coins = coins()\n\n# Resize it to 20% of the original size to speed up the processing.\n# Applying a Gaussian filter for smoothing prior to down-scaling\n# reduces aliasing artifacts.\nsmoothened_coins = gaussian_filter(orig_coins, sigma=2)\nrescaled_coins = rescale(smoothened_coins, 0.2, mode=\"reflect\", anti_aliasing=False)\n\n# Convert the image into a graph with the value of the gradient on the\n# edges.\ngraph = image.img_to_graph(rescaled_coins)\n\n# Take a decreasing function of the gradient: an exponential.\n# The smaller beta is, the more independent the segmentation is of the\n# actual image. For beta=1, the segmentation is close to a Voronoi tessellation.\nbeta = 10\neps = 1e-6\ngraph.data = np.exp(-beta * graph.data / graph.data.std()) + eps\n\n# The number of segmented regions to display needs to be chosen manually.\n# The current version of 'spectral_clustering' does not support determining\n# the number of good quality clusters automatically.\nn_regions = 26"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Compute and visualize the resulting regions\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Computing a few extra eigenvectors may speed up the eigen_solver.\n# The spectral clustering quality may also benefit from requesting\n# extra regions for segmentation.\nn_regions_plus = 3\n\n# Apply spectral clustering using the default eigen_solver='arpack'.\n# Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'.\n# Choosing eigen_solver='amg' requires an extra package called 'pyamg'.\n# The quality of segmentation and the speed of calculations is mostly determined\n# by the choice of the solver and the value of the tolerance 'eigen_tol'.\n# TODO: varying eigen_tol seems to have no effect for 'lobpcg' and 'amg' #21243.\nfor assign_labels in (\"kmeans\", \"discretize\", \"cluster_qr\"):\n    t0 = time.time()\n    labels = spectral_clustering(\n        graph,\n        n_clusters=(n_regions + n_regions_plus),\n        eigen_tol=1e-7,\n        assign_labels=assign_labels,\n        random_state=42,\n    )\n\n    t1 = time.time()\n    labels = labels.reshape(rescaled_coins.shape)\n    plt.figure(figsize=(5, 5))\n    plt.imshow(rescaled_coins, cmap=plt.cm.gray)\n\n    plt.xticks(())\n    plt.yticks(())\n    title = \"Spectral clustering: %s, %.2fs\" % (assign_labels, (t1 - t0))\n    print(title)\n    plt.title(title)\n    for l in range(n_regions):\n        colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))]\n        plt.contour(labels == l, colors=colors)\n        # To view individual segments as they appear, add plt.pause(0.5) here.\nplt.show()\n\n# TODO: After #21194 is merged and #21243 is fixed, check which eigen_solver\n# is the best and set eigen_solver='arpack', 'lobpcg', or 'amg' and eigen_tol\n# explicitly in this example."
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.15"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
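
The markdown cell above lists three label-assignment strategies for `spectral_clustering`. The following is a minimal standalone sketch, not part of the committed notebook, that compares the three strategies on a small synthetic image so the idea can be tried without scikit-image; the 40x40 blob image and n_clusters=4 are assumptions made purely for illustration.

# Sketch only: compare assign_labels strategies on a tiny synthetic image.
import numpy as np

from sklearn.cluster import spectral_clustering
from sklearn.feature_extraction import image

# Build a small image with four bright square blobs on a dark background.
img = np.zeros((40, 40))
for r, c in [(5, 5), (5, 25), (25, 5), (25, 25)]:
    img[r : r + 10, c : c + 10] = 1.0
img += 0.05 * np.random.default_rng(0).standard_normal(img.shape)

# Same graph construction as in the notebook: pixel gradients on the edges,
# turned into affinities with a decreasing exponential.
graph = image.img_to_graph(img)
beta = 10
eps = 1e-6
graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps

for assign_labels in ("kmeans", "discretize", "cluster_qr"):
    labels = spectral_clustering(
        graph, n_clusters=4, assign_labels=assign_labels, random_state=42
    )
    labels = labels.reshape(img.shape)
    # All three strategies should recover the four blobs (up to label permutation).
    print(assign_labels, np.unique(labels))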

@@ -0,0 +1,87 @@
"""
=======================================================================
Plot the decision surface of decision trees trained on the iris dataset
=======================================================================

Plot the decision surface of a decision tree trained on pairs
of features of the iris dataset.

See :ref:`decision tree <tree>` for more information on the estimator.

For each pair of iris features, the decision tree learns decision
boundaries made of combinations of simple thresholding rules inferred from
the training samples.

We also show the tree structure of a model built on all of the features.
"""

# %%
# First load the copy of the Iris dataset shipped with scikit-learn:
from sklearn.datasets import load_iris

iris = load_iris()


# %%
# Display the decision functions of trees trained on all pairs of features.
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay


# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02


for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    ax = plt.subplot(2, 3, pairidx + 1)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=plt.cm.RdYlBu,
        response_method="predict",
        ax=ax,
        xlabel=iris.feature_names[pair[0]],
        ylabel=iris.feature_names[pair[1]],
    )

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(
            X[idx, 0],
            X[idx, 1],
            c=color,
            label=iris.target_names[i],
            cmap=plt.cm.RdYlBu,
            edgecolor="black",
            s=15,
        )

plt.suptitle("Decision surface of decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")

# %%
# Display the structure of a single decision tree trained on all the features
# together.
from sklearn.tree import plot_tree

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title("Decision tree trained on all the iris features")
plt.show()
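
The docstring above describes the learned decision boundaries as "combinations of simple thresholding rules inferred from the training samples". The small standalone sketch below, not part of the committed file, makes those rules visible as text with sklearn.tree.export_text; the choice of the first feature pair and max_depth=3 are assumptions made only to keep the printout short.

# Sketch only: print the threshold rules learned by a shallow tree.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

iris = load_iris()
# Use the first feature pair (sepal length, sepal width), as in the first panel above.
X, y = iris.data[:, [0, 1]], iris.target

clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

# Each line is one threshold test of the form "feature <= value";
# the leaves report the predicted class index.
print(export_text(clf, feature_names=[iris.feature_names[0], iris.feature_names[1]]))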

@@ -0,0 +1,132 @@
"""
=============================================================
Receiver Operating Characteristic (ROC) with cross validation
=============================================================

This example shows how to estimate and visualize the variance of the Receiver
Operating Characteristic (ROC) metric using cross-validation.

ROC curves typically feature true positive rate (TPR) on the Y axis, and false
positive rate (FPR) on the X axis. This means that the top left corner of the
plot is the "ideal" point - a FPR of zero, and a TPR of one. This is not very
realistic, but it does mean that a larger Area Under the Curve (AUC) is usually
better. The "steepness" of ROC curves is also important, since it is ideal to
maximize the TPR while minimizing the FPR.

This example shows the ROC response of different datasets, created from K-fold
cross-validation. Taking all of these curves, it is possible to calculate the
mean AUC, and see the variance of the curve when the
training set is split into different subsets. This roughly shows how the
classifier output is affected by changes in the training data, and how different
the splits generated by K-fold cross-validation are from one another.

.. note::

    See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for a
    complement of the present example explaining the averaging strategies to
    generalize the metrics for multiclass classifiers.
"""

# %%
# Load and prepare data
# =====================
#
# We import the :ref:`iris_dataset` which contains 3 classes, each one
# corresponding to a type of iris plant. One class is linearly separable from
# the other 2; the latter are **not** linearly separable from each other.
#
# In the following we binarize the dataset by dropping the "virginica" class
# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
# regarded as the positive class and "setosa" as the negative class
# (`class_id=0`).

import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
target_names = iris.target_names
X, y = iris.data, iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# %%
# We also add noisy features to make the problem harder.
random_state = np.random.RandomState(0)
X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)

# %%
# Classification and ROC analysis
# -------------------------------
#
# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
# plot the ROC curves fold-wise. Notice that the baseline to define the chance
# level (dashed ROC curve) is a classifier that would always predict the most
# frequent class.

import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X[test],
        y[test],
        name=f"ROC fold {fold}",
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()
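
The script above builds the mean AUC and its spread by hand from per-fold curves interpolated onto a common FPR grid. As a rough cross-check, not part of the committed file, the per-fold AUCs can also be obtained directly with cross_val_score and the "roc_auc" scorer; the sketch below assumes the same X, y, classifier and cv objects defined in the script and only reports the AUC summary, not the averaged curve or the shaded band.

# Sketch only: reuses X, y, classifier and cv from the script above.
import numpy as np
from sklearn.model_selection import cross_val_score

fold_aucs = cross_val_score(classifier, X, y, cv=cv, scoring="roc_auc")
print("AUC per fold:", np.round(fold_aucs, 3))
print("Mean AUC = %0.2f +/- %0.2f" % (fold_aucs.mean(), fold_aucs.std()))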

@@ -0,0 +1,62 @@
"""
=================================================
Concatenating multiple feature extraction methods
=================================================

In many real-world examples, there are many ways to extract features from a
dataset. Often it is beneficial to combine several methods to obtain good
performance. This example shows how to use ``FeatureUnion`` to combine
features obtained by PCA and univariate selection.

Combining features using this transformer has the benefit that it allows
cross validation and grid searches over the whole process.

The combination used in this example is not particularly helpful on this
dataset and is only used to illustrate the usage of FeatureUnion.

"""

# Author: Andreas Mueller <[email protected]>
#
# License: BSD 3 clause

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

iris = load_iris()

X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:
pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(
    features__pca__n_components=[1, 2, 3],
    features__univ_select__k=[1, 2],
    svm__C=[0.1, 1, 10],
)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
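
The docstring notes that combining the transformers in a ``FeatureUnion`` allows cross validation and grid searches over the whole process. The short sketch below, not part of the committed file, continues from the fitted grid_search above and shows how the same nested ``step__substep__param`` names used in param_grid also address best_params_ and set_params; the parameter values passed to set_params are illustrative assumptions, not the actual best parameters of any particular run.

# Sketch only: continues from the grid search above.
print(grid_search.best_params_)

# The same dotted names used in param_grid work with set_params, so the
# composite pipeline can be reconfigured without rebuilding it:
pipeline.set_params(features__pca__n_components=3, svm__C=10).fit(X, y)
print(pipeline.score(X, y))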
