Skip to content

Commit 351d62a

Browse files
committed
Pushing the docs to dev/ for branch: main, commit 019e9538a1a3892e1ce5c5bda935a6ce57750660
1 parent 87a1617 commit 351d62a

File tree

1,614 files changed

+7058
-9009
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,614 files changed

+7058
-9009
lines changed

dev/.buildinfo

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: a9ac8d9817ba4b20d959691bb21a0d74
3+
config: be7d9736aba49c3929f749e1b912d054
44
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,105 @@
11
"""
2-
=========================================================
3-
PCA example with Iris Data-set
4-
=========================================================
2+
==================================================
3+
Principal Component Analysis (PCA) on Iris Dataset
4+
==================================================
55
6-
Principal Component Analysis applied to the Iris dataset.
7-
8-
See `here <https://en.wikipedia.org/wiki/Iris_flower_data_set>`_ for more
9-
information on this dataset.
6+
This example shows a well-known decomposition technique called Principal Component
7+
Analysis (PCA) on the
8+
`Iris dataset <https://en.wikipedia.org/wiki/Iris_flower_data_set>`_.
109
10+
This dataset is made of 4 features: sepal length, sepal width, petal length, petal
11+
width. We use PCA to project this 4-feature space into a 3-dimensional space.
1112
"""
1213

1314
# Authors: The scikit-learn developers
1415
# SPDX-License-Identifier: BSD-3-Clause
1516

17+
# %%
18+
# Loading the Iris dataset
19+
# ------------------------
20+
#
21+
# The Iris dataset is directly available as part of scikit-learn. It can be loaded
22+
# using the :func:`~sklearn.datasets.load_iris` function. With the default parameters,
23+
# a :class:`~sklearn.utils.Bunch` object is returned, containing the data, the
24+
# target values, the feature names, and the target names.
25+
from sklearn.datasets import load_iris
26+
27+
iris = load_iris(as_frame=True)
28+
print(iris.keys())
29+
30+
# %%
31+
# Plot of pairs of features of the Iris dataset
32+
# ---------------------------------------------
33+
#
34+
# Let's first plot the pairs of features of the Iris dataset.
35+
import seaborn as sns
36+
37+
# Rename classes using the iris target names
38+
iris.frame["target"] = iris.target_names[iris.target]
39+
_ = sns.pairplot(iris.frame, hue="target")
40+
41+
# %%
42+
# Each data point on each scatter plot refers to one of the 150 iris flowers
43+
# in the dataset, with the color indicating their respective type
44+
# (Setosa, Versicolor, and Virginica).
45+
#
46+
# You can already see a pattern regarding the Setosa type, which is
47+
# easily identifiable based on its short and wide sepal. Only
48+
# considering these two dimensions, sepal width and length, there's still
49+
# overlap between the Versicolor and Virginica types.
50+
#
51+
# The diagonal of the plot shows the distribution of each feature. We observe
52+
# that the petal width and the petal length are the most discriminant features
53+
# for the three types.
54+
#
55+
# Plot a PCA representation
56+
# -------------------------
57+
# Let's apply a Principal Component Analysis (PCA) to the iris dataset
58+
# and then plot the irises across the first three PCA dimensions.
59+
# This will allow us to better differentiate among the three types!
60+
1661
import matplotlib.pyplot as plt
1762

1863
# unused but required import for doing 3d projections with matplotlib < 3.2
1964
import mpl_toolkits.mplot3d # noqa: F401
20-
import numpy as np
21-
22-
from sklearn import datasets, decomposition
23-
24-
np.random.seed(5)
25-
26-
iris = datasets.load_iris()
27-
X = iris.data
28-
y = iris.target
29-
30-
fig = plt.figure(1, figsize=(4, 3))
31-
plt.clf()
32-
33-
ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
34-
ax.set_position([0, 0, 0.95, 1])
35-
36-
37-
plt.cla()
38-
pca = decomposition.PCA(n_components=3)
39-
pca.fit(X)
40-
X = pca.transform(X)
41-
42-
for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]:
43-
ax.text3D(
44-
X[y == label, 0].mean(),
45-
X[y == label, 1].mean() + 1.5,
46-
X[y == label, 2].mean(),
47-
name,
48-
horizontalalignment="center",
49-
bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
50-
)
51-
# Reorder the labels to have colors matching the cluster results
52-
y = np.choose(y, [1, 2, 0]).astype(float)
53-
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral, edgecolor="k")
5465

66+
from sklearn.decomposition import PCA
67+
68+
fig = plt.figure(1, figsize=(8, 6))
69+
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
70+
71+
X_reduced = PCA(n_components=3).fit_transform(iris.data)
72+
scatter = ax.scatter(
73+
X_reduced[:, 0],
74+
X_reduced[:, 1],
75+
X_reduced[:, 2],
76+
c=iris.target,
77+
s=40,
78+
)
79+
80+
ax.set(
81+
title="First three PCA dimensions",
82+
xlabel="1st Eigenvector",
83+
ylabel="2nd Eigenvector",
84+
zlabel="3rd Eigenvector",
85+
)
5586
ax.xaxis.set_ticklabels([])
5687
ax.yaxis.set_ticklabels([])
5788
ax.zaxis.set_ticklabels([])
5889

90+
# Add a legend
91+
legend1 = ax.legend(
92+
scatter.legend_elements()[0],
93+
iris.target_names.tolist(),
94+
loc="upper right",
95+
title="Classes",
96+
)
97+
ax.add_artist(legend1)
98+
5999
plt.show()
100+
101+
# %%
102+
# PCA will create 3 new features that are linear combinations of the 4 original
103+
# features. In addition, this transformation maximizes the variance. With this
104+
# transformation, we see that we can identify each species using only the first feature
105+
# (i.e., first eigenvector).
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

dev/_downloads/26998096b90db15754e891c733ae032c/plot_iris_dataset.ipynb

-111
This file was deleted.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

dev/_downloads/46b6a23d83637bf0f381ce9d8c528aa2/plot_pca_iris.ipynb

+63-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"\n# PCA example with Iris Data-set\n\nPrincipal Component Analysis applied to the Iris dataset.\n\nSee [here](https://en.wikipedia.org/wiki/Iris_flower_data_set) for more\ninformation on this dataset.\n"
7+
"\n# Principal Component Analysis (PCA) on Iris Dataset\n\nThis example shows a well-known decomposition technique called Principal Component\nAnalysis (PCA) on the\n[Iris dataset](https://en.wikipedia.org/wiki/Iris_flower_data_set).\n\nThis dataset is made of 4 features: sepal length, sepal width, petal length, petal\nwidth. We use PCA to project this 4-feature space into a 3-dimensional space.\n"
88
]
99
},
1010
{
@@ -15,7 +15,68 @@
1515
},
1616
"outputs": [],
1717
"source": [
18-
"# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\n\n# unused but required import for doing 3d projections with matplotlib < 3.2\nimport mpl_toolkits.mplot3d # noqa: F401\nimport numpy as np\n\nfrom sklearn import datasets, decomposition\n\nnp.random.seed(5)\n\niris = datasets.load_iris()\nX = iris.data\ny = iris.target\n\nfig = plt.figure(1, figsize=(4, 3))\nplt.clf()\n\nax = fig.add_subplot(111, projection=\"3d\", elev=48, azim=134)\nax.set_position([0, 0, 0.95, 1])\n\n\nplt.cla()\npca = decomposition.PCA(n_components=3)\npca.fit(X)\nX = pca.transform(X)\n\nfor name, label in [(\"Setosa\", 0), (\"Versicolour\", 1), (\"Virginica\", 2)]:\n ax.text3D(\n X[y == label, 0].mean(),\n X[y == label, 1].mean() + 1.5,\n X[y == label, 2].mean(),\n name,\n horizontalalignment=\"center\",\n bbox=dict(alpha=0.5, edgecolor=\"w\", facecolor=\"w\"),\n )\n# Reorder the labels to have colors matching the cluster results\ny = np.choose(y, [1, 2, 0]).astype(float)\nax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral, edgecolor=\"k\")\n\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\n\nplt.show()"
18+
"# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause"
19+
]
20+
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"## Loading the Iris dataset\n\nThe Iris dataset is directly available as part of scikit-learn. It can be loaded\nusing the :func:`~sklearn.datasets.load_iris` function. With the default parameters,\na :class:`~sklearn.utils.Bunch` object is returned, containing the data, the\ntarget values, the feature names, and the target names.\n\n"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"metadata": {
32+
"collapsed": false
33+
},
34+
"outputs": [],
35+
"source": [
36+
"from sklearn.datasets import load_iris\n\niris = load_iris(as_frame=True)\nprint(iris.keys())"
37+
]
38+
},
39+
{
40+
"cell_type": "markdown",
41+
"metadata": {},
42+
"source": [
43+
"## Plot of pairs of features of the Iris dataset\n\nLet's first plot the pairs of features of the Iris dataset.\n\n"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": null,
49+
"metadata": {
50+
"collapsed": false
51+
},
52+
"outputs": [],
53+
"source": [
54+
"import seaborn as sns\n\n# Rename classes using the iris target names\niris.frame[\"target\"] = iris.target_names[iris.target]\n_ = sns.pairplot(iris.frame, hue=\"target\")"
55+
]
56+
},
57+
{
58+
"cell_type": "markdown",
59+
"metadata": {},
60+
"source": [
61+
"Each data point on each scatter plot refers to one of the 150 iris flowers\nin the dataset, with the color indicating their respective type\n(Setosa, Versicolor, and Virginica).\n\nYou can already see a pattern regarding the Setosa type, which is\neasily identifiable based on its short and wide sepal. Only\nconsidering these two dimensions, sepal width and length, there's still\noverlap between the Versicolor and Virginica types.\n\nThe diagonal of the plot shows the distribution of each feature. We observe\nthat the petal width and the petal length are the most discriminant features\nfor the three types.\n\n## Plot a PCA representation\nLet's apply a Principal Component Analysis (PCA) to the iris dataset\nand then plot the irises across the first three PCA dimensions.\nThis will allow us to better differentiate among the three types!\n\n"
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": null,
67+
"metadata": {
68+
"collapsed": false
69+
},
70+
"outputs": [],
71+
"source": [
72+
"import matplotlib.pyplot as plt\n\n# unused but required import for doing 3d projections with matplotlib < 3.2\nimport mpl_toolkits.mplot3d # noqa: F401\n\nfrom sklearn.decomposition import PCA\n\nfig = plt.figure(1, figsize=(8, 6))\nax = fig.add_subplot(111, projection=\"3d\", elev=-150, azim=110)\n\nX_reduced = PCA(n_components=3).fit_transform(iris.data)\nscatter = ax.scatter(\n X_reduced[:, 0],\n X_reduced[:, 1],\n X_reduced[:, 2],\n c=iris.target,\n s=40,\n)\n\nax.set(\n title=\"First three PCA dimensions\",\n xlabel=\"1st Eigenvector\",\n ylabel=\"2nd Eigenvector\",\n zlabel=\"3rd Eigenvector\",\n)\nax.xaxis.set_ticklabels([])\nax.yaxis.set_ticklabels([])\nax.zaxis.set_ticklabels([])\n\n# Add a legend\nlegend1 = ax.legend(\n scatter.legend_elements()[0],\n iris.target_names.tolist(),\n loc=\"upper right\",\n title=\"Classes\",\n)\nax.add_artist(legend1)\n\nplt.show()"
73+
]
74+
},
75+
{
76+
"cell_type": "markdown",
77+
"metadata": {},
78+
"source": [
79+
"PCA will create 3 new features that are linear combinations of the 4 original\nfeatures. In addition, this transformation maximizes the variance. With this\ntransformation, we see that we can identify each species using only the first feature\n(i.e., first eigenvector).\n\n"
1980
]
2081
}
2182
],
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)