
Commit 311c6e2

Authored by ogrisel, with lorentzenchr, betatim, and glemaitre
ENH new svd_solver='covariance_eigh' for PCA (scikit-learn#27491)
Co-authored-by: Christian Lorentzen <[email protected]>
Co-authored-by: Tim Head <[email protected]>
Co-authored-by: Guillaume Lemaitre <[email protected]>
1 parent 10b5c66 · commit 311c6e2

14 files changed (+634, -158 lines)

.gitignore

+1 line

@@ -55,6 +55,7 @@ examples/cluster/joblib
 reuters/
 benchmarks/bench_covertype_data/
 benchmarks/HIGGS.csv.gz
+bench_pca_solvers.csv

 *.prefs
 .pydevproject

benchmarks/bench_pca_solvers.py

+165 lines (new file)
@@ -0,0 +1,165 @@
# %%
#
# This benchmark compares the speed of PCA solvers on datasets of different
# sizes in order to determine the best solver to select by default via the
# "auto" heuristic.
#
# Note: we do not control for the accuracy of the solvers: we assume that all
# solvers yield transformed data with similar explained variance. This
# assumption is generally true, except for the randomized solver that might
# require more power iterations.
#
# We generate synthetic data with dimensions that are useful to plot:
# - time vs n_samples for a fixed n_features and,
# - time vs n_features for a fixed n_samples.
import itertools
from math import log10
from time import perf_counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import config_context
from sklearn.decomposition import PCA

REF_DIMS = [100, 1000, 10_000]
data_shapes = []
for ref_dim in REF_DIMS:
    data_shapes.extend([(ref_dim, 10**i) for i in range(1, 8 - int(log10(ref_dim)))])
    data_shapes.extend(
        [(ref_dim, 3 * 10**i) for i in range(1, 8 - int(log10(ref_dim)))]
    )
    data_shapes.extend([(10**i, ref_dim) for i in range(1, 8 - int(log10(ref_dim)))])
    data_shapes.extend(
        [(3 * 10**i, ref_dim) for i in range(1, 8 - int(log10(ref_dim)))]
    )

# Remove duplicates:
data_shapes = sorted(set(data_shapes))

print("Generating test datasets...")
rng = np.random.default_rng(0)
datasets = [rng.normal(size=shape) for shape in data_shapes]


# %%
def measure_one(data, n_components, solver, method_name="fit"):
    print(
        f"Benchmarking {solver=!r}, {n_components=}, {method_name=!r} on data with"
        f" shape {data.shape}"
    )
    pca = PCA(n_components=n_components, svd_solver=solver, random_state=0)
    timings = []
    elapsed = 0
    method = getattr(pca, method_name)
    with config_context(assume_finite=True):
        while elapsed < 0.5:
            tic = perf_counter()
            method(data)
            duration = perf_counter() - tic
            timings.append(duration)
            elapsed += duration
    return np.median(timings)


SOLVERS = ["full", "covariance_eigh", "arpack", "randomized", "auto"]
measurements = []
for data, n_components, method_name in itertools.product(
    datasets, [2, 50], ["fit", "fit_transform"]
):
    if n_components >= min(data.shape):
        continue
    for solver in SOLVERS:
        if solver == "covariance_eigh" and data.shape[1] > 5000:
            # Too much memory and too slow.
            continue
        if solver in ["arpack", "full"] and log10(data.size) > 7:
            # Too slow, in particular for the full solver.
            continue
        time = measure_one(data, n_components, solver, method_name=method_name)
        measurements.append(
            {
                "n_components": n_components,
                "n_samples": data.shape[0],
                "n_features": data.shape[1],
                "time": time,
                "solver": solver,
                "method_name": method_name,
            }
        )
measurements = pd.DataFrame(measurements)
measurements.to_csv("bench_pca_solvers.csv", index=False)

# %%
all_method_names = measurements["method_name"].unique()
all_n_components = measurements["n_components"].unique()

for method_name in all_method_names:
    fig, axes = plt.subplots(
        figsize=(16, 16),
        nrows=len(REF_DIMS),
        ncols=len(all_n_components),
        sharey=True,
        constrained_layout=True,
    )
    fig.suptitle(f"Benchmarks for PCA.{method_name}, varying n_samples", fontsize=16)

    for row_idx, ref_dim in enumerate(REF_DIMS):
        for n_components, ax in zip(all_n_components, axes[row_idx]):
            for solver in SOLVERS:
                if solver == "auto":
                    style_kwargs = dict(linewidth=2, color="black", style="--")
                else:
                    style_kwargs = dict(style="o-")
                ax.set(
                    title=f"n_components={n_components}, n_features={ref_dim}",
                    ylabel="time (s)",
                )
                measurements.query(
                    "n_components == @n_components and n_features == @ref_dim"
                    " and solver == @solver and method_name == @method_name"
                ).plot.line(
                    x="n_samples",
                    y="time",
                    label=solver,
                    logx=True,
                    logy=True,
                    ax=ax,
                    **style_kwargs,
                )
# %%
for method_name in all_method_names:
    fig, axes = plt.subplots(
        figsize=(16, 16),
        nrows=len(REF_DIMS),
        ncols=len(all_n_components),
        sharey=True,
    )
    fig.suptitle(f"Benchmarks for PCA.{method_name}, varying n_features", fontsize=16)

    for row_idx, ref_dim in enumerate(REF_DIMS):
        for n_components, ax in zip(all_n_components, axes[row_idx]):
            for solver in SOLVERS:
                if solver == "auto":
                    style_kwargs = dict(linewidth=2, color="black", style="--")
                else:
                    style_kwargs = dict(style="o-")
                ax.set(
                    title=f"n_components={n_components}, n_samples={ref_dim}",
                    ylabel="time (s)",
                )
                measurements.query(
                    "n_components == @n_components and n_samples == @ref_dim "
                    " and solver == @solver and method_name == @method_name"
                ).plot.line(
                    x="n_features",
                    y="time",
                    label=solver,
                    logx=True,
                    logy=True,
                    ax=ax,
                    **style_kwargs,
                )

# %%
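
The benchmark above writes its raw timings to bench_pca_solvers.csv, with one row per (shape, n_components, solver, method) combination. As a small illustrative sketch, not part of the committed script, the CSV can be summarized to find the fastest solver per configuration using the column names the script writes:

import pandas as pd

# Load the timings produced by bench_pca_solvers.py.
df = pd.read_csv("bench_pca_solvers.csv")

# For each data shape / n_components / method combination, keep the row of
# the fastest solver.
group_cols = ["n_samples", "n_features", "n_components", "method_name"]
fastest = df.loc[df.groupby(group_cols)["time"].idxmin()]
print(fastest[group_cols + ["solver", "time"]].sort_values(group_cols))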

doc/modules/compose.rst

+8, -8 lines

@@ -254,14 +254,14 @@ inspect the original instance such as::

     >>> from sklearn.datasets import load_digits
     >>> X_digits, y_digits = load_digits(return_X_y=True)
-    >>> pca1 = PCA()
+    >>> pca1 = PCA(n_components=10)
     >>> svm1 = SVC()
     >>> pipe = Pipeline([('reduce_dim', pca1), ('clf', svm1)])
     >>> pipe.fit(X_digits, y_digits)
-    Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC())])
+    Pipeline(steps=[('reduce_dim', PCA(n_components=10)), ('clf', SVC())])
     >>> # The pca instance can be inspected directly
-    >>> print(pca1.components_)
-    [[-1.77484909e-19 ... 4.07058917e-18]]
+    >>> pca1.components_.shape
+    (10, 64)


 Enabling caching triggers a clone of the transformers before fitting.

@@ -274,15 +274,15 @@ Instead, use the attribute ``named_steps`` to inspect estimators within
 the pipeline::

     >>> cachedir = mkdtemp()
-    >>> pca2 = PCA()
+    >>> pca2 = PCA(n_components=10)
     >>> svm2 = SVC()
     >>> cached_pipe = Pipeline([('reduce_dim', pca2), ('clf', svm2)],
     ...                        memory=cachedir)
     >>> cached_pipe.fit(X_digits, y_digits)
     Pipeline(memory=...,
-             steps=[('reduce_dim', PCA()), ('clf', SVC())])
-    >>> print(cached_pipe.named_steps['reduce_dim'].components_)
-    [[-1.77484909e-19 ... 4.07058917e-18]]
+             steps=[('reduce_dim', PCA(n_components=10)), ('clf', SVC())])
+    >>> cached_pipe.named_steps['reduce_dim'].components_.shape
+    (10, 64)
     >>> # Remove the cache directory
     >>> rmtree(cachedir)

doc/whats_new/v1.5.rst

+30 lines

@@ -31,6 +31,13 @@ Changed models
   properties).
   :pr:`27344` by :user:`Xuefeng Xu <xuefeng-xu>`.

+- |Enhancement| :class:`decomposition.PCA`, :class:`decomposition.SparsePCA`
+  and :class:`decomposition.TruncatedSVD` now set the sign of the `components_`
+  attribute based on the component values instead of using the transformed data
+  as reference. This change is needed to be able to offer consistent component
+  signs across all `PCA` solvers, including the new
+  `svd_solver="covariance_eigh"` option introduced in this release.
+
 Support for Array API
 ---------------------
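
To illustrate the sign convention described in the entry above: roughly, each component is flipped so that the entry with the largest absolute value is positive, so the sign depends only on the component values and no longer on the transformed data. A minimal NumPy sketch of this idea (an approximation for illustration only, not the exact scikit-learn implementation):

import numpy as np

def flip_signs_by_component_values(components):
    # For each component (row), find the entry with the largest magnitude
    # and flip the row so that this entry becomes positive.
    max_abs_cols = np.argmax(np.abs(components), axis=1)
    signs = np.sign(components[np.arange(components.shape[0]), max_abs_cols])
    signs[signs == 0] = 1  # avoid multiplying a row by zero
    return components * signs[:, np.newaxis]

components = np.array([[-3.0, 1.0], [0.5, -0.2]])
print(flip_signs_by_component_values(components))
# [[ 3.  -1. ]
#  [ 0.5 -0.2]]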

@@ -169,10 +176,33 @@ Changelog
 :mod:`sklearn.decomposition`
 ............................

+- |Efficiency| :class:`decomposition.PCA` with `svd_solver="full"` now assigns
+  a contiguous `components_` attribute instead of a non-contiguous slice of
+  the singular vectors. When `n_components << n_features`, this can save some
+  memory and, more importantly, help speed up subsequent calls to the `transform`
+  method by more than an order of magnitude by leveraging cache locality of
+  BLAS GEMM on contiguous arrays.
+  :pr:`27491` by :user:`Olivier Grisel <ogrisel>`.
+
 - |Enhancement| :class:`~decomposition.PCA` now automatically selects the ARPACK solver
   for sparse inputs when `svd_solver="auto"` instead of raising an error.
   :pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`.

+- |Enhancement| :class:`decomposition.PCA` now supports a new solver option
+  named `svd_solver="covariance_eigh"` which offers an order of magnitude
+  speed-up and reduced memory usage for datasets with a large number of data
+  points and a small number of features (say, `n_samples >> 1000 >
+  n_features`). The `svd_solver="auto"` option has been updated to use the new
+  solver automatically for such datasets. This solver also accepts sparse input
+  data.
+  :pr:`27491` by :user:`Olivier Grisel <ogrisel>`.
+
+- |Fix| :class:`decomposition.PCA` fit with `svd_solver="arpack"`,
+  `whiten=True` and a value for `n_components` that is larger than the rank of
+  the training set, no longer returns infinite values when transforming
+  hold-out data.
+  :pr:`27491` by :user:`Olivier Grisel <ogrisel>`.
+
 :mod:`sklearn.dummy`
 ....................
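
A minimal usage sketch for the new solver described in the changelog entries above (it requires a scikit-learn version that includes this change; the data shape and values are illustrative, chosen so that n_samples >> n_features, the regime the new solver targets):

import numpy as np
from sklearn.decomposition import PCA

# Tall-and-skinny data: many samples, few features.
rng = np.random.default_rng(0)
X = rng.normal(size=(100_000, 50))

# Request the new solver explicitly; per the entry above, svd_solver="auto"
# should now also route data of this shape to the covariance_eigh path.
pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
X_reduced = pca.transform(X)
print(X_reduced.shape)  # (100000, 10)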

sklearn/decomposition/_base.py

+20, -10 lines

@@ -12,11 +12,9 @@

 import numpy as np
 from scipy import linalg
-from scipy.sparse import issparse

 from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin
 from ..utils._array_api import _add_to_diagonal, device, get_namespace
-from ..utils.sparsefuncs import _implicit_column_offset
 from ..utils.validation import check_is_fitted


@@ -138,21 +136,33 @@ def transform(self, X):
         Projection of X in the first principal components, where `n_samples`
         is the number of samples and `n_components` is the number of the components.
         """
-        xp, _ = get_namespace(X)
+        xp, _ = get_namespace(X, self.components_, self.explained_variance_)

         check_is_fitted(self)

         X = self._validate_data(
-            X, accept_sparse=("csr", "csc"), dtype=[xp.float64, xp.float32], reset=False
+            X, dtype=[xp.float64, xp.float32], accept_sparse=("csr", "csc"), reset=False
         )
-        if self.mean_ is not None:
-            if issparse(X):
-                X = _implicit_column_offset(X, self.mean_)
-            else:
-                X = X - self.mean_
+        return self._transform(X, xp=xp, x_is_centered=False)
+
+    def _transform(self, X, xp, x_is_centered=False):
         X_transformed = X @ self.components_.T
+        if not x_is_centered:
+            # Apply the centering after the projection.
+            # For dense X this avoids copying or mutating the data passed by
+            # the caller.
+            # For sparse X it keeps sparsity and avoids having to wrap X into
+            # a linear operator.
+            X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
         if self.whiten:
-            X_transformed /= xp.sqrt(self.explained_variance_)
+            # For some solvers (such as "arpack" and "covariance_eigh"), on
+            # rank deficient data, some components can have a variance
+            # arbitrarily close to zero, leading to non-finite results when
+            # whitening. To avoid this problem we clip the variance below.
+            scale = xp.sqrt(self.explained_variance_)
+            min_scale = xp.finfo(scale.dtype).eps
+            scale[scale < min_scale] = min_scale
+            X_transformed /= scale
 return X_transformed

     def inverse_transform(self, X):
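
The refactored `_transform` above relies on the linearity of the projection: centering can be applied after multiplying by the components, since (X - mean) @ W.T equals X @ W.T - mean @ W.T. A small self-contained NumPy sketch of the same two ideas, deferred centering and clipping tiny variances before whitening (the names and data are illustrative, not the scikit-learn internals themselves):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 5))
mean = X.mean(axis=0)
W = rng.normal(size=(3, 5))                        # stand-in for components_
explained_variance = np.array([2.0, 1e-40, 0.5])   # one near-zero variance

# Centering before vs after the projection gives the same result, but the
# second form never materializes a centered copy of X (and keeps sparse X
# sparse in the library's case).
centered_first = (X - mean) @ W.T
centered_after = X @ W.T - mean.reshape(1, -1) @ W.T
print(np.allclose(centered_first, centered_after))  # True

# Whitening: clip tiny scales so a near-zero variance component does not
# produce non-finite values.
scale = np.sqrt(explained_variance)
scale = np.maximum(scale, np.finfo(scale.dtype).eps)
whitened = centered_after / scale
print(np.isfinite(whitened).all())  # True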

sklearn/decomposition/_kernel_pca.py

+1, -3 lines

@@ -366,9 +366,7 @@ def _fit_transform(self, K):
         )

         # flip eigenvectors' sign to enforce deterministic output
-        self.eigenvectors_, _ = svd_flip(
-            self.eigenvectors_, np.zeros_like(self.eigenvectors_).T
-        )
+        self.eigenvectors_, _ = svd_flip(u=self.eigenvectors_, v=None)

         # sort eigenvectors in descending order
         indices = self.eigenvalues_.argsort()[::-1]
