forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot_ard.py
212 lines (183 loc) · 6.95 KB
/
plot_ard.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""
====================================
Comparing Linear Bayesian Regressors
====================================
This example compares two different bayesian regressors:
- a :ref:`automatic_relevance_determination`
- a :ref:`bayesian_ridge_regression`
In the first part, we use an :ref:`ordinary_least_squares` (OLS) model as a
baseline for comparing the models' coefficients with respect to the true
coefficients. Thereafter, we show that the estimation of such models is done by
iteratively maximizing the marginal log-likelihood of the observations.
In the last section we plot predictions and uncertainties for the ARD and the
Bayesian Ridge regressions using a polynomial feature expansion to fit a
non-linear relationship between `X` and `y`.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# %%
# Models robustness to recover the ground truth weights
# =====================================================
#
# Generate synthetic dataset
# --------------------------
#
# We generate a dataset where `X` and `y` are linearly linked: 10 of the
# features of `X` will be used to generate `y`. The other features are not
# useful at predicting `y`. In addition, we generate a dataset where `n_samples
# == n_features`. Such a setting is challenging for an OLS model and leads
# potentially to arbitrary large weights. Having a prior on the weights and a
# penalty alleviates the problem. Finally, gaussian noise is added.
from sklearn.datasets import make_regression
X, y, true_weights = make_regression(
n_samples=100,
n_features=100,
n_informative=10,
noise=8,
coef=True,
random_state=42,
)
# %%
# Fit the regressors
# ------------------
#
# We now fit both Bayesian models and the OLS to later compare the models'
# coefficients.
import pandas as pd
from sklearn.linear_model import ARDRegression, BayesianRidge, LinearRegression
olr = LinearRegression().fit(X, y)
brr = BayesianRidge(compute_score=True, max_iter=30).fit(X, y)
ard = ARDRegression(compute_score=True, max_iter=30).fit(X, y)
df = pd.DataFrame(
{
"Weights of true generative process": true_weights,
"ARDRegression": ard.coef_,
"BayesianRidge": brr.coef_,
"LinearRegression": olr.coef_,
}
)
# %%
# Plot the true and estimated coefficients
# ----------------------------------------
#
# Now we compare the coefficients of each model with the weights of
# the true generative model.
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import SymLogNorm
plt.figure(figsize=(10, 6))
ax = sns.heatmap(
df.T,
norm=SymLogNorm(linthresh=10e-4, vmin=-80, vmax=80),
cbar_kws={"label": "coefficients' values"},
cmap="seismic_r",
)
plt.ylabel("linear model")
plt.xlabel("coefficients")
plt.tight_layout(rect=(0, 0, 1, 0.95))
_ = plt.title("Models' coefficients")
# %%
# Due to the added noise, none of the models recover the true weights. Indeed,
# all models always have more than 10 non-zero coefficients. Compared to the OLS
# estimator, the coefficients using a Bayesian Ridge regression are slightly
# shifted toward zero, which stabilises them. The ARD regression provides a
# sparser solution: some of the non-informative coefficients are set exactly to
# zero, while shifting others closer to zero. Some non-informative coefficients
# are still present and retain large values.
# %%
# Plot the marginal log-likelihood
# --------------------------------
import numpy as np
ard_scores = -np.array(ard.scores_)
brr_scores = -np.array(brr.scores_)
plt.plot(ard_scores, color="navy", label="ARD")
plt.plot(brr_scores, color="red", label="BayesianRidge")
plt.ylabel("Log-likelihood")
plt.xlabel("Iterations")
plt.xlim(1, 30)
plt.legend()
_ = plt.title("Models log-likelihood")
# %%
# Indeed, both models minimize the log-likelihood up to an arbitrary cutoff
# defined by the `max_iter` parameter.
#
# Bayesian regressions with polynomial feature expansion
# ======================================================
# Generate synthetic dataset
# --------------------------
# We create a target that is a non-linear function of the input feature.
# Noise following a standard uniform distribution is added.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
rng = np.random.RandomState(0)
n_samples = 110
# sort the data to make plotting easier later
X = np.sort(-10 * rng.rand(n_samples) + 10)
noise = rng.normal(0, 1, n_samples) * 1.35
y = np.sqrt(X) * np.sin(X) + noise
full_data = pd.DataFrame({"input_feature": X, "target": y})
X = X.reshape((-1, 1))
# extrapolation
X_plot = np.linspace(10, 10.4, 10)
y_plot = np.sqrt(X_plot) * np.sin(X_plot)
X_plot = np.concatenate((X, X_plot.reshape((-1, 1))))
y_plot = np.concatenate((y - noise, y_plot))
# %%
# Fit the regressors
# ------------------
#
# Here we try a degree 10 polynomial to potentially overfit, though the bayesian
# linear models regularize the size of the polynomial coefficients. As
# `fit_intercept=True` by default for
# :class:`~sklearn.linear_model.ARDRegression` and
# :class:`~sklearn.linear_model.BayesianRidge`, then
# :class:`~sklearn.preprocessing.PolynomialFeatures` should not introduce an
# additional bias feature. By setting `return_std=True`, the bayesian regressors
# return the standard deviation of the posterior distribution for the model
# parameters.
ard_poly = make_pipeline(
PolynomialFeatures(degree=10, include_bias=False),
StandardScaler(),
ARDRegression(),
).fit(X, y)
brr_poly = make_pipeline(
PolynomialFeatures(degree=10, include_bias=False),
StandardScaler(),
BayesianRidge(),
).fit(X, y)
y_ard, y_ard_std = ard_poly.predict(X_plot, return_std=True)
y_brr, y_brr_std = brr_poly.predict(X_plot, return_std=True)
# %%
# Plotting polynomial regressions with std errors of the scores
# -------------------------------------------------------------
ax = sns.scatterplot(
data=full_data, x="input_feature", y="target", color="black", alpha=0.75
)
ax.plot(X_plot, y_plot, color="black", label="Ground Truth")
ax.plot(X_plot, y_brr, color="red", label="BayesianRidge with polynomial features")
ax.plot(X_plot, y_ard, color="navy", label="ARD with polynomial features")
ax.fill_between(
X_plot.ravel(),
y_ard - y_ard_std,
y_ard + y_ard_std,
color="navy",
alpha=0.3,
)
ax.fill_between(
X_plot.ravel(),
y_brr - y_brr_std,
y_brr + y_brr_std,
color="red",
alpha=0.3,
)
ax.legend()
_ = ax.set_title("Polynomial fit of a non-linear feature")
# %%
# The error bars represent one standard deviation of the predicted gaussian
# distribution of the query points. Notice that the ARD regression captures the
# ground truth the best when using the default parameters in both models, but
# further reducing the `lambda_init` hyperparameter of the Bayesian Ridge can
# reduce its bias (see example
# :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`).
# Finally, due to the intrinsic limitations of a polynomial regression, both
# models fail when extrapolating.