"""
=======================================
Target Encoder's Internal Cross fitting
=======================================

.. currentmodule:: sklearn.preprocessing

The :class:`TargetEncoder` replaces each category of a categorical feature with
the shrunk mean of the target variable for that category. This method is useful
in cases where there is a strong relationship between the categorical feature
and the target. To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses
an internal :term:`cross fitting` scheme to encode the training data to be used
by a downstream model. This scheme involves splitting the data into *k* folds
and encoding each fold using the encodings learnt from the other *k-1* folds.

In this example, we demonstrate the importance of the cross fitting procedure
to prevent overfitting.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
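# %%
# As a quick illustration (an extra sketch added here, not part of the original
# analysis), the snippet below computes the kind of "shrunk mean" that
# :class:`TargetEncoder` uses when its `smooth` parameter is a fixed number: the
# per-category target mean is blended with the global target mean, and rare
# categories are pulled more strongly towards the global mean. With the default
# `smooth="auto"`, an empirical Bayes estimate is used instead, so the values
# below are only indicative.
import pandas as pd

toy = pd.DataFrame({"cat": ["a", "a", "a", "b"], "y": [1.0, 1.0, 1.0, 5.0]})
smooth = 2.0  # hypothetical fixed smoothing value, chosen only for this sketch
global_mean = toy["y"].mean()
cat_stats = toy.groupby("cat")["y"].agg(["mean", "count"])
# Blend each category mean with the global mean, weighted by the category count.
shrunk_mean = (cat_stats["count"] * cat_stats["mean"] + smooth * global_mean) / (
    cat_stats["count"] + smooth
)
print(shrunk_mean)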
# %%
# Create Synthetic Dataset
# ========================
# For this example, we build a dataset with three categorical features:
#
# * an informative feature with medium cardinality ("informative")
# * an uninformative feature with medium cardinality ("shuffled")
# * an uninformative feature with high cardinality ("near_unique")
#
# First, we generate the informative feature:
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
n_samples = 50_000
rng = np.random.RandomState(42)
y = rng.randn(n_samples)
noise = 0.5 * rng.randn(n_samples)
n_categories = 100
kbins = KBinsDiscretizer(
    n_bins=n_categories,
    encode="ordinal",
    strategy="uniform",
    random_state=rng,
    subsample=None,
)
X_informative = kbins.fit_transform((y + noise).reshape(-1, 1))
# Remove the linear relationship between y and the bin index by permuting the
# values of X_informative:
permuted_categories = rng.permutation(n_categories)
X_informative = permuted_categories[X_informative.astype(np.int32)]
# %%
# The uninformative feature with medium cardinality is generated by permuting the
# informative feature and removing the relationship with the target:
X_shuffled = rng.permutation(X_informative)
# %%
# The uninformative feature with high cardinality is generated so that it is
# independent of the target variable. We will show that target encoding without
# :term:`cross fitting` will cause catastrophic overfitting for the downstream
# regressor. These high cardinality features are essentially unique identifiers
# for the samples and should generally be removed from machine learning datasets.
# In this example, we generate them to show how :class:`TargetEncoder`'s default
# :term:`cross fitting` behavior mitigates the overfitting issue automatically.
X_near_unique_categories = rng.choice(
    int(0.9 * n_samples), size=n_samples, replace=True
).reshape(-1, 1)
# %%
# Finally, we assemble the dataset and perform a train test split:
import pandas as pd
from sklearn.model_selection import train_test_split
X = pd.DataFrame(
    np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    ),
    columns=["informative", "shuffled", "near_unique"],
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
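# %%
# As a small sanity check (an extra step added here, not strictly required for
# the analysis), we can verify the cardinalities described above: "informative"
# and "shuffled" have at most `n_categories` distinct values, while "near_unique"
# has tens of thousands of distinct values, so most of its categories contain
# only one or two samples.
print(X.nunique())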
# %%
# Training a Ridge Regressor
# ==========================
# In this section, we train a ridge regressor on the dataset with and without
# encoding and explore the influence of the target encoder with and without the
# internal :term:`cross fitting`. First, we see that the Ridge model trained on
# the raw features has low performance. This is because we permuted the order of
# the informative feature's categories, meaning `X_informative` is not
# informative when used in its raw form:
import sklearn
from sklearn.linear_model import Ridge
# Configure transformers to always output DataFrames
sklearn.set_config(transform_output="pandas")
ridge = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)
raw_model = ridge.fit(X_train, y_train)
print("Raw Model score on training set: ", raw_model.score(X_train, y_train))
print("Raw Model score on test set: ", raw_model.score(X_test, y_test))
# %%
# Next, we create a pipeline with the target encoder and ridge model. The pipeline
# uses :meth:`TargetEncoder.fit_transform` which uses :term:`cross fitting`. We
# see that the model fits the data well and generalizes to the test set:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import TargetEncoder
model_with_cf = make_pipeline(TargetEncoder(random_state=0), ridge)
model_with_cf.fit(X_train, y_train)
print("Model with CF on train set: ", model_with_cf.score(X_train, y_train))
print("Model with CF on test set: ", model_with_cf.score(X_test, y_test))
# %%
# The coefficients of the linear model show that most of the weight is on the
# feature at column index 0, which is the informative feature:
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams["figure.constrained_layout.use"] = True
coefs_cf = pd.Series(
    model_with_cf[-1].coef_, index=model_with_cf[-1].feature_names_in_
).sort_values()
ax = coefs_cf.plot(kind="barh")
_ = ax.set(
    title="Target encoded with cross fitting",
    xlabel="Ridge coefficient",
    ylabel="Feature",
)
# %%
# While :meth:`TargetEncoder.fit_transform` uses an internal
# :term:`cross fitting` scheme to learn encodings for the training set,
# :meth:`TargetEncoder.transform` itself does not.
# Instead, :meth:`TargetEncoder.fit` learns the encodings from the complete
# training set, and :meth:`TargetEncoder.transform` applies them to the
# categorical features. Thus, we can use :meth:`TargetEncoder.fit` followed by
# :meth:`TargetEncoder.transform` to disable the :term:`cross fitting`. This
# encoding is then passed to the ridge model.
target_encoder = TargetEncoder(random_state=0)
target_encoder.fit(X_train, y_train)
X_train_no_cf_encoding = target_encoder.transform(X_train)
X_test_no_cf_encoding = target_encoder.transform(X_test)
model_no_cf = ridge.fit(X_train_no_cf_encoding, y_train)
# %%
# We evaluate the model that did not use :term:`cross fitting` when encoding and
# see that it overfits:
print(
    "Model without CF on training set: ",
    model_no_cf.score(X_train_no_cf_encoding, y_train),
)
print(
    "Model without CF on test set: ",
    model_no_cf.score(
        X_test_no_cf_encoding,
        y_test,
    ),
)
# %%
# The ridge model overfits because, compared with the model that used
# :term:`cross fitting` to encode the features, it assigns much more weight to
# the uninformative, extremely high cardinality ("near_unique") and medium
# cardinality ("shuffled") features.
coefs_no_cf = pd.Series(
    model_no_cf.coef_, index=model_no_cf.feature_names_in_
).sort_values()
ax = coefs_no_cf.plot(kind="barh")
_ = ax.set(
    title="Target encoded without cross fitting",
    xlabel="Ridge coefficient",
    ylabel="Feature",
)
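# %%
# One extra way to see the leakage directly (a check added here on top of the
# original example): because most "near_unique" categories contain only one or
# two training samples, their target means learnt on the full training set stay
# close to the corresponding training targets, so the encoded column is strongly
# correlated with `y_train` (the exact value depends on the smoothing).
print(
    "Correlation between the near_unique encoding (no CF) and y_train: ",
    np.corrcoef(X_train_no_cf_encoding["near_unique"], y_train)[0, 1],
)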
# %%
# Conclusion
# ==========
# This example demonstrates the importance of :class:`TargetEncoder`'s internal
# :term:`cross fitting`. It is important to use
# :meth:`TargetEncoder.fit_transform` to encode training data before passing it
# to a machine learning model. When a :class:`TargetEncoder` is a part of a
# :class:`~sklearn.pipeline.Pipeline` and the pipeline is fitted, the pipeline
# will correctly call :meth:`TargetEncoder.fit_transform` and use
# :term:`cross fitting` when encoding the training data.