"""
=======================================
Target Encoder's Internal Cross fitting
=======================================

.. currentmodule:: sklearn.preprocessing

The :class:`TargetEncoder` replaces each category of a categorical feature with
the shrunk mean of the target variable for that category. This method is useful
in cases where there is a strong relationship between the categorical feature
and the target. To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses
an internal :term:`cross fitting` scheme to encode the training data to be used
by a downstream model. This scheme involves splitting the data into *k* folds
and encoding each fold using the encodings learnt from the other *k-1* folds.

In this example, we demonstrate the importance of the cross fitting procedure
to prevent overfitting.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
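# %%
# As a quick illustration (an extra sketch added here, not part of the original
# analysis), the snippet below computes the kind of "shrunk mean" that
# :class:`TargetEncoder` uses when its `smooth` parameter is a fixed number: the
# per-category target mean is blended with the global target mean, and rare
# categories are pulled more strongly towards the global mean. With the default
# `smooth="auto"`, an empirical Bayes estimate is used instead, so the values
# below are only indicative.
import pandas as pd

toy = pd.DataFrame({"cat": ["a", "a", "a", "b"], "y": [1.0, 1.0, 1.0, 5.0]})
smooth = 2.0  # hypothetical fixed smoothing value, chosen only for this sketch
global_mean = toy["y"].mean()
cat_stats = toy.groupby("cat")["y"].agg(["mean", "count"])
# Blend each category mean with the global mean, weighted by the category count.
shrunk_mean = (cat_stats["count"] * cat_stats["mean"] + smooth * global_mean) / (
    cat_stats["count"] + smooth
)
print(shrunk_mean)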
# %%
# Create Synthetic Dataset
# ========================
# For this example, we build a dataset with three categorical features:
#
# * an informative feature with medium cardinality ("informative")
# * an uninformative feature with medium cardinality ("shuffled")
# * an uninformative feature with high cardinality ("near_unique")
#
# First, we generate the informative feature:
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
n_samples = 50_000
rng = np.random.RandomState(42)
y = rng.randn(n_samples)
noise = 0.5 * rng.randn(n_samples)
n_categories = 100
kbins = KBinsDiscretizer(
    n_bins=n_categories,
    encode="ordinal",
    strategy="uniform",
    random_state=rng,
    subsample=None,
)
X_informative = kbins.fit_transform((y + noise).reshape(-1, 1))
# Remove the linear relationship between y and the bin index by permuting the
# values of X_informative:
permuted_categories = rng.permutation(n_categories)
X_informative = permuted_categories[X_informative.astype(np.int32)]
# %%
# The uninformative feature with medium cardinality is generated by permuting the
# informative feature and removing the relationship with the target:
X_shuffled = rng.permutation(X_informative)
# %%
# The uninformative feature with high cardinality is generated so that it is
# independent of the target variable. We will show that target encoding without
# :term:`cross fitting` will cause catastrophic overfitting for the downstream
# regressor. These high cardinality features are essentially unique identifiers
# for the samples and should generally be removed from machine learning datasets.
# In this example, we generate them to show how :class:`TargetEncoder`'s default
# :term:`cross fitting` behavior mitigates the overfitting issue automatically.
X_near_unique_categories = rng.choice(
    int(0.9 * n_samples), size=n_samples, replace=True
).reshape(-1, 1)
# %%
# Finally, we assemble the dataset and perform a train test split:
import pandas as pd
from sklearn.model_selection import train_test_split
X = pd.DataFrame(
    np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    ),
    columns=["informative", "shuffled", "near_unique"],
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
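# %%
# As a small sanity check (an extra step added here, not strictly required for
# the analysis), we can verify the cardinalities described above: "informative"
# and "shuffled" have at most `n_categories` distinct values, while "near_unique"
# has tens of thousands of distinct values, so most of its categories contain
# only one or two samples.
print(X.nunique())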
# %%
# Training a Ridge Regressor
# ==========================
# In this section, we train a ridge regressor on the dataset with and without
# encoding and explore the influence of the target encoder with and without the
# internal :term:`cross fitting`. First, we see that the Ridge model trained on
# the raw features has low performance. This is because we permuted the order of
# the informative feature's categories, meaning `X_informative` is not
# informative when used in its raw form:
import sklearn
from sklearn.linear_model import Ridge
# Configure transformers to always output DataFrames
sklearn.set_config(transform_output="pandas")
ridge = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)
raw_model = ridge.fit(X_train, y_train)
print("Raw Model score on training set: ", raw_model.score(X_train, y_train))
print("Raw Model score on test set: ", raw_model.score(X_test, y_test))
# %%
# Next, we create a pipeline with the target encoder and ridge model. The pipeline
# uses :meth:`TargetEncoder.fit_transform` which uses :term:`cross fitting`. We
# see that the model fits the data well and generalizes to the test set:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import TargetEncoder
model_with_cf = make_pipeline(TargetEncoder(random_state=0), ridge)
model_with_cf.fit(X_train, y_train)
print("Model with CF on train set: ", model_with_cf.score(X_train, y_train))
print("Model with CF on test set: ", model_with_cf.score(X_test, y_test))
# %%
# The coefficients of the linear model show that most of the weight is on the
# feature at column index 0, which is the informative feature:
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams["figure.constrained_layout.use"] = True
coefs_cf = pd.Series(
    model_with_cf[-1].coef_, index=model_with_cf[-1].feature_names_in_
).sort_values()
ax = coefs_cf.plot(kind="barh")
_ = ax.set(
    title="Target encoded with cross fitting",
    xlabel="Ridge coefficient",
    ylabel="Feature",
)
# %%
# While :meth:`TargetEncoder.fit_transform` uses an internal
# :term:`cross fitting` scheme to learn encodings for the training set,
# :meth:`TargetEncoder.transform` itself does not.
# Instead, :meth:`TargetEncoder.fit` learns the encodings from the complete
# training set, and :meth:`TargetEncoder.transform` applies them to the
# categorical features. Thus, we can use :meth:`TargetEncoder.fit` followed by
# :meth:`TargetEncoder.transform` to disable the :term:`cross fitting`. This
# encoding is then passed to the ridge model.
target_encoder = TargetEncoder(random_state=0)
target_encoder.fit(X_train, y_train)
X_train_no_cf_encoding = target_encoder.transform(X_train)
X_test_no_cf_encoding = target_encoder.transform(X_test)
model_no_cf = ridge.fit(X_train_no_cf_encoding, y_train)
# %%
# We evaluate the model that did not use :term:`cross fitting` when encoding and
# see that it overfits:
print(
    "Model without CF on training set: ",
    model_no_cf.score(X_train_no_cf_encoding, y_train),
)
print(
    "Model without CF on test set: ",
    model_no_cf.score(
        X_test_no_cf_encoding,
        y_test,
    ),
)
# %%
# The ridge model overfits because, compared with the model that used
# :term:`cross fitting` to encode the features, it assigns much more weight to
# the uninformative, extremely high cardinality ("near_unique") and medium
# cardinality ("shuffled") features.
coefs_no_cf = pd.Series(
    model_no_cf.coef_, index=model_no_cf.feature_names_in_
).sort_values()
ax = coefs_no_cf.plot(kind="barh")
_ = ax.set(
    title="Target encoded without cross fitting",
    xlabel="Ridge coefficient",
    ylabel="Feature",
)
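# %%
# One extra way to see the leakage directly (a check added here on top of the
# original example): because most "near_unique" categories contain only one or
# two training samples, their target means learnt on the full training set stay
# close to the corresponding training targets, so the encoded column is strongly
# correlated with `y_train` (the exact value depends on the smoothing).
print(
    "Correlation between the near_unique encoding (no CF) and y_train: ",
    np.corrcoef(X_train_no_cf_encoding["near_unique"], y_train)[0, 1],
)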
# %%
# Conclusion
# ==========
# This example demonstrates the importance of :class:`TargetEncoder`'s internal
# :term:`cross fitting`. It is important to use
# :meth:`TargetEncoder.fit_transform` to encode training data before passing it
# to a machine learning model. When a :class:`TargetEncoder` is a part of a
# :class:`~sklearn.pipeline.Pipeline` and the pipeline is fitted, the pipeline
# will correctly call :meth:`TargetEncoder.fit_transform` and use
# :term:`cross fitting` when encoding the training data.