
Commit e1d7b6b (1 parent: 78263f7)

Pushing the docs to dev/ for branch: main, commit de084fc3f00b7f1e790ce841fff7f484b254fa33
File tree

1,259 files changed: +4,729 −4,715 lines


dev/_downloads/34b53ad148e36f98b6de8ddc15e3dfd3/plot_causal_interpretation.py

+3 −2
@@ -124,8 +124,7 @@
 ax = coef.plot.barh()
 ax.set_xlabel("Coefficient values")
 ax.set_title("Coefficients of the linear regression including the ability features")
-plt.tight_layout()
-plt.show()
+_ = plt.tight_layout()
 
 # %%
 # Income prediction with partial observations
@@ -158,6 +157,8 @@
 ax = coef.plot.barh()
 ax.set_xlabel("Coefficient values")
 _ = ax.set_title("Coefficients of the linear regression excluding the ability feature")
+plt.tight_layout()
+plt.show()
 
 # %%
 # To compensate for the omitted variable, the model inflates the coefficient of
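For context on the change above: sphinx-gallery renders the repr of the last expression in each code block of an example, so scikit-learn examples conventionally bind that value to `_` (as in `_ = plt.tight_layout()` in the hunk above) to keep stray output such as `Text(0.5, 1.0, '...')` off the generated page. A minimal sketch of the idiom, with invented plot data:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.barh(["experience", "education"], [0.4, 0.2])  # illustrative values only
ax.set_xlabel("Coefficient values")
# set_title returns a matplotlib Text object; as the last expression of a
# sphinx-gallery block its repr would be echoed, so bind it to `_`.
_ = ax.set_title("Example coefficients")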

dev/_downloads/521b554adefca348463adbbe047d7e99/plot_linear_model_coefficient_interpretation.py

+23 −25
@@ -3,25 +3,35 @@
 Common pitfalls in the interpretation of coefficients of linear models
 ======================================================================
 
-In linear models, the target value is modeled as
-a linear combination of the features (see the :ref:`linear_model` User Guide
-section for a description of a set of linear models available in
-scikit-learn).
-Coefficients in multiple linear models represent the relationship between the
-given feature, :math:`X_i` and the target, :math:`y`, assuming that all the
-other features remain constant (`conditional dependence
-<https://en.wikipedia.org/wiki/Conditional_dependence>`_).
-This is different from plotting :math:`X_i` versus :math:`y` and fitting a
-linear relationship: in that case all possible values of the other features are
-taken into account in the estimation (marginal dependence).
+In linear models, the target value is modeled as a linear combination of the
+features (see the :ref:`linear_model` User Guide section for a description of a
+set of linear models available in scikit-learn). Coefficients in multiple linear
+models represent the relationship between the given feature, :math:`X_i` and the
+target, :math:`y`, assuming that all the other features remain constant
+(`conditional dependence
+<https://en.wikipedia.org/wiki/Conditional_dependence>`_). This is different
+from plotting :math:`X_i` versus :math:`y` and fitting a linear relationship: in
+that case all possible values of the other features are taken into account in
+the estimation (marginal dependence).
 
 This example will provide some hints in interpreting coefficient in linear
 models, pointing at problems that arise when either the linear model is not
 appropriate to describe the dataset, or when features are correlated.
 
+.. note::
+
+    Keep in mind that the features :math:`X` and the outcome :math:`y` are in
+    general the result of a data generating process that is unknown to us.
+    Machine learning models are trained to approximate the unobserved
+    mathematical function that links :math:`X` to :math:`y` from sample data. As
+    a result, any interpretation made about a model may not necessarily
+    generalize to the true data generating process. This is especially true when
+    the model is of bad quality or when the sample data is not representative of
+    the population.
+
 We will use data from the `"Current Population Survey"
-<https://www.openml.org/d/534>`_ from 1985 to predict
-wage as a function of various features such as experience, age, or education.
+<https://www.openml.org/d/534>`_ from 1985 to predict wage as a function of
+various features such as experience, age, or education.
 
 .. contents::
    :local:
@@ -729,18 +739,6 @@
 # See the :ref:`sphx_glr_auto_examples_inspection_plot_causal_interpretation.py`
 # for a simulated case of ability OVB.
 #
-# Warning: data and model quality
-# -------------------------------
-#
-# Keep in mind that the outcome `y` and features `X` are the product
-# of a data generating process that is hidden from us. Machine
-# learning models are trained to approximate the unobserved
-# mathematical function that links `X` to `y` from sample data. As a
-# result, any interpretation made about a model may not necessarily
-# generalize to the true data generating process. This is especially
-# true when the model is of bad quality or when the sample data is
-# not representative of the population.
-#
 # Lessons learned
 # ---------------
 #
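A side note on the conditional-versus-marginal distinction drawn in the rewrapped docstring above. The sketch below is not part of the commit (the coefficients, the correlation strength, and the names `x1`/`x2` are invented for illustration): with correlated features, the coefficient of `x1` in a multivariate fit (conditional on `x2`) differs from the slope of a univariate fit of `y` on `x1` alone (marginal dependence).

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
n = 10_000
x1 = rng.normal(size=n)
x2 = 0.8 * x1 + 0.2 * rng.normal(size=n)  # x2 strongly correlated with x1
y = 1.0 * x1 + 2.0 * x2 + rng.normal(scale=0.1, size=n)  # invented true model

# Conditional dependence: coefficient of x1 holding x2 fixed (~1.0).
conditional = LinearRegression().fit(np.c_[x1, x2], y).coef_[0]
# Marginal dependence: x2 varies along with x1 in the background (~2.6).
marginal = LinearRegression().fit(x1[:, None], y).coef_[0]
print(f"conditional: {conditional:.2f}, marginal: {marginal:.2f}")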

dev/_downloads/cf0f90f46eb559facf7f63f124f61e04/plot_linear_model_coefficient_interpretation.ipynb

+2 −2
@@ -15,7 +15,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Common pitfalls in the interpretation of coefficients of linear models\n\nIn linear models, the target value is modeled as\na linear combination of the features (see the `linear_model` User Guide\nsection for a description of a set of linear models available in\nscikit-learn).\nCoefficients in multiple linear models represent the relationship between the\ngiven feature, $X_i$ and the target, $y$, assuming that all the\nother features remain constant ([conditional dependence](https://en.wikipedia.org/wiki/Conditional_dependence)).\nThis is different from plotting $X_i$ versus $y$ and fitting a\nlinear relationship: in that case all possible values of the other features are\ntaken into account in the estimation (marginal dependence).\n\nThis example will provide some hints in interpreting coefficient in linear\nmodels, pointing at problems that arise when either the linear model is not\nappropriate to describe the dataset, or when features are correlated.\n\nWe will use data from the [\"Current Population Survey\"](https://www.openml.org/d/534) from 1985 to predict\nwage as a function of various features such as experience, age, or education.\n   :depth: 1\n"
+"\n# Common pitfalls in the interpretation of coefficients of linear models\n\nIn linear models, the target value is modeled as a linear combination of the\nfeatures (see the `linear_model` User Guide section for a description of a\nset of linear models available in scikit-learn). Coefficients in multiple linear\nmodels represent the relationship between the given feature, $X_i$ and the\ntarget, $y$, assuming that all the other features remain constant\n([conditional dependence](https://en.wikipedia.org/wiki/Conditional_dependence)). This is different\nfrom plotting $X_i$ versus $y$ and fitting a linear relationship: in\nthat case all possible values of the other features are taken into account in\nthe estimation (marginal dependence).\n\nThis example will provide some hints in interpreting coefficient in linear\nmodels, pointing at problems that arise when either the linear model is not\nappropriate to describe the dataset, or when features are correlated.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>Keep in mind that the features $X$ and the outcome $y$ are in\n    general the result of a data generating process that is unknown to us.\n    Machine learning models are trained to approximate the unobserved\n    mathematical function that links $X$ to $y$ from sample data. As\n    a result, any interpretation made about a model may not necessarily\n    generalize to the true data generating process. This is especially true when\n    the model is of bad quality or when the sample data is not representative of\n    the population.</p></div>\n\nWe will use data from the [\"Current Population Survey\"](https://www.openml.org/d/534) from 1985 to predict wage as a function of\nvarious features such as experience, age, or education.\n   :depth: 1\n"
 ]
 },
 {
@@ -693,7 +693,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"We observe that the AGE and EXPERIENCE coefficients are varying a lot\ndepending of the fold.\n\n## Wrong causal interpretation\n\nPolicy makers might want to know the effect of education on wage to assess\nwhether or not a certain policy designed to entice people to pursue more\neducation would make economic sense. While Machine Learning models are great\nfor measuring statistical associations, they are generally unable to infer\ncausal effects.\n\nIt might be tempting to look at the coefficient of education on wage from our\nlast model (or any model for that matter) and conclude that it captures the\ntrue effect of a change in the standardized education variable on wages.\n\nUnfortunately there are likely unobserved confounding variables that either\ninflate or deflate that coefficient. A confounding variable is a variable that\ncauses both EDUCATION and WAGE. One example of such variable is ability.\nPresumably, more able people are more likely to pursue education while at the\nsame time being more likely to earn a higher hourly wage at any level of\neducation. In this case, ability induces a positive [Omitted Variable Bias](https://en.wikipedia.org/wiki/Omitted-variable_bias) (OVB) on the EDUCATION\ncoefficient, thereby exaggerating the effect of education on wages.\n\nSee the `sphx_glr_auto_examples_inspection_plot_causal_interpretation.py`\nfor a simulated case of ability OVB.\n\n## Warning: data and model quality\n\nKeep in mind that the outcome `y` and features `X` are the product\nof a data generating process that is hidden from us. Machine\nlearning models are trained to approximate the unobserved\nmathematical function that links `X` to `y` from sample data. As a\nresult, any interpretation made about a model may not necessarily\ngeneralize to the true data generating process. This is especially\ntrue when the model is of bad quality or when the sample data is\nnot representative of the population.\n\n## Lessons learned\n\n* Coefficients must be scaled to the same unit of measure to retrieve\n  feature importance. Scaling them with the standard-deviation of the\n  feature is a useful proxy.\n* Coefficients in multivariate linear models represent the dependency\n  between a given feature and the target, **conditional** on the other\n  features.\n* Correlated features induce instabilities in the coefficients of linear\n  models and their effects cannot be well teased apart.\n* Different linear models respond differently to feature correlation and\n  coefficients could significantly vary from one another.\n* Inspecting coefficients across the folds of a cross-validation loop\n  gives an idea of their stability.\n* Coefficients are unlikely to have any causal meaning. They tend\n  to be biased by unobserved confounders.\n* Inspection tools may not necessarily provide insights on the true\n  data generating process.\n\n"
+"We observe that the AGE and EXPERIENCE coefficients are varying a lot\ndepending of the fold.\n\n## Wrong causal interpretation\n\nPolicy makers might want to know the effect of education on wage to assess\nwhether or not a certain policy designed to entice people to pursue more\neducation would make economic sense. While Machine Learning models are great\nfor measuring statistical associations, they are generally unable to infer\ncausal effects.\n\nIt might be tempting to look at the coefficient of education on wage from our\nlast model (or any model for that matter) and conclude that it captures the\ntrue effect of a change in the standardized education variable on wages.\n\nUnfortunately there are likely unobserved confounding variables that either\ninflate or deflate that coefficient. A confounding variable is a variable that\ncauses both EDUCATION and WAGE. One example of such variable is ability.\nPresumably, more able people are more likely to pursue education while at the\nsame time being more likely to earn a higher hourly wage at any level of\neducation. In this case, ability induces a positive [Omitted Variable Bias](https://en.wikipedia.org/wiki/Omitted-variable_bias) (OVB) on the EDUCATION\ncoefficient, thereby exaggerating the effect of education on wages.\n\nSee the `sphx_glr_auto_examples_inspection_plot_causal_interpretation.py`\nfor a simulated case of ability OVB.\n\n## Lessons learned\n\n* Coefficients must be scaled to the same unit of measure to retrieve\n  feature importance. Scaling them with the standard-deviation of the\n  feature is a useful proxy.\n* Coefficients in multivariate linear models represent the dependency\n  between a given feature and the target, **conditional** on the other\n  features.\n* Correlated features induce instabilities in the coefficients of linear\n  models and their effects cannot be well teased apart.\n* Different linear models respond differently to feature correlation and\n  coefficients could significantly vary from one another.\n* Inspecting coefficients across the folds of a cross-validation loop\n  gives an idea of their stability.\n* Coefficients are unlikely to have any causal meaning. They tend\n  to be biased by unobserved confounders.\n* Inspection tools may not necessarily provide insights on the true\n  data generating process.\n\n"
 ]
 }
 ],
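The ability OVB described in the cell above is easy to reproduce numerically. A minimal sketch, independent of the linked plot_causal_interpretation example (the coefficients and noise levels are invented): omitting the confounder `ability` inflates the `education` coefficient.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
n = 10_000
ability = rng.normal(size=n)                    # unobserved confounder
education = 0.5 * ability + rng.normal(size=n)  # able people study more
wage = 1.0 * education + 1.0 * ability + rng.normal(size=n)

full = LinearRegression().fit(np.c_[education, ability], wage)
omitted = LinearRegression().fit(education[:, None], wage)
print(f"with ability:    {full.coef_[0]:.2f}")     # ~1.0, the true effect
print(f"without ability: {omitted.coef_[0]:.2f}")  # ~1.4, positively biased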

dev/_downloads/ff3bc184e1a2d8d99b77058ba52b764f/plot_causal_interpretation.ipynb

+2 −2
@@ -105,7 +105,7 @@
 },
 "outputs": [],
 "source": [
-"import matplotlib.pyplot as plt\n\nmodel_coef = pd.Series(regressor_with_ability.coef_, index=features_names)\ncoef = pd.concat(\n    [true_coef[features_names], model_coef],\n    keys=[\"Coefficients of true generative model\", \"Model coefficients\"],\n    axis=1,\n)\nax = coef.plot.barh()\nax.set_xlabel(\"Coefficient values\")\nax.set_title(\"Coefficients of the linear regression including the ability features\")\nplt.tight_layout()\nplt.show()"
+"import matplotlib.pyplot as plt\n\nmodel_coef = pd.Series(regressor_with_ability.coef_, index=features_names)\ncoef = pd.concat(\n    [true_coef[features_names], model_coef],\n    keys=[\"Coefficients of true generative model\", \"Model coefficients\"],\n    axis=1,\n)\nax = coef.plot.barh()\nax.set_xlabel(\"Coefficient values\")\nax.set_title(\"Coefficients of the linear regression including the ability features\")\n_ = plt.tight_layout()"
 ]
 },
 {
@@ -141,7 +141,7 @@
 },
 "outputs": [],
 "source": [
-"model_coef = pd.Series(regressor_without_ability.coef_, index=features_names)\ncoef = pd.concat(\n    [true_coef[features_names], model_coef],\n    keys=[\"Coefficients of true generative model\", \"Model coefficients\"],\n    axis=1,\n)\nax = coef.plot.barh()\nax.set_xlabel(\"Coefficient values\")\n_ = ax.set_title(\"Coefficients of the linear regression excluding the ability feature\")"
+"model_coef = pd.Series(regressor_without_ability.coef_, index=features_names)\ncoef = pd.concat(\n    [true_coef[features_names], model_coef],\n    keys=[\"Coefficients of true generative model\", \"Model coefficients\"],\n    axis=1,\n)\nax = coef.plot.barh()\nax.set_xlabel(\"Coefficient values\")\n_ = ax.set_title(\"Coefficients of the linear regression excluding the ability feature\")\nplt.tight_layout()\nplt.show()"
 ]
 },
 {

dev/_downloads/scikit-learn-docs.zip

322 Bytes
Binary file not shown.

dev/_sources/auto_examples/applications/plot_cyclical_feature_engineering.rst.txt

+1 −1

dev/_sources/auto_examples/applications/plot_digits_denoising.rst.txt

+1 −1

dev/_sources/auto_examples/applications/plot_face_recognition.rst.txt

+5 −5

dev/_sources/auto_examples/applications/plot_model_complexity_influence.rst.txt

+15 −15
