{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Cross-validation on diabetes Dataset Exercise\n\nA tutorial exercise which uses cross-validation with linear models.\n\nThis exercise is used in the `cv_estimators_tut` part of the\n`model_selection_tut` section of the `stat_learn_tut_index`.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load dataset and apply GridSearchCV\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn import datasets\nfrom sklearn.linear_model import Lasso\nfrom sklearn.model_selection import GridSearchCV\n\nX, y = datasets.load_diabetes(return_X_y=True)\nX = X[:150]\ny = y[:150]\n\nlasso = Lasso(random_state=0, max_iter=10000)\nalphas = np.logspace(-4, -0.5, 30)\n\ntuned_parameters = [{\"alpha\": alphas}]\nn_folds = 5\n\nclf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)\nclf.fit(X, y)\nscores = clf.cv_results_[\"mean_test_score\"]\nscores_std = clf.cv_results_[\"std_test_score\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot error lines showing +/- std. errors of the scores\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.figure().set_size_inches(8, 6)\nplt.semilogx(alphas, scores)\n\nstd_error = scores_std / np.sqrt(n_folds)\n\nplt.semilogx(alphas, scores + std_error, \"b--\")\nplt.semilogx(alphas, scores - std_error, \"b--\")\n\n# alpha=0.2 controls the translucency of the fill color\nplt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)\n\nplt.ylabel(\"CV score +/- std error\")\nplt.xlabel(\"alpha\")\nplt.axhline(np.max(scores), linestyle=\"--\", color=\".5\")\nplt.xlim([alphas[0], alphas[-1]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bonus: how much can you trust the selection of alpha?\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# To answer this question we use the LassoCV object that sets its alpha\n# parameter automatically from the data by internal cross-validation (i.e. it\n# performs cross-validation on the training data it receives).\n# We use external cross-validation to see how much the automatically obtained\n# alphas differ across different cross-validation folds.\n\nfrom sklearn.linear_model import LassoCV\nfrom sklearn.model_selection import KFold\n\nlasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=10000)\nk_fold = KFold(3)\n\nprint(\"Answer to the bonus question:\", \"how much can you trust the selection of alpha?\")\nprint()\nprint(\"Alpha parameters maximising the generalization score on different\")\nprint(\"subsets of the data:\")\nfor k, (train, test) in enumerate(k_fold.split(X, y)):\n lasso_cv.fit(X[train], y[train])\n print(\n \"[fold {0}] alpha: {1:.5f}, score: {2:.5f}\".format(\n k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])\n )\n )\nprint()\nprint(\"Answer: Not very much since we obtained different alphas for different\")\nprint(\"subsets of the data and moreover, the scores for these alphas differ\")\nprint(\"quite substantially.\")\n\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" } }, "nbformat": 4, "nbformat_minor": 0 }