{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Classifier comparison\n\nA comparison of a several classifiers in scikit-learn on synthetic datasets.\nThe point of this example is to illustrate the nature of decision boundaries\nof different classifiers.\nThis should be taken with a grain of salt, as the intuition conveyed by\nthese examples does not necessarily carry over to real datasets.\n\nParticularly in high-dimensional spaces, data can more easily be separated\nlinearly and the simplicity of classifiers such as naive Bayes and linear SVMs\nmight lead to better generalization than is achieved by other classifiers.\n\nThe plots show training points in solid colors and testing points\nsemi-transparent. The lower right shows the classification accuracy on the test\nset.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Code source: Ga\u00ebl Varoquaux\n# Andreas M\u00fcller\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.datasets import make_moons, make_circles, make_classification\nfrom sklearn.neural_network import MLPClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.gaussian_process.kernels import RBF\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.inspection import DecisionBoundaryDisplay\n\nnames = [\n \"Nearest Neighbors\",\n \"Linear SVM\",\n \"RBF SVM\",\n \"Gaussian Process\",\n \"Decision Tree\",\n \"Random Forest\",\n \"Neural Net\",\n \"AdaBoost\",\n \"Naive Bayes\",\n \"QDA\",\n]\n\nclassifiers = [\n KNeighborsClassifier(3),\n SVC(kernel=\"linear\", C=0.025),\n SVC(gamma=2, C=1),\n GaussianProcessClassifier(1.0 * RBF(1.0)),\n DecisionTreeClassifier(max_depth=5),\n RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n MLPClassifier(alpha=1, max_iter=1000),\n AdaBoostClassifier(),\n GaussianNB(),\n QuadraticDiscriminantAnalysis(),\n]\n\nX, y = make_classification(\n n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1\n)\nrng = np.random.RandomState(2)\nX += 2 * rng.uniform(size=X.shape)\nlinearly_separable = (X, y)\n\ndatasets = [\n make_moons(noise=0.3, random_state=0),\n make_circles(noise=0.2, factor=0.5, random_state=1),\n linearly_separable,\n]\n\nfigure = plt.figure(figsize=(27, 9))\ni = 1\n# iterate over datasets\nfor ds_cnt, ds in enumerate(datasets):\n # preprocess dataset, split into training and test part\n X, y = ds\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.4, random_state=42\n )\n\n x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5\n y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5\n\n # just plot the dataset first\n cm = plt.cm.RdBu\n cm_bright = ListedColormap([\"#FF0000\", \"#0000FF\"])\n ax = plt.subplot(len(datasets), len(classifiers) + 1, i)\n if ds_cnt == 0:\n ax.set_title(\"Input data\")\n # Plot the training points\n ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors=\"k\")\n # Plot the testing points\n ax.scatter(\n X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors=\"k\"\n )\n ax.set_xlim(x_min, x_max)\n ax.set_ylim(y_min, y_max)\n ax.set_xticks(())\n ax.set_yticks(())\n i += 1\n\n # iterate over classifiers\n for name, clf in zip(names, classifiers):\n ax = plt.subplot(len(datasets), len(classifiers) + 1, i)\n\n clf = make_pipeline(StandardScaler(), clf)\n clf.fit(X_train, y_train)\n score = clf.score(X_test, y_test)\n DecisionBoundaryDisplay.from_estimator(\n clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5\n )\n\n # Plot the training points\n ax.scatter(\n X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors=\"k\"\n )\n # Plot the testing points\n ax.scatter(\n X_test[:, 0],\n X_test[:, 1],\n c=y_test,\n cmap=cm_bright,\n edgecolors=\"k\",\n alpha=0.6,\n )\n\n ax.set_xlim(x_min, x_max)\n ax.set_ylim(y_min, y_max)\n ax.set_xticks(())\n ax.set_yticks(())\n if ds_cnt == 0:\n ax.set_title(name)\n ax.text(\n x_max - 0.3,\n y_min + 0.3,\n (\"%.2f\" % score).lstrip(\"0\"),\n size=15,\n horizontalalignment=\"right\",\n )\n i += 1\n\nplt.tight_layout()\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 0 }