{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Various Agglomerative Clustering on a 2D embedding of digits\n\nAn illustration of various linkage option for agglomerative clustering on\na 2D embedding of the digits dataset.\n\nThe goal of this example is to show intuitively how the metrics behave, and\nnot to find good clusters for the digits. This is why the example works on a\n2D embedding.\n\nWhat this example shows us is the behavior \"rich getting richer\" of\nagglomerative clustering that tends to create uneven cluster sizes.\nThis behavior is pronounced for the average linkage strategy,\nthat ends up with a couple of singleton clusters, while in the case\nof single linkage we get a single central cluster with all other clusters\nbeing drawn from noise points around the fringes.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Authors: Gael Varoquaux\n# License: BSD 3 clause (C) INRIA 2014\n\nfrom time import time\n\nimport numpy as np\nfrom scipy import ndimage\nfrom matplotlib import pyplot as plt\n\nfrom sklearn import manifold, datasets\n\nX, y = datasets.load_digits(return_X_y=True)\nn_samples, n_features = X.shape\n\nnp.random.seed(0)\n\n\ndef nudge_images(X, y):\n # Having a larger dataset shows more clearly the behavior of the\n # methods, but we multiply the size of the dataset only by 2, as the\n # cost of the hierarchical clustering methods are strongly\n # super-linear in n_samples\n shift = lambda x: ndimage.shift(\n x.reshape((8, 8)), 0.3 * np.random.normal(size=2), mode=\"constant\"\n ).ravel()\n X = np.concatenate([X, np.apply_along_axis(shift, 1, X)])\n Y = np.concatenate([y, y], axis=0)\n return X, Y\n\n\nX, y = nudge_images(X, y)\n\n\n# ----------------------------------------------------------------------\n# Visualize the clustering\ndef plot_clustering(X_red, labels, title=None):\n x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)\n X_red = (X_red - x_min) / (x_max - x_min)\n\n plt.figure(figsize=(6, 4))\n for i in range(X_red.shape[0]):\n plt.text(\n X_red[i, 0],\n X_red[i, 1],\n str(y[i]),\n color=plt.cm.nipy_spectral(labels[i] / 10.0),\n fontdict={\"weight\": \"bold\", \"size\": 9},\n )\n\n plt.xticks([])\n plt.yticks([])\n if title is not None:\n plt.title(title, size=17)\n plt.axis(\"off\")\n plt.tight_layout(rect=[0, 0.03, 1, 0.95])\n\n\n# ----------------------------------------------------------------------\n# 2D embedding of the digits dataset\nprint(\"Computing embedding\")\nX_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)\nprint(\"Done.\")\n\nfrom sklearn.cluster import AgglomerativeClustering\n\nfor linkage in (\"ward\", \"average\", \"complete\", \"single\"):\n clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)\n t0 = time()\n clustering.fit(X_red)\n print(\"%s :\\t%.2fs\" % (linkage, time() - t0))\n\n plot_clustering(X_red, clustering.labels_, \"%s linkage\" % linkage)\n\n\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 0 }