diff --git a/sklearn_extra/cluster/minmax_linkage_tutorial.ipynb b/sklearn_extra/cluster/minmax_linkage_tutorial.ipynb new file mode 100644 index 00000000..c4e1c3d1 --- /dev/null +++ b/sklearn_extra/cluster/minmax_linkage_tutorial.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook briefly explains [minimax linkage](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4527350/) in hierarchical clustering and provides a naive implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# What is minimax linkage?\n", + "\n", + "The minimax radius of a cluster $C$ is defined as:
\n", + "$r(C) = \\min_{x \\in C} {d_{max}(x,C)}$\n", + "\n", + "In other words, for each observation $x$ in the cluster $C$, the distance $d_{max}(x,C)$ to its furthest neighbor in $C$ is calculated. The minimum of these distances over all observations in $C$ is the radius of the cluster, and an observation that attains this minimum is called the prototype of $C$.\n", + "\n", + "Note that a (closed) ball of radius $r(C)$ centered at the prototype covers all of $C$." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, the minimax linkage between two clusters $G$ and $H$ is defined as follows:
def agglemorative_clustering_minmax(X, affinity='euclidean', n_clusters=2):
    """
    Naive agglomerative clustering with minimax linkage.

    Every sample starts as its own cluster. At each step the two clusters
    G and H with the smallest minimax linkage d(G, H) = r(G u H) are merged,
    where r(C) = min_{x in C} max_{y in C} d(x, y), until ``n_clusters``
    clusters remain.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features) or (n_samples, n_samples)
        Feature matrix, or the square pairwise distance matrix when
        ``affinity="precomputed"``. The input is never modified.

    affinity : "precomputed" or "euclidean" (default)
        Metric used to compute the distance between any two samples.

    n_clusters : int, default=2
        The number of clusters to find. When it is not smaller than the
        number of samples, no merge happens and every sample is returned
        as its own cluster.

    Returns
    -------
    groups : dict
        Maps a cluster key to the list of sample indices in that cluster.
        An unmerged sample keeps its own index as key; the m-th merge
        (counting from 0) is stored under the key ``n_samples + m``.

    Raises
    ------
    ValueError
        If ``n_clusters`` is smaller than 1, ``affinity`` is not supported,
        a precomputed ``X`` is not a square 2-D matrix, or every remaining
        inter-cluster distance is infinite so no pair can be merged.
    """
    if n_clusters < 1:
        raise ValueError(
            "n_clusters must be at least 1, got {!r}".format(n_clusters)
        )

    # calculating distance matrix D
    if affinity == "precomputed":
        # A float copy leaves the caller's matrix untouched, accepts nested
        # lists, and lets integer matrices hold the `inf` written below.
        D = np.array(X, dtype=float)
        if D.ndim != 2 or D.shape[0] != D.shape[1]:
            raise ValueError(
                "a precomputed X must be a square distance matrix, "
                "got shape {}".format(D.shape)
            )
    elif affinity == "euclidean":
        D = pairwise_distances(X)
    else:
        raise ValueError(
            "affinity must be 'precomputed' or 'euclidean', "
            "got {!r}".format(affinity)
        )

    # initial number of clusters equals the number of samples
    n_samples = D.shape[0]

    # Working matrix of linkages between the *current* clusters. Its rows and
    # columns follow the insertion order of `groups`. D itself stays intact
    # because the minimax radius is always evaluated on sample distances.
    linkage = D.copy()
    np.fill_diagonal(linkage, np.inf)  # a cluster is never merged with itself

    groups = {idx: [idx] for idx in range(n_samples)}

    # Merged clusters get fresh keys starting at n_samples, so they can never
    # collide with the keys of the initial singletons.
    new_key = n_samples

    while len(groups) > n_clusters:
        keys = list(groups)

        # Closest pair of clusters: the first minimum in row-major order.
        row, col = np.unravel_index(np.argmin(linkage), linkage.shape)
        if row == col:
            # Only possible when every entry of the matrix is +inf.
            raise ValueError(
                "cannot merge further: all remaining inter-cluster "
                "distances are infinite"
            )
        members = groups.pop(keys[row]) + groups.pop(keys[col])

        # Drop the two merged clusters and append one row/column for the
        # new cluster at the end (matching its position in `groups`).
        linkage = np.delete(linkage, [row, col], axis=0)
        linkage = np.delete(linkage, [row, col], axis=1)
        n_rest = len(groups)
        grown = np.full((n_rest + 1, n_rest + 1), np.inf)
        grown[:-1, :-1] = linkage

        for pos, other in enumerate(groups.values()):
            union = members + other
            # minimax radius of the union: min over points of their
            # distance to the furthest point of the union
            radius = np.min(np.max(D[np.ix_(union, union)], axis=1))
            grown[pos, -1] = radius
            grown[-1, pos] = radius

        groups[new_key] = members
        linkage = grown

        # in the next run, use a new key
        new_key += 1

    return groups
"iVBORw0KGgoAAAANSUhEUgAAAWoAAAD4CAYAAADFAawfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAANBUlEQVR4nO3dX6ikd33H8c9n/4Q4MSEXmVpxs3sqlNAgmJhDMCxI3Qa7/sHS0oIyelGEc6MSQRDD3tSLpXeSXpTCkKQWMlVEDS3RRlNMCAGbdDbZaOJGsGF3XWLZCRI0PaDEfHoxs+yfnN3zzDnPnPnO7PsFhznzzJOZ748k733yzDMZJxEAoK5d8x4AAHBlhBoAiiPUAFAcoQaA4gg1ABS3ZxZPetNNN2VlZWUWTw0AS+nYsWOvJulu9NhMQr2ysqLhcDiLpwaApWT71OUe49QHABRHqAGgOEINAMURagAojlADQHGNQm37Rtvfsv2S7RO275r1YACwKAYDaWVF2rVrfDsYtPv8TS/P+wdJjyb5a9vXSOq0OwYALKbBQFpbk9bXx/dPnRrfl6Rer53X2PSI2vYNkj4g6QFJSvK7JK+18/IAsNiOHDkf6XPW18fb29Lk1Me7JY0k/bPt52zfb/u6S3eyvWZ7aHs4Go3amxAACjt9errtW9Ek1HskvU/SPyW5XdL/SfrypTsl6SdZTbLa7W74KUgAWDr790+3fSuahPqMpDNJnp7c/5bG4QaAq97Ro1LnknftOp3x9rZsGuok/yvpF7ZvmWz6M0k/bW8EAFhcvZ7U70sHDkj2+Lbfb++NRKn5VR+flzSYXPHxsqS/bW8EAFhsvV67Yb5Uo1AnOS5pdXZjAAAuh08mAkBxhBoAiiPUAFAcoQaA4gg1ABRHqAGgOEINAMURagAojlADQHGEGgCKI9QAUByhBoDiCDUAFEeoAaA4Qg0AxRFqACiOUANAcYQaAIoj1ABQHKEGgOIINQAUR6gBoDhCDQDFEWoAKK5RqG2ftP0T28dtD2c9FAAsksFAWlmRdu0a3w4G7T7/nin2/WCSV9t9eQBYbIOBtLYmra+P7586Nb4vSb1eO6/BqQ8A2IYjR85H+pz19fH2tjQNdST9wPYx22sb7WB7zfbQ9nA0GrU3IQAUdvr0dNu3ommoDyZ5n6QPS/qs7Q9cukOSfpLVJKvdbre9CQGgsP37p9u+FY1CneSVye1ZSQ9LurO9EQBgcR09KnU6F2/rdMbb27JpqG1fZ/v6c79L+pCkF9obAQAWV68n9fvSgQOSPb7t99t7I1FqdtXHOyQ9bPvc/v+a5NH2RgCAxdbrtRvmS20a6iQvS3rv7EYAAFwJl+cBQHGEGgCKI9QAUByhBoDiCDUAFEeoAaA4Qg0AxRFqACiOUANAcYQaAIoj1ABQHKEGgOIINQAUR6gBoDhCDQDFEWoAKI5QA0BxhBoAiiPUAFAcoQaA4gg1ABRHqAGgOEINAMURagAornGobe+2/ZztR2Y5EAAsmsFAWlmRdu0a3w4G7T7/nin2vUfSCUk3tDsCACyuwUBaW5PW18f3T50a35ekXq+d12h0RG17n6SPSrq/nZcFgOVw5Mj5SJ+zvj7e3pampz7uk/QlSW9ebgfba7aHtoej0aiN2QCgvNOnp9u+FZuG2vbHJJ1NcuxK+yXpJ1lNstrtdlsbEAAq279/uu1b0eSI+qCkj9s+Kekbkg7Zfqi9EQBgcR09KnU6F2/rdMbb27JpqJPcm2RfkhVJn5D0wySfam8EAFhcvZ7U70sHDkj2+Lbfb++NRGm6qz4AABvo9doN86WmCnWSJyQ9MZNJAAAb4pOJAFAcoQaA4gg1ABRHqAGgOEINAMURagAojlADQHGEGgCKI9QAUByhBoDiCDUAFEeoAaA4Qg0AxRFqACiOUANAcYQaAIoj1ABQHKEGgOIINQAUR6gBoDhCDQDFEWoAKI5QA0BxhBoAits01Lavtf2M7edtv2j7KzsxGABgbE+
DfX4r6VCS123vlfSU7f9I8l8zng0AoAahThJJr0/u7p38ZJZDAQDOa3SO2vZu28clnZX0WJKnN9hnzfbQ9nA0GrU8JgBcvRqFOsnvk9wmaZ+kO22/Z4N9+klWk6x2u92WxwSAq9dUV30keU3SE5IOz2IYAMBbNbnqo2v7xsnvb5N0t6SXZjwXAGCiyVUf75T0L7Z3axz2byZ5ZLZjAQDOaXLVx48l3b4DswAANsAnEwGgOEINAMURagAojlADQHGEGgCKI9QAUByhBoDiCDUAFEeoAaA4Qg0AxRFqACiOUANAcYQaAIoj1ABQHKEGgOIINQAUR6gBoDhCDQDFEWoAKI5QA0BxhBoAiiPUAFAcoQaA4gg1ABS3aaht32z7cdsnbL9o+56dGAwAMLanwT5vSPpikmdtXy/pmO3Hkvx0xrMBANTgiDrJL5M8O/n9N5JOSHrXrAcDAIxNdY7a9oqk2yU9vcFja7aHtoej0ail8QAAjUNt++2Svi3pC0l+fenjSfpJVpOsdrvdNmcEgKtao1Db3qtxpAdJvjPbkQAAF2py1YclPSDpRJKvzn4kAMCFmhxRH5T0aUmHbB+f/HxkxnMBACY2vTwvyVOSvAOzAAA2wCcTAaA4Qg0AxRFqACiOUANAcYQaAIoj1ABQHKEGgOIINQAUR6gBoDhCDQDFEWoAKI5QA0BxhBoAiiPUAFAcoQaA4gg1ABRHqAGgOEINAMURagAojlADQHGEGgCKI9QAUByhBoDiCDUAFLdpqG0/aPus7Rd2YiAAwMWaHFF/TdLhGc8BALiMTUOd5ElJv9qBWQAAG2jtHLXtNdtD28PRaNTW0wLAVa+1UCfpJ1lNstrtdtt6WgC46nHVBwAUR6gBoLgml+d9XdKPJN1i+4ztz8x+LADAOXs22yHJJ3diEADAxjj1AQDFEWoAKI5QA0BxhBoAiiPUAFAcoQaA4gg1ABRHqAGgOEINAMURagAojlADQHGEGgCKI9QAUByhBoDiCDUAFEeoAaA4Qg0AxRFqACiOUANAcYQaAIoj1ABQHKEGgOIINQAUR6gBoLhGobZ92PbPbP/c9pdnMslgIK2sSLt2jW8Hg5m8zI5YlrUsyzqARZfkij+Sdkv6H0nvlnSNpOcl3Xqlv+aOO+7IVB56KOl0Eun8T6cz3r5olmUty7IOYEFIGuYyTfX48cuzfZekv0vy55P7904C//eX+2tWV1czHA6b/2mxsiKdOvXW7QcOSCdPNn+eCpZlLcuyDmBB2D6WZHWjx5qc+niXpF9ccP/MZNulL7Jme2h7OBqNppvw9Onptle2LGtZlnUAS6BJqL3BtrcchifpJ1lNstrtdqebYv/+6bZXtixrWZZ1AEugSajPSLr5gvv7JL3S6hRHj0qdzsXbOp3x9kWzLGtZlnUAS6BJqP9b0h/b/iPb10j6hKR/b3WKXk/q98fnP+3xbb8/3r5olmUty7IOYAls+maiJNn+iKT7NL4C5MEkVzysmvrNRAC4yl3pzcQ9TZ4gyfckfa/VqQAAjfDJRAAojlADQHGEGgCKI9QAUFyjqz6mflJ7JGmDzx83cpOkV1scZ56WZS3Lsg6JtVS0LOuQtreWA0k2/LTgTEK9HbaHl7tEZdEsy1qWZR0Sa6loWdYhzW4tnPoAgOIINQAUVzHU/XkP0KJlWcuyrENiLRUtyzqkGa2l3DlqAMDFKh5RAwAuQKgBoLiSobb9N7ZftP2m7YW7bGdHvgx4B9h+0PZZ2y/Me5btsn2z7cdtn5j8s3XPvGfaCtvX2n7G9vOTdXxl3jNtl+3dtp+z/ci8Z9kO2ydt/8T2cdut/u9DS4Za0guS/krSk/MeZFq2d0v6R0kflnSrpE/avnW+U23Z1yQdnvcQLXlD0heT/Imk90v67IL+ffmtpENJ3ivpNkmHbb9/viNt2z2STsx7iJZ8MMltbV9LXTLUSU4k+dm859iiOyX9PMnLSX4n6RuS/mLOM21Jkicl/Wrec7QhyS+
TPDv5/Tcah+Et3/1Z3eQLq1+f3N07+VnYKwJs75P0UUn3z3uWykqGesE1+jJgzI/tFUm3S3p6zqNsyeRUwXFJZyU9lmQh1zFxn6QvSXpzznO0IZJ+YPuY7bU2n7jRFwfMgu3/lPSHGzx0JMm/7fQ8LWr0ZcCYD9tvl/RtSV9I8ut5z7MVSX4v6TbbN0p62PZ7kizc+wi2PybpbJJjtv90zuO04WCSV2z/gaTHbL80+a/SbZtbqJPcPa/XnrHZfxkwtsT2Xo0jPUjynXnPs11JXrP9hMbvIyxcqCUdlPTxyVf9XSvpBtsPJfnUnOfakiSvTG7P2n5Y49OgrYSaUx/tm/2XAWNqti3pAUknknx13vNsle3u5Ehatt8m6W5JL811qC1Kcm+SfUlWNP735IeLGmnb19m+/tzvkj6kFv/wLBlq239p+4ykuyR91/b35z1TU0nekPQ5Sd/X+A2rbyZ5cb5TbY3tr0v6kaRbbJ+x/Zl5z7QNByV9WtKhyeVTxydHcovmnZIet/1jjQ8KHkuy0Je1LYl3SHrK9vOSnpH03SSPtvXkfIQcAIoreUQNADiPUANAcYQaAIoj1ABQHKEGgOIINQAUR6gBoLj/B0OCeS/H9hoPAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
# Two well-separated groups of three points each.
toy_data = np.array([
    [-1, 0], [0, 0], [1, 0],
    [5, 4], [5, 5], [5, 6],
])

# Standardise each feature before clustering.
sc = StandardScaler()
X = sc.fit_transform(toy_data)
clusters = agglemorative_clustering_minmax(X, affinity='euclidean', n_clusters=2)

color_of_clusters = ['r', 'b']

# Plot every sample at its original (unscaled) coordinates, one colour per cluster.
for cluster_color, members in zip(color_of_clusters, clusters.values()):
    for j in members:
        point_x, point_y = toy_data[j]
        plt.scatter(x=point_x, y=point_y, color=cluster_color)

plt.show()