- "import matplotlib.cm as cm\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.cluster import KMeans\nfrom sklearn.datasets import make_blobs\nfrom sklearn.metrics import silhouette_samples, silhouette_score\n\n# Generating the sample data from make_blobs\n# This particular setting has one distinct cluster and 3 clusters placed close\n# together.\nX, y = make_blobs(\n n_samples=500,\n n_features=2,\n centers=4,\n cluster_std=1,\n center_box=(-10.0, 10.0),\n shuffle=True,\n random_state=1,\n) # For reproducibility\n\nrange_n_clusters = [2, 3, 4, 5, 6]\n\nfor n_clusters in range_n_clusters:\n # Create a subplot with 1 row and 2 columns\n fig, (ax1, ax2) = plt.subplots(1, 2)\n fig.set_size_inches(18, 7)\n\n # The 1st subplot is the silhouette plot\n # The silhouette coefficient can range from -1, 1 but in this example all\n # lie within [-0.1, 1]\n ax1.set_xlim([-0.1, 1])\n # The (n_clusters+1)*10 is for inserting blank space between silhouette\n # plots of individual clusters, to demarcate them clearly.\n ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])\n\n # Initialize the clusterer with n_clusters value and a random generator\n # seed of 10 for reproducibility.\n clusterer = KMeans(n_clusters=n_clusters, n_init=\"auto\", random_state=10)\n cluster_labels = clusterer.fit_predict(X)\n\n # The silhouette_score gives the average value for all the samples.\n # This gives a perspective into the density and separation of the formed\n # clusters\n silhouette_avg = silhouette_score(X, cluster_labels)\n print(\n \"For n_clusters =\",\n n_clusters,\n \"The average silhouette_score is :\",\n silhouette_avg,\n )\n\n # Compute the silhouette scores for each sample\n sample_silhouette_values = silhouette_samples(X, cluster_labels)\n\n y_lower = 10\n for i in range(n_clusters):\n # Aggregate the silhouette scores for samples belonging to\n # cluster i, and sort them\n ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]\n\n ith_cluster_silhouette_values.sort()\n\n size_cluster_i = ith_cluster_silhouette_values.shape[0]\n y_upper = y_lower + size_cluster_i\n\n color = cm.nipy_spectral(float(i) / n_clusters)\n ax1.fill_betweenx(\n np.arange(y_lower, y_upper),\n 0,\n ith_cluster_silhouette_values,\n facecolor=color,\n edgecolor=color,\n alpha=0.7,\n )\n\n # Label the silhouette plots with their cluster numbers at the middle\n ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))\n\n # Compute the new y_lower for next plot\n y_lower = y_upper + 10 # 10 for the 0 samples\n\n ax1.set_title(\"The silhouette plot for the various clusters.\")\n ax1.set_xlabel(\"The silhouette coefficient values\")\n ax1.set_ylabel(\"Cluster label\")\n\n # The vertical line for average silhouette score of all the values\n ax1.axvline(x=silhouette_avg, color=\"red\", linestyle=\"--\")\n\n ax1.set_yticks([]) # Clear the yaxis labels / ticks\n ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])\n\n # 2nd Plot showing the actual clusters formed\n colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)\n ax2.scatter(\n X[:, 0], X[:, 1], marker=\".\", s=30, lw=0, alpha=0.7, c=colors, edgecolor=\"k\"\n )\n\n # Labeling the clusters\n centers = clusterer.cluster_centers_\n # Draw white circles at cluster centers\n ax2.scatter(\n centers[:, 0],\n centers[:, 1],\n marker=\"o\",\n c=\"white\",\n alpha=1,\n s=200,\n edgecolor=\"k\",\n )\n\n for i, c in enumerate(centers):\n ax2.scatter(c[0], c[1], marker=\"$%d$\" % i, alpha=1, s=50, edgecolor=\"k\")\n\n ax2.set_title(\"The visualization of the clustered data.\")\n ax2.set_xlabel(\"Feature space for the 1st feature\")\n ax2.set_ylabel(\"Feature space for the 2nd feature\")\n\n plt.suptitle(\n \"Silhouette analysis for KMeans clustering on sample data with n_clusters = %d\"\n % n_clusters,\n fontsize=14,\n fontweight=\"bold\",\n )\n\nplt.show()"
0 commit comments