|
| 1 | +""" |
| 2 | +================================================================ |
| 3 | +Biclustering documents with the Spectral Co-clustering algorithm |
| 4 | +================================================================ |
| 5 | +
|
| 6 | +This example demonstrates the Spectral Co-clustering algorithm on the |
| 7 | +twenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is |
| 8 | +excluded because it contains many posts containing nothing but data. |
| 9 | +
|
| 10 | +The TF-IDF vectorized posts form a word frequency matrix, which is |
| 11 | +then biclustered using Dhillon's Spectral Co-Clustering algorithm. The |
| 12 | +resulting document-word biclusters indicate subsets of words used more |
| 13 | +often in those subsets of documents. |
| 14 | +
|
| 15 | +For a few of the best biclusters, their most common document categories |
| 16 | +and their ten most important words get printed. The best biclusters are |
| 17 | +determined by their normalized cut. The best words are determined by |
| 18 | +comparing their sums inside and outside the bicluster. |
| 19 | +
|
| 20 | +For comparison, the documents are also clustered using |
| 21 | +MiniBatchKMeans. The document clusters derived from the biclusters |
| 22 | +achieve a better V-measure than clusters found by MiniBatchKMeans. |
| 23 | +
|
| 24 | +Output:: |
| 25 | +
|
| 26 | + Vectorizing... |
| 27 | + Coclustering... |
| 28 | + Done in 9.53s. V-measure: 0.4455 |
| 29 | + MiniBatchKMeans... |
| 30 | + Done in 12.00s. V-measure: 0.3309 |
| 31 | +
|
| 32 | + Best biclusters: |
| 33 | + ---------------- |
| 34 | + bicluster 0 : 1951 documents, 4373 words |
| 35 | + categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med |
| 36 | + words : gun, guns, geb, banks, firearms, drugs, gordon, clinton, cdt, amendment |
| 37 | +
|
| 38 | + bicluster 1 : 1165 documents, 3304 words |
| 39 | + categories : 29% talk.politics.mideast, 26% soc.religion.christian, 25% alt.atheism |
| 40 | + words : god, jesus, christians, atheists, kent, sin, morality, belief, resurrection, marriage |
| 41 | +
|
| 42 | + bicluster 2 : 2219 documents, 2830 words |
| 43 | + categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware, 16% comp.graphics |
| 44 | + words : voltage, dsp, board, receiver, circuit, shipping, packages, stereo, compression, package |
| 45 | +
|
| 46 | + bicluster 3 : 1860 documents, 2745 words |
| 47 | + categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale |
| 48 | + words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw, bikes |
| 49 | +
|
| 50 | + bicluster 4 : 12 documents, 155 words |
| 51 | + categories : 100% rec.sport.hockey |
| 52 | + words : scorer, unassisted, reichel, semak, sweeney, kovalenko, ricci, audette, momesso, nedved |
| 53 | +
|
| 54 | +""" |
| 55 | +from __future__ import print_function |
| 56 | + |
| 57 | +print(__doc__) |
| 58 | + |
| 59 | +from collections import defaultdict |
| 60 | +import operator |
| 61 | +import re |
| 62 | +from time import time |
| 63 | + |
| 64 | +import numpy as np |
| 65 | + |
| 66 | +from sklearn.cluster.bicluster import SpectralCoclustering |
| 67 | +from sklearn.cluster import MiniBatchKMeans |
| 68 | +from sklearn.externals.six import iteritems |
| 69 | +from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups |
| 70 | +from sklearn.feature_extraction.text import TfidfVectorizer |
| 71 | +from sklearn.metrics.cluster import v_measure_score |
| 72 | + |
| 73 | + |
def number_aware_tokenizer(doc):
    """Split ``doc`` into word tokens, collapsing number-like tokens.

    A token is a run of two or more word characters delimited by word
    boundaries. Every token whose first character is an ASCII digit or an
    underscore is replaced by the single placeholder ``"#NUMBER"``; all
    other tokens are returned unchanged, in document order.

    Individual numeric tokens are rarely informative on their own, but
    the presence of one can be, so folding them into one placeholder is a
    cheap form of dimensionality reduction.
    """
    words = re.findall(u'(?u)\\b\\w\\w+\\b', doc)
    result = []
    for word in words:
        if word[0] in "0123456789_":
            result.append("#NUMBER")
        else:
            result.append(word)
    return result
| 86 | + |
# Every newsgroup except 'comp.os.ms-windows.misc', which is left out
# because many of its posts contain nothing but data.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

# Both clusterers look for one cluster per newsgroup category.
n_clusters = len(categories)
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    tokenizer=number_aware_tokenizer)
cocluster = SpectralCoclustering(
    n_clusters=n_clusters,
    svd_method='arpack',
    random_state=0)
kmeans = MiniBatchKMeans(
    n_clusters=n_clusters,
    batch_size=20000,
    random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

# Co-clustering: the row labels are the document cluster assignments.
print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
elapsed = time() - start_time
score = v_measure_score(y_cocluster, y_true)
print("Done in {:.2f}s. V-measure: {:.4f}".format(elapsed, score))

# Baseline: cluster the same documents with MiniBatchKMeans.
print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
elapsed = time() - start_time
score = v_measure_score(y_kmeans, y_true)
print("Done in {:.2f}s. V-measure: {:.4f}".format(elapsed, score))

# Lookup tables used when reporting: column index -> word, and
# document index -> name of its newsgroup.
feature_names = vectorizer.get_feature_names()
document_names = [newsgroups.target_names[label]
                  for label in newsgroups.target]
| 126 | + |
| 127 | + |
def bicluster_ncut(i):
    """Return the normalized cut of bicluster ``i`` (lower is better).

    The score is ``cut / weight``. ``weight`` is the sum of the entries
    of the module-level matrix ``X`` inside the bicluster. ``cut`` is the
    sum of the entries that share the bicluster's columns but not its
    rows, plus those that share its rows but not its columns.

    Parameters
    ----------
    i : int
        Index of the bicluster in the fitted module-level ``cocluster``.

    Returns
    -------
    float
        The normalized cut, or ``sys.float_info.max`` when the bicluster
        has no rows, no columns, or zero total weight, so that such
        degenerate biclusters rank last.
    """
    import sys

    rows, cols = cocluster.get_indices(i)
    # ``rows`` and ``cols`` hold positions, so emptiness must be tested by
    # length: np.any() would wrongly treat a bicluster whose only row or
    # column is index 0 as empty.
    if len(rows) == 0 or len(cols) == 0:
        return sys.float_info.max
    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]
    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]
    weight = X[rows[:, np.newaxis], cols].sum()
    if weight == 0:
        # Avoid dividing by zero (inf/nan); rank the bicluster last.
        return sys.float_info.max
    cut = (X[row_complement[:, np.newaxis], cols].sum() +
           X[rows[:, np.newaxis], col_complement].sum())
    return cut / weight
| 139 | + |
| 140 | + |
def most_common(d):
    """Return the items of a mapping sorted by value, highest first.

    Equivalent to ``collections.Counter.most_common()`` called without an
    argument (available in Python >= 2.7).

    Parameters
    ----------
    d : dict
        Mapping from key to a sortable value, e.g. a ``defaultdict(int)``
        of counts.

    Returns
    -------
    list of (key, value) tuples
        Items in descending order of value. The sort is stable, so items
        with equal values keep the mapping's iteration order.
    """
    # dict.items() works on both Python 2 and 3, so the six shim is not
    # needed here.
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
| 147 | + |
| 148 | + |
# Score every bicluster and keep the five with the smallest normalized cut.
bicluster_ncuts = [bicluster_ncut(k)
                   for k in range(len(newsgroups.target_names))]
best_idx = np.argsort(bicluster_ncuts)[:5]

print()
print("Best biclusters:")
print("----------------")
for idx, cluster in enumerate(best_idx):
    n_rows, n_cols = cocluster.get_shape(cluster)
    cluster_docs, cluster_words = cocluster.get_indices(cluster)
    # Nothing to report for a bicluster without documents or words.
    if len(cluster_docs) == 0 or len(cluster_words) == 0:
        continue

    # categories: the three most frequent newsgroups among the
    # bicluster's documents, as a percentage of its documents
    counter = defaultdict(int)
    for doc in cluster_docs:
        counter[document_names[doc]] += 1
    top_categories = most_common(counter)[:3]
    cat_string = ", ".join(
        "{:.0f}% {}".format(float(count) / n_rows * 100, name)
        for name, count in top_categories)

    # words: score each word of the bicluster by its column sum over the
    # bicluster's documents minus its column sum over all other documents
    outside_mask = cocluster.row_labels_ != cluster
    out_of_cluster_docs = np.where(outside_mask)[0]
    word_col = X[:, cluster_words]
    inside_sum = word_col[cluster_docs, :].sum(axis=0)
    outside_sum = word_col[out_of_cluster_docs, :].sum(axis=0)
    word_scores = np.array(inside_sum - outside_sum).ravel()
    # the ten highest-scoring words, best first
    top_positions = word_scores.argsort()[:-11:-1]
    important_words = [feature_names[cluster_words[pos]]
                       for pos in top_positions]

    print("bicluster {} : {} documents, {} words".format(
        idx, n_rows, n_cols))
    print("categories : {}".format(cat_string))
    print("words : {}\n".format(', '.join(important_words)))
0 commit comments