doarthon
diff --git a/‎.nojekyll b/‎.nojekyll
diff --git a/‎dev/.buildinfo
Lines changed: 4 additions & 0 deletions b/‎dev/.buildinfo
Lines changed: 4 additions & 0 deletions
diff --git a/‎dev/_downloads/bicluster_newsgroups.py
Lines changed: 182 additions & 0 deletions b/‎dev/_downloads/bicluster_newsgroups.py
Lines changed: 182 additions & 0 deletions
diff --git a/‎dev/_downloads/digits_classification_exercise.py
Lines changed: 33 additions & 0 deletions b/‎dev/_downloads/digits_classification_exercise.py
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: e63c2b988a3b44ccb911a6df83005e54
+tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -0,0 +1,182 @@
+"""
+================================================================
+Biclustering documents with the Spectral Co-clustering algorithm
+================================================================
+
+This example demonstrates the Spectral Co-clustering algorithm on the
+twenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is
+excluded because it contains many posts containing nothing but data.
+
+The TF-IDF vectorized posts form a word frequency matrix, which is
+then biclustered using Dhillon's Spectral Co-Clustering algorithm. The
+resulting document-word biclusters indicate subsets words used more
+often in those subsets documents.
+
+For a few of the best biclusters, its most common document categories
+and its ten most important words get printed. The best biclusters are
+determined by their normalized cut. The best words are determined by
+comparing their sums inside and outside the bicluster.
+
+For comparison, the documents are also clustered using
+MiniBatchKMeans. The document clusters derived from the biclusters
+achieve a better V-measure than clusters found by MiniBatchKMeans.
+
+Output::
+
+    Vectorizing...
+    Coclustering...
+    Done in 9.53s. V-measure: 0.4455
+    MiniBatchKMeans...
+    Done in 12.00s. V-measure: 0.3309
+
+    Best biclusters:
+    ----------------
+    bicluster 0 : 1951 documents, 4373 words
+    categories   : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med
+    words        : gun, guns, geb, banks, firearms, drugs, gordon, clinton, cdt, amendment
+
+    bicluster 1 : 1165 documents, 3304 words
+    categories   : 29% talk.politics.mideast, 26% soc.religion.christian, 25% alt.atheism
+    words        : god, jesus, christians, atheists, kent, sin, morality, belief, resurrection, marriage
+
+    bicluster 2 : 2219 documents, 2830 words
+    categories   : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware, 16% comp.graphics
+    words        : voltage, dsp, board, receiver, circuit, shipping, packages, stereo, compression, package
+
+    bicluster 3 : 1860 documents, 2745 words
+    categories   : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale
+    words        : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw, bikes
+
+    bicluster 4 : 12 documents, 155 words
+    categories   : 100% rec.sport.hockey
+    words        : scorer, unassisted, reichel, semak, sweeney, kovalenko, ricci, audette, momesso, nedved
+
+"""
+from __future__ import print_function
+
+print(__doc__)
+
+from collections import defaultdict
+import operator
+import re
+from time import time
+
+import numpy as np
+
+from sklearn.cluster.bicluster import SpectralCoclustering
+from sklearn.cluster import MiniBatchKMeans
+from sklearn.externals.six import iteritems
+from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.cluster import v_measure_score
+
+
+def number_aware_tokenizer(doc):
+    """ Tokenizer that maps all numeric tokens to a placeholder.
+
+    For many applications, tokens that begin with a number are not directly
+    useful, but the fact that such a token exists can be relevant.  By applying
+    this form of dimensionality reduction, some methods may perform better.
+    """
+    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
+    tokens = token_pattern.findall(doc)
+    tokens = ["#NUMBER" if token[0] in "0123456789_" else token
+              for token in tokens]
+    return tokens
+
+# exclude 'comp.os.ms-windows.misc'
+categories = ['alt.atheism', 'comp.graphics',
+              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
+              'comp.windows.x', 'misc.forsale', 'rec.autos',
+              'rec.motorcycles', 'rec.sport.baseball',
+              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
+              'sci.med', 'sci.space', 'soc.religion.christian',
+              'talk.politics.guns', 'talk.politics.mideast',
+              'talk.politics.misc', 'talk.religion.misc']
+newsgroups = fetch_20newsgroups(categories=categories)
+y_true = newsgroups.target
+
+vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
+                             tokenizer=number_aware_tokenizer)
+cocluster = SpectralCoclustering(n_clusters=len(categories),
+                                 svd_method='arpack', random_state=0)
+kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
+                         random_state=0)
+
+print("Vectorizing...")
+X = vectorizer.fit_transform(newsgroups.data)
+
+print("Coclustering...")
+start_time = time()
+cocluster.fit(X)
+y_cocluster = cocluster.row_labels_
+print("Done in {:.2f}s. V-measure: {:.4f}".format(
+    time() - start_time,
+    v_measure_score(y_cocluster, y_true)))
+
+print("MiniBatchKMeans...")
+start_time = time()
+y_kmeans = kmeans.fit_predict(X)
+print("Done in {:.2f}s. V-measure: {:.4f}".format(
+    time() - start_time,
+    v_measure_score(y_kmeans, y_true)))
+
+feature_names = vectorizer.get_feature_names()
+document_names = list(newsgroups.target_names[i] for i in newsgroups.target)
+
+
+def bicluster_ncut(i):
+    rows, cols = cocluster.get_indices(i)
+    if not (np.any(rows) and np.any(cols)):
+        import sys
+        return sys.float_info.max
+    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]
+    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]
+    weight = X[rows[:, np.newaxis], cols].sum()
+    cut = (X[row_complement[:, np.newaxis], cols].sum() +
+           X[rows[:, np.newaxis], col_complement].sum())
+    return cut / weight
+
+
+def most_common(d):
+    """Items of a defaultdict(int) with the highest values.
+
+    Like Counter.most_common in Python >=2.7.
+    """
+    return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)
+
+
+bicluster_ncuts = list(bicluster_ncut(i)
+                       for i in range(len(newsgroups.target_names)))
+best_idx = np.argsort(bicluster_ncuts)[:5]
+
+print()
+print("Best biclusters:")
+print("----------------")
+for idx, cluster in enumerate(best_idx):
+    n_rows, n_cols = cocluster.get_shape(cluster)
+    cluster_docs, cluster_words = cocluster.get_indices(cluster)
+    if not len(cluster_docs) or not len(cluster_words):
+        continue
+
+    # categories
+    counter = defaultdict(int)
+    for i in cluster_docs:
+        counter[document_names[i]] += 1
+    cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name)
+                           for name, c in most_common(counter)[:3])
+
+    # words
+    out_of_cluster_docs = cocluster.row_labels_ != cluster
+    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
+    word_col = X[:, cluster_words]
+    word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
+                           word_col[out_of_cluster_docs, :].sum(axis=0))
+    word_scores = word_scores.ravel()
+    important_words = list(feature_names[cluster_words[i]]
+                           for i in word_scores.argsort()[:-11:-1])
+
+    print("bicluster {} : {} documents, {} words".format(
+        idx, n_rows, n_cols))
+    print("categories   : {}".format(cat_string))
+    print("words        : {}\n".format(', '.join(important_words)))
@@ -0,0 +1,33 @@
+"""
+================================
+Digits Classification Exercise
+================================
+
+A tutorial exercise regarding the use of classification techniques on
+the Digits dataset.
+
+This exercise is used in the :ref:`clf_tut` part of the
+:ref:`supervised_learning_tut` section of the
+:ref:`stat_learn_tut_index`.
+"""
+print(__doc__)
+
+from sklearn import datasets, neighbors, linear_model
+
+digits = datasets.load_digits()
+X_digits = digits.data
+y_digits = digits.target
+
+n_samples = len(X_digits)
+
+X_train = X_digits[:.9 * n_samples]
+y_train = y_digits[:.9 * n_samples]
+X_test = X_digits[.9 * n_samples:]
+y_test = y_digits[.9 * n_samples:]
+
+knn = neighbors.KNeighborsClassifier()
+logistic = linear_model.LogisticRegression()
+
+print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
+print('LogisticRegression score: %f'
+      % logistic.fit(X_train, y_train).score(X_test, y_test))