{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Table of Contents\n", "

1 - Example of GibbsLDA and vbLDA
1.1 - Loading Reuters corpus from NLTK
1.2 - Inference through Gibbs sampling
1.2.1 - Print top 10 probability words for each topic
1.3 - Inference through Variational Bayes
1.3.1 - Print top 10 probability words for each topic
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Example of GibbsLDA and vbLDA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This example requires three NLTK corpora to be installed: nltk.corpus.reuters, nltk.corpus.words, and nltk.corpus.stopwords.\n", "\n", "You can download the corpora via `nltk.download()`." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import logging\n", "\n", "import numpy as np\n", "from ptm import GibbsLDA\n", "from ptm import vbLDA\n", "from ptm.nltk_corpus import get_reuters_ids_cnt\n", "from ptm.utils import convert_cnt_to_list, get_top_words" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading Reuters corpus from NLTK" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load 1000 documents of the Reuters corpus from NLTK, with a maximum vocabulary size of 10000." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Vocabulary size:4632\n" ] } ], "source": [ "n_doc = 1000\n", "voca, doc_ids, doc_cnt = get_reuters_ids_cnt(num_doc=n_doc, max_voca=10000)\n", "docs = convert_cnt_to_list(doc_ids, doc_cnt)\n", "n_voca = len(voca)\n", "print('Vocabulary size:%d' % n_voca)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Inference through Gibbs sampling" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2016-02-10 19:42:01 INFO:GibbsLDA:[ITER] 0,\telapsed time:0.86,\tlog_likelihood:-447909.18\n", "2016-02-10 19:42:02 INFO:GibbsLDA:[ITER] 1,\telapsed time:0.89,\tlog_likelihood:-421738.22\n", "2016-02-10 19:42:03 INFO:GibbsLDA:[ITER] 2,\telapsed time:0.94,\tlog_likelihood:-405181.71\n", "2016-02-10 19:42:04 INFO:GibbsLDA:[ITER] 3,\telapsed time:0.87,\tlog_likelihood:-393867.42\n", "2016-02-10 19:42:05 
INFO:GibbsLDA:[ITER] 4,\telapsed time:0.90,\tlog_likelihood:-385570.47\n", "2016-02-10 19:42:06 INFO:GibbsLDA:[ITER] 5,\telapsed time:0.90,\tlog_likelihood:-379114.11\n", "2016-02-10 19:42:07 INFO:GibbsLDA:[ITER] 6,\telapsed time:0.92,\tlog_likelihood:-374416.99\n", "2016-02-10 19:42:08 INFO:GibbsLDA:[ITER] 7,\telapsed time:0.90,\tlog_likelihood:-371338.53\n", "2016-02-10 19:42:09 INFO:GibbsLDA:[ITER] 8,\telapsed time:0.88,\tlog_likelihood:-368035.03\n", "2016-02-10 19:42:10 INFO:GibbsLDA:[ITER] 9,\telapsed time:0.93,\tlog_likelihood:-365556.67\n", "2016-02-10 19:42:11 INFO:GibbsLDA:[ITER] 10,\telapsed time:0.87,\tlog_likelihood:-363627.94\n", "2016-02-10 19:42:11 INFO:GibbsLDA:[ITER] 11,\telapsed time:0.84,\tlog_likelihood:-362118.57\n", "2016-02-10 19:42:12 INFO:GibbsLDA:[ITER] 12,\telapsed time:0.83,\tlog_likelihood:-360546.38\n", "2016-02-10 19:42:13 INFO:GibbsLDA:[ITER] 13,\telapsed time:0.85,\tlog_likelihood:-359183.30\n", "2016-02-10 19:42:14 INFO:GibbsLDA:[ITER] 14,\telapsed time:0.96,\tlog_likelihood:-358050.27\n", "2016-02-10 19:42:15 INFO:GibbsLDA:[ITER] 15,\telapsed time:0.92,\tlog_likelihood:-357094.26\n", "2016-02-10 19:42:16 INFO:GibbsLDA:[ITER] 16,\telapsed time:0.87,\tlog_likelihood:-356045.67\n", "2016-02-10 19:42:17 INFO:GibbsLDA:[ITER] 17,\telapsed time:0.84,\tlog_likelihood:-355085.27\n", "2016-02-10 19:42:18 INFO:GibbsLDA:[ITER] 18,\telapsed time:0.85,\tlog_likelihood:-354129.45\n", "2016-02-10 19:42:18 INFO:GibbsLDA:[ITER] 19,\telapsed time:0.84,\tlog_likelihood:-353360.71\n", "2016-02-10 19:42:19 INFO:GibbsLDA:[ITER] 20,\telapsed time:0.90,\tlog_likelihood:-352636.22\n", "2016-02-10 19:42:20 INFO:GibbsLDA:[ITER] 21,\telapsed time:0.87,\tlog_likelihood:-352033.27\n", "2016-02-10 19:42:21 INFO:GibbsLDA:[ITER] 22,\telapsed time:0.84,\tlog_likelihood:-351298.31\n", "2016-02-10 19:42:22 INFO:GibbsLDA:[ITER] 23,\telapsed time:0.84,\tlog_likelihood:-351056.08\n", "2016-02-10 19:42:23 INFO:GibbsLDA:[ITER] 24,\telapsed 
time:0.83,\tlog_likelihood:-350554.31\n", "2016-02-10 19:42:24 INFO:GibbsLDA:[ITER] 25,\telapsed time:0.85,\tlog_likelihood:-350214.01\n", "2016-02-10 19:42:25 INFO:GibbsLDA:[ITER] 26,\telapsed time:0.84,\tlog_likelihood:-350201.01\n", "2016-02-10 19:42:25 INFO:GibbsLDA:[ITER] 27,\telapsed time:0.85,\tlog_likelihood:-349730.70\n", "2016-02-10 19:42:26 INFO:GibbsLDA:[ITER] 28,\telapsed time:0.91,\tlog_likelihood:-349007.47\n", "2016-02-10 19:42:27 INFO:GibbsLDA:[ITER] 29,\telapsed time:0.85,\tlog_likelihood:-349175.12\n", "2016-02-10 19:42:28 INFO:GibbsLDA:[ITER] 30,\telapsed time:0.88,\tlog_likelihood:-348863.94\n", "2016-02-10 19:42:29 INFO:GibbsLDA:[ITER] 31,\telapsed time:0.85,\tlog_likelihood:-348612.34\n", "2016-02-10 19:42:30 INFO:GibbsLDA:[ITER] 32,\telapsed time:0.90,\tlog_likelihood:-347934.48\n", "2016-02-10 19:42:31 INFO:GibbsLDA:[ITER] 33,\telapsed time:0.97,\tlog_likelihood:-347867.02\n", "2016-02-10 19:42:32 INFO:GibbsLDA:[ITER] 34,\telapsed time:0.95,\tlog_likelihood:-347414.72\n", "2016-02-10 19:42:33 INFO:GibbsLDA:[ITER] 35,\telapsed time:0.86,\tlog_likelihood:-347418.91\n", "2016-02-10 19:42:34 INFO:GibbsLDA:[ITER] 36,\telapsed time:0.96,\tlog_likelihood:-347124.65\n", "2016-02-10 19:42:35 INFO:GibbsLDA:[ITER] 37,\telapsed time:0.84,\tlog_likelihood:-346625.26\n", "2016-02-10 19:42:35 INFO:GibbsLDA:[ITER] 38,\telapsed time:0.83,\tlog_likelihood:-346294.68\n", "2016-02-10 19:42:36 INFO:GibbsLDA:[ITER] 39,\telapsed time:0.86,\tlog_likelihood:-346413.61\n", "2016-02-10 19:42:37 INFO:GibbsLDA:[ITER] 40,\telapsed time:0.98,\tlog_likelihood:-346242.04\n", "2016-02-10 19:42:38 INFO:GibbsLDA:[ITER] 41,\telapsed time:0.89,\tlog_likelihood:-346290.64\n", "2016-02-10 19:42:39 INFO:GibbsLDA:[ITER] 42,\telapsed time:0.86,\tlog_likelihood:-346108.81\n", "2016-02-10 19:42:40 INFO:GibbsLDA:[ITER] 43,\telapsed time:0.96,\tlog_likelihood:-345780.29\n", "2016-02-10 19:42:41 INFO:GibbsLDA:[ITER] 44,\telapsed time:0.91,\tlog_likelihood:-345771.55\n", "2016-02-10 
19:42:42 INFO:GibbsLDA:[ITER] 45,\telapsed time:0.85,\tlog_likelihood:-345758.19\n", "2016-02-10 19:42:43 INFO:GibbsLDA:[ITER] 46,\telapsed time:0.95,\tlog_likelihood:-345798.00\n", "2016-02-10 19:42:44 INFO:GibbsLDA:[ITER] 47,\telapsed time:0.95,\tlog_likelihood:-345794.09\n", "2016-02-10 19:42:45 INFO:GibbsLDA:[ITER] 48,\telapsed time:0.95,\tlog_likelihood:-345631.54\n", "2016-02-10 19:42:46 INFO:GibbsLDA:[ITER] 49,\telapsed time:0.89,\tlog_likelihood:-345489.19\n", "2016-02-10 19:42:47 INFO:GibbsLDA:[ITER] 50,\telapsed time:1.01,\tlog_likelihood:-345386.81\n", "2016-02-10 19:42:48 INFO:GibbsLDA:[ITER] 51,\telapsed time:0.93,\tlog_likelihood:-345105.51\n", "2016-02-10 19:42:49 INFO:GibbsLDA:[ITER] 52,\telapsed time:0.95,\tlog_likelihood:-345095.54\n", "2016-02-10 19:42:49 INFO:GibbsLDA:[ITER] 53,\telapsed time:0.89,\tlog_likelihood:-344779.87\n", "2016-02-10 19:42:50 INFO:GibbsLDA:[ITER] 54,\telapsed time:0.91,\tlog_likelihood:-344897.46\n", "2016-02-10 19:42:51 INFO:GibbsLDA:[ITER] 55,\telapsed time:0.91,\tlog_likelihood:-344546.15\n", "2016-02-10 19:42:52 INFO:GibbsLDA:[ITER] 56,\telapsed time:0.88,\tlog_likelihood:-344541.70\n", "2016-02-10 19:42:53 INFO:GibbsLDA:[ITER] 57,\telapsed time:0.89,\tlog_likelihood:-344516.72\n", "2016-02-10 19:42:54 INFO:GibbsLDA:[ITER] 58,\telapsed time:0.94,\tlog_likelihood:-344702.70\n", "2016-02-10 19:42:55 INFO:GibbsLDA:[ITER] 59,\telapsed time:0.94,\tlog_likelihood:-344196.74\n", "2016-02-10 19:42:56 INFO:GibbsLDA:[ITER] 60,\telapsed time:0.90,\tlog_likelihood:-344231.88\n", "2016-02-10 19:42:57 INFO:GibbsLDA:[ITER] 61,\telapsed time:0.89,\tlog_likelihood:-344436.79\n", "2016-02-10 19:42:58 INFO:GibbsLDA:[ITER] 62,\telapsed time:0.88,\tlog_likelihood:-343805.88\n", "2016-02-10 19:42:59 INFO:GibbsLDA:[ITER] 63,\telapsed time:0.91,\tlog_likelihood:-344083.66\n", "2016-02-10 19:43:00 INFO:GibbsLDA:[ITER] 64,\telapsed time:0.91,\tlog_likelihood:-344131.20\n", "2016-02-10 19:43:00 INFO:GibbsLDA:[ITER] 65,\telapsed 
time:0.87,\tlog_likelihood:-344327.27\n", "2016-02-10 19:43:01 INFO:GibbsLDA:[ITER] 66,\telapsed time:0.85,\tlog_likelihood:-343887.94\n", "2016-02-10 19:43:02 INFO:GibbsLDA:[ITER] 67,\telapsed time:0.84,\tlog_likelihood:-343762.63\n", "2016-02-10 19:43:03 INFO:GibbsLDA:[ITER] 68,\telapsed time:0.93,\tlog_likelihood:-343623.01\n", "2016-02-10 19:43:04 INFO:GibbsLDA:[ITER] 69,\telapsed time:0.89,\tlog_likelihood:-343498.40\n", "2016-02-10 19:43:05 INFO:GibbsLDA:[ITER] 70,\telapsed time:0.87,\tlog_likelihood:-343147.74\n", "2016-02-10 19:43:06 INFO:GibbsLDA:[ITER] 71,\telapsed time:0.85,\tlog_likelihood:-343025.72\n", "2016-02-10 19:43:07 INFO:GibbsLDA:[ITER] 72,\telapsed time:0.87,\tlog_likelihood:-343189.09\n", "2016-02-10 19:43:08 INFO:GibbsLDA:[ITER] 73,\telapsed time:0.89,\tlog_likelihood:-343104.90\n", "2016-02-10 19:43:09 INFO:GibbsLDA:[ITER] 74,\telapsed time:0.88,\tlog_likelihood:-343020.70\n", "2016-02-10 19:43:09 INFO:GibbsLDA:[ITER] 75,\telapsed time:0.91,\tlog_likelihood:-342822.27\n", "2016-02-10 19:43:10 INFO:GibbsLDA:[ITER] 76,\telapsed time:0.86,\tlog_likelihood:-342671.10\n", "2016-02-10 19:43:11 INFO:GibbsLDA:[ITER] 77,\telapsed time:0.87,\tlog_likelihood:-342537.95\n", "2016-02-10 19:43:12 INFO:GibbsLDA:[ITER] 78,\telapsed time:0.88,\tlog_likelihood:-342711.56\n", "2016-02-10 19:43:13 INFO:GibbsLDA:[ITER] 79,\telapsed time:0.87,\tlog_likelihood:-342544.57\n", "2016-02-10 19:43:14 INFO:GibbsLDA:[ITER] 80,\telapsed time:0.88,\tlog_likelihood:-342719.10\n", "2016-02-10 19:43:15 INFO:GibbsLDA:[ITER] 81,\telapsed time:0.92,\tlog_likelihood:-342605.74\n", "2016-02-10 19:43:16 INFO:GibbsLDA:[ITER] 82,\telapsed time:0.87,\tlog_likelihood:-342609.81\n", "2016-02-10 19:43:17 INFO:GibbsLDA:[ITER] 83,\telapsed time:0.90,\tlog_likelihood:-342740.90\n", "2016-02-10 19:43:18 INFO:GibbsLDA:[ITER] 84,\telapsed time:0.89,\tlog_likelihood:-342668.54\n", "2016-02-10 19:43:18 INFO:GibbsLDA:[ITER] 85,\telapsed time:0.89,\tlog_likelihood:-342678.21\n", "2016-02-10 
19:43:19 INFO:GibbsLDA:[ITER] 86,\telapsed time:0.87,\tlog_likelihood:-342797.02\n", "2016-02-10 19:43:20 INFO:GibbsLDA:[ITER] 87,\telapsed time:0.92,\tlog_likelihood:-342652.20\n", "2016-02-10 19:43:21 INFO:GibbsLDA:[ITER] 88,\telapsed time:0.89,\tlog_likelihood:-342328.18\n", "2016-02-10 19:43:22 INFO:GibbsLDA:[ITER] 89,\telapsed time:0.88,\tlog_likelihood:-342428.68\n", "2016-02-10 19:43:23 INFO:GibbsLDA:[ITER] 90,\telapsed time:0.90,\tlog_likelihood:-342853.29\n", "2016-02-10 19:43:24 INFO:GibbsLDA:[ITER] 91,\telapsed time:0.87,\tlog_likelihood:-342336.00\n", "2016-02-10 19:43:25 INFO:GibbsLDA:[ITER] 92,\telapsed time:0.89,\tlog_likelihood:-342357.74\n", "2016-02-10 19:43:26 INFO:GibbsLDA:[ITER] 93,\telapsed time:0.89,\tlog_likelihood:-341976.18\n", "2016-02-10 19:43:27 INFO:GibbsLDA:[ITER] 94,\telapsed time:0.93,\tlog_likelihood:-342270.78\n", "2016-02-10 19:43:28 INFO:GibbsLDA:[ITER] 95,\telapsed time:0.94,\tlog_likelihood:-342271.96\n", "2016-02-10 19:43:29 INFO:GibbsLDA:[ITER] 96,\telapsed time:0.94,\tlog_likelihood:-342092.68\n", "2016-02-10 19:43:30 INFO:GibbsLDA:[ITER] 97,\telapsed time:0.92,\tlog_likelihood:-341932.06\n", "2016-02-10 19:43:30 INFO:GibbsLDA:[ITER] 98,\telapsed time:0.90,\tlog_likelihood:-342061.92\n", "2016-02-10 19:43:31 INFO:GibbsLDA:[ITER] 99,\telapsed time:0.89,\tlog_likelihood:-341768.40\n" ] } ], "source": [ "max_iter=100\n", "n_topic=10\n", "\n", "logger = logging.getLogger('GibbsLDA')\n", "logger.propagate = False\n", "\n", "model = GibbsLDA(n_doc, len(voca), n_topic)\n", "model.fit(docs, max_iter=max_iter)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Print top 10 probability words for each topic" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic 0 : market,bank,week,rate,rose,money,two,rise,three,fed\n", "Topic 1 : quarter,first,april,record,earnings,dividend,share,prior,may,one\n", "Topic 2 : 
oil,dome,one,debt,gas,price,plan,new,would,energy\n", "Topic 3 : nil,stocks,production,total,end,use,start,soybean,supply,demand\n", "Topic 4 : last,month,wheat,crop,grain,department,sugar,april,week,export\n", "Topic 5 : loss,profit,corp,note,tax,chemical,gain,quarter,nine,operating\n", "Topic 6 : trade,government,last,also,deficit,would,surplus,foreign,canada,industry\n", "Topic 7 : japan,would,could,economic,japanese,market,west,growth,meeting,policy\n", "Topic 8 : dollar,bank,yen,interest,exchange,term,days,currency,rate,current\n", "Topic 9 : share,offer,stock,corp,acquisition,would,group,common,also,cash\n" ] } ], "source": [ "for ti in range(n_topic):\n", " top_words = get_top_words(model.TW, voca, ti, n_words=10)\n", " print('Topic', ti ,': ', ','.join(top_words))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Inference through Variational Bayes" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2016-02-10 19:43:32 INFO:vbLDA:[ITER] 0,\telapsed time:0.79,\tELBO:-478629.24\n", "2016-02-10 19:43:33 INFO:vbLDA:[ITER] 1,\telapsed time:0.78,\tELBO:-424352.68\n", "2016-02-10 19:43:34 INFO:vbLDA:[ITER] 2,\telapsed time:0.79,\tELBO:-380711.73\n", "2016-02-10 19:43:34 INFO:vbLDA:[ITER] 3,\telapsed time:0.76,\tELBO:-364218.72\n", "2016-02-10 19:43:35 INFO:vbLDA:[ITER] 4,\telapsed time:0.72,\tELBO:-357506.75\n", "2016-02-10 19:43:36 INFO:vbLDA:[ITER] 5,\telapsed time:0.69,\tELBO:-354117.34\n", "2016-02-10 19:43:37 INFO:vbLDA:[ITER] 6,\telapsed time:0.69,\tELBO:-352265.21\n", "2016-02-10 19:43:37 INFO:vbLDA:[ITER] 7,\telapsed time:0.69,\tELBO:-351168.75\n", "2016-02-10 19:43:38 INFO:vbLDA:[ITER] 8,\telapsed time:0.65,\tELBO:-350393.52\n", "2016-02-10 19:43:39 INFO:vbLDA:[ITER] 9,\telapsed time:0.65,\tELBO:-349864.68\n", "2016-02-10 19:43:39 INFO:vbLDA:[ITER] 10,\telapsed time:0.64,\tELBO:-349479.59\n", "2016-02-10 19:43:40 INFO:vbLDA:[ITER] 
11,\telapsed time:0.66,\tELBO:-349231.45\n", "2016-02-10 19:43:40 INFO:vbLDA:[ITER] 12,\telapsed time:0.64,\tELBO:-349048.99\n", "2016-02-10 19:43:41 INFO:vbLDA:[ITER] 13,\telapsed time:0.64,\tELBO:-348919.67\n", "2016-02-10 19:43:42 INFO:vbLDA:[ITER] 14,\telapsed time:0.63,\tELBO:-348796.75\n", "2016-02-10 19:43:42 INFO:vbLDA:[ITER] 15,\telapsed time:0.65,\tELBO:-348698.18\n", "2016-02-10 19:43:43 INFO:vbLDA:[ITER] 16,\telapsed time:0.65,\tELBO:-348608.65\n", "2016-02-10 19:43:44 INFO:vbLDA:[ITER] 17,\telapsed time:0.64,\tELBO:-348538.82\n", "2016-02-10 19:43:44 INFO:vbLDA:[ITER] 18,\telapsed time:0.62,\tELBO:-348471.38\n", "2016-02-10 19:43:45 INFO:vbLDA:[ITER] 19,\telapsed time:0.63,\tELBO:-348418.05\n", "2016-02-10 19:43:46 INFO:vbLDA:[ITER] 20,\telapsed time:0.62,\tELBO:-348372.82\n", "2016-02-10 19:43:46 INFO:vbLDA:[ITER] 21,\telapsed time:0.63,\tELBO:-348327.48\n", "2016-02-10 19:43:47 INFO:vbLDA:[ITER] 22,\telapsed time:0.63,\tELBO:-348286.69\n", "2016-02-10 19:43:47 INFO:vbLDA:[ITER] 23,\telapsed time:0.63,\tELBO:-348257.43\n", "2016-02-10 19:43:48 INFO:vbLDA:[ITER] 24,\telapsed time:0.61,\tELBO:-348232.60\n", "2016-02-10 19:43:49 INFO:vbLDA:[ITER] 25,\telapsed time:0.63,\tELBO:-348203.76\n", "2016-02-10 19:43:49 INFO:vbLDA:[ITER] 26,\telapsed time:0.62,\tELBO:-348182.56\n", "2016-02-10 19:43:50 INFO:vbLDA:[ITER] 27,\telapsed time:0.62,\tELBO:-348160.58\n", "2016-02-10 19:43:51 INFO:vbLDA:[ITER] 28,\telapsed time:0.64,\tELBO:-348144.50\n", "2016-02-10 19:43:51 INFO:vbLDA:[ITER] 29,\telapsed time:0.65,\tELBO:-348122.57\n", "2016-02-10 19:43:52 INFO:vbLDA:[ITER] 30,\telapsed time:0.66,\tELBO:-348109.34\n", "2016-02-10 19:43:53 INFO:vbLDA:[ITER] 31,\telapsed time:0.63,\tELBO:-348098.76\n", "2016-02-10 19:43:53 INFO:vbLDA:[ITER] 32,\telapsed time:0.62,\tELBO:-348084.17\n", "2016-02-10 19:43:54 INFO:vbLDA:[ITER] 33,\telapsed time:0.61,\tELBO:-348071.97\n", "2016-02-10 19:43:54 INFO:vbLDA:[ITER] 34,\telapsed time:0.63,\tELBO:-348059.91\n", "2016-02-10 19:43:55 
INFO:vbLDA:[ITER] 35,\telapsed time:0.62,\tELBO:-348051.82\n", "2016-02-10 19:43:56 INFO:vbLDA:[ITER] 36,\telapsed time:0.65,\tELBO:-348045.39\n", "2016-02-10 19:43:56 INFO:vbLDA:[ITER] 37,\telapsed time:0.60,\tELBO:-348034.56\n", "2016-02-10 19:43:57 INFO:vbLDA:[ITER] 38,\telapsed time:0.63,\tELBO:-348025.53\n", "2016-02-10 19:43:58 INFO:vbLDA:[ITER] 39,\telapsed time:0.61,\tELBO:-348018.32\n", "2016-02-10 19:43:58 INFO:vbLDA:[ITER] 40,\telapsed time:0.60,\tELBO:-348011.50\n", "2016-02-10 19:43:59 INFO:vbLDA:[ITER] 41,\telapsed time:0.62,\tELBO:-348008.20\n", "2016-02-10 19:43:59 INFO:vbLDA:[ITER] 42,\telapsed time:0.61,\tELBO:-348007.99\n", "2016-02-10 19:44:00 INFO:vbLDA:[ITER] 43,\telapsed time:0.62,\tELBO:-348007.58\n", "2016-02-10 19:44:01 INFO:vbLDA:[ITER] 44,\telapsed time:0.61,\tELBO:-348006.46\n", "2016-02-10 19:44:01 INFO:vbLDA:[ITER] 45,\telapsed time:0.63,\tELBO:-348003.09\n", "2016-02-10 19:44:02 INFO:vbLDA:[ITER] 46,\telapsed time:0.61,\tELBO:-347999.54\n", "2016-02-10 19:44:02 INFO:vbLDA:[ITER] 47,\telapsed time:0.61,\tELBO:-347995.14\n", "2016-02-10 19:44:03 INFO:vbLDA:[ITER] 48,\telapsed time:0.60,\tELBO:-347992.98\n", "2016-02-10 19:44:04 INFO:vbLDA:[ITER] 49,\telapsed time:0.59,\tELBO:-347990.23\n", "2016-02-10 19:44:04 INFO:vbLDA:[ITER] 50,\telapsed time:0.59,\tELBO:-347986.13\n", "2016-02-10 19:44:05 INFO:vbLDA:[ITER] 51,\telapsed time:0.59,\tELBO:-347984.36\n", "2016-02-10 19:44:05 INFO:vbLDA:[ITER] 52,\telapsed time:0.60,\tELBO:-347981.83\n", "2016-02-10 19:44:06 INFO:vbLDA:[ITER] 53,\telapsed time:0.59,\tELBO:-347980.00\n", "2016-02-10 19:44:07 INFO:vbLDA:[ITER] 54,\telapsed time:0.60,\tELBO:-347975.99\n", "2016-02-10 19:44:07 INFO:vbLDA:[ITER] 55,\telapsed time:0.58,\tELBO:-347973.46\n", "2016-02-10 19:44:08 INFO:vbLDA:[ITER] 56,\telapsed time:0.60,\tELBO:-347970.75\n", "2016-02-10 19:44:08 INFO:vbLDA:[ITER] 57,\telapsed time:0.59,\tELBO:-347970.34\n", "2016-02-10 19:44:09 INFO:vbLDA:[ITER] 58,\telapsed time:0.60,\tELBO:-347970.31\n", 
"2016-02-10 19:44:10 INFO:vbLDA:[ITER] 59,\telapsed time:0.59,\tELBO:-347970.25\n", "2016-02-10 19:44:10 INFO:vbLDA:[ITER] 60,\telapsed time:0.60,\tELBO:-347970.11\n", "2016-02-10 19:44:11 INFO:vbLDA:[ITER] 61,\telapsed time:0.65,\tELBO:-347969.67\n", "2016-02-10 19:44:12 INFO:vbLDA:[ITER] 62,\telapsed time:0.69,\tELBO:-347968.08\n", "2016-02-10 19:44:12 INFO:vbLDA:[ITER] 63,\telapsed time:0.67,\tELBO:-347967.16\n", "2016-02-10 19:44:13 INFO:vbLDA:[ITER] 64,\telapsed time:0.65,\tELBO:-347966.72\n", "2016-02-10 19:44:13 INFO:vbLDA:[ITER] 65,\telapsed time:0.63,\tELBO:-347965.37\n", "2016-02-10 19:44:14 INFO:vbLDA:[ITER] 66,\telapsed time:0.62,\tELBO:-347964.13\n", "2016-02-10 19:44:15 INFO:vbLDA:[ITER] 67,\telapsed time:0.62,\tELBO:-347964.13\n", "2016-02-10 19:44:15 INFO:vbLDA:[ITER] 68,\telapsed time:0.63,\tELBO:-347964.12\n", "2016-02-10 19:44:16 INFO:vbLDA:[ITER] 69,\telapsed time:0.63,\tELBO:-347964.11\n", "2016-02-10 19:44:17 INFO:vbLDA:[ITER] 70,\telapsed time:0.65,\tELBO:-347964.11\n", "2016-02-10 19:44:17 INFO:vbLDA:[ITER] 71,\telapsed time:0.65,\tELBO:-347964.10\n", "2016-02-10 19:44:18 INFO:vbLDA:[ITER] 72,\telapsed time:0.64,\tELBO:-347964.08\n", "2016-02-10 19:44:19 INFO:vbLDA:[ITER] 73,\telapsed time:0.62,\tELBO:-347964.06\n", "2016-02-10 19:44:19 INFO:vbLDA:[ITER] 74,\telapsed time:0.64,\tELBO:-347964.02\n", "2016-02-10 19:44:20 INFO:vbLDA:[ITER] 75,\telapsed time:0.62,\tELBO:-347963.94\n", "2016-02-10 19:44:20 INFO:vbLDA:[ITER] 76,\telapsed time:0.62,\tELBO:-347963.75\n", "2016-02-10 19:44:21 INFO:vbLDA:[ITER] 77,\telapsed time:0.62,\tELBO:-347963.15\n", "2016-02-10 19:44:22 INFO:vbLDA:[ITER] 78,\telapsed time:0.62,\tELBO:-347961.36\n", "2016-02-10 19:44:22 INFO:vbLDA:[ITER] 79,\telapsed time:0.64,\tELBO:-347960.89\n", "2016-02-10 19:44:23 INFO:vbLDA:[ITER] 80,\telapsed time:0.62,\tELBO:-347960.88\n", "2016-02-10 19:44:24 INFO:vbLDA:[ITER] 81,\telapsed time:0.61,\tELBO:-347960.86\n", "2016-02-10 19:44:24 INFO:vbLDA:[ITER] 82,\telapsed 
time:0.59,\tELBO:-347960.78\n", "2016-02-10 19:44:25 INFO:vbLDA:[ITER] 83,\telapsed time:0.64,\tELBO:-347960.45\n", "2016-02-10 19:44:25 INFO:vbLDA:[ITER] 84,\telapsed time:0.64,\tELBO:-347959.02\n", "2016-02-10 19:44:26 INFO:vbLDA:[ITER] 85,\telapsed time:0.64,\tELBO:-347958.29\n", "2016-02-10 19:44:27 INFO:vbLDA:[ITER] 86,\telapsed time:0.69,\tELBO:-347958.28\n", "2016-02-10 19:44:27 INFO:vbLDA:[ITER] 87,\telapsed time:0.64,\tELBO:-347958.28\n", "2016-02-10 19:44:28 INFO:vbLDA:[ITER] 88,\telapsed time:0.62,\tELBO:-347958.28\n", "2016-02-10 19:44:29 INFO:vbLDA:[ITER] 89,\telapsed time:0.61,\tELBO:-347958.27\n", "2016-02-10 19:44:29 INFO:vbLDA:[ITER] 90,\telapsed time:0.59,\tELBO:-347958.27\n", "2016-02-10 19:44:30 INFO:vbLDA:[ITER] 91,\telapsed time:0.59,\tELBO:-347958.26\n", "2016-02-10 19:44:30 INFO:vbLDA:[ITER] 92,\telapsed time:0.60,\tELBO:-347958.26\n", "2016-02-10 19:44:31 INFO:vbLDA:[ITER] 93,\telapsed time:0.63,\tELBO:-347958.25\n", "2016-02-10 19:44:32 INFO:vbLDA:[ITER] 94,\telapsed time:0.65,\tELBO:-347958.24\n", "2016-02-10 19:44:32 INFO:vbLDA:[ITER] 95,\telapsed time:0.66,\tELBO:-347958.23\n", "2016-02-10 19:44:33 INFO:vbLDA:[ITER] 96,\telapsed time:0.61,\tELBO:-347958.23\n", "2016-02-10 19:44:34 INFO:vbLDA:[ITER] 97,\telapsed time:0.59,\tELBO:-347958.22\n", "2016-02-10 19:44:34 INFO:vbLDA:[ITER] 98,\telapsed time:0.58,\tELBO:-347958.20\n", "2016-02-10 19:44:35 INFO:vbLDA:[ITER] 99,\telapsed time:0.59,\tELBO:-347958.19\n" ] } ], "source": [ "logger = logging.getLogger('vbLDA')\n", "logger.propagate = False\n", "\n", "vbmodel = vbLDA(n_doc, n_voca, n_topic)\n", "vbmodel.fit(doc_ids, doc_cnt, max_iter=max_iter)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Print top 10 probability words for each topic" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic 0 : share,stock,profit,would,offer,corp,earnings,per,dividend,first\n", 
"Topic 1 : fed,price,trade,may,two,april,market,reserve,would,japan\n", "Topic 2 : dollar,would,one,foreign,growth,last,trade,economic,week,rise\n", "Topic 3 : loss,profit,corp,note,quarter,national,share,gain,one,first\n", "Topic 4 : bank,market,week,days,rate,money,new,april,today,day\n", "Topic 5 : quarter,first,tax,share,income,april,bank,dividend,record,may\n", "Topic 6 : oil,quarter,first,gas,march,gold,february,price,earnings,last\n", "Topic 7 : japan,dollar,trade,would,yen,dome,japanese,market,also,agreement\n", "Topic 8 : nil,last,stocks,month,production,total,grain,crop,wheat,end\n", "Topic 9 : share,corp,april,wheat,price,new,group,would,exchange,department\n" ] } ], "source": [ "for ti in range(n_topic):\n", " top_words = get_top_words(vbmodel._lambda, voca, ti, n_words=10)\n", " print('Topic', ti ,': ', ','.join(top_words))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" }, "toc": { "toc_cell": true, "toc_number_sections": true, "toc_threshold": 4, "toc_window_display": true }, "toc_position": { "left": "1120px", "right": "20px", "top": "120px", "width": "299px" } }, "nbformat": 4, "nbformat_minor": 0 }