{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": "true" }, "source": [ "# Table of Contents\n", "

1 - Supervised Topic Model
1.1 - Read and tokenize movie review dataset
1.2 - Infer topics with SupervisedLDA
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Supervised Topic Model " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook shows an example of a supervised topic model with the dataset provided by [Bo Pang](https://www.cs.cornell.edu/people/pabo/movie-review-data/)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "import logging\n", "\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from ptm import GibbsSupervisedLDA\n", "from ptm.nltk_corpus import get_ids_cnt\n", "from ptm.utils import convert_cnt_to_list, get_top_words\n", "\n", "%matplotlib inline \n", "\n", "logger = logging.getLogger('GibbsSupervisedLDA')\n", "logger.propagate = False" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read and tokenize movie review dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "datafolder = '../data/scaledata/Dennis+Schwartz/'\n", "rating_file = os.path.join(datafolder, 'rating.Dennis+Schwartz')\n", "review_file = os.path.join(datafolder, 'subj.Dennis+Schwartz')\n", "with open(rating_file, 'r') as f:\n", " ratings = np.array([float(line.strip()) for line in f.readlines()])\n", "with open(review_file, 'r') as f:\n", " reviews = [line for line in f.readlines()]\n", "\n", "voca, word_ids, word_cnt = get_ids_cnt(reviews)\n", "corpus = convert_cnt_to_list(word_ids, word_cnt)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "num doc 1027 num_voca 10526\n" ] }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXwAAAEACAYAAACwB81wAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEfZJREFUeJzt3XmsXGd9xvHvAyYRW01Y4tvGkFBCIUHQkBKTCiQGaENS\ntXHEH2kAVSylRaIsKqBio1Z2VwgSUamqVCpbDYKmKbRJ2E0II5qSBchCwG5qaG2Ciy8QlhJFIIf8\n+scch+nlJnfuMj5jv9+PNMqZc8/yeG7muee+c849qSokSUe/+/UdQJJ0eFj4ktQIC1+SGmHhS1Ij\nLHxJaoSFL0mNWLLwkxyb5LokNyb5SpK/6uYfl2RnkluTfDLJ+rF1tibZk2R3krOm+Q+QJE0mk5yH\nn+RBVXVnkvsD/w68HjgXuL2q3prkjcBxVbUlyanA+4EzgI3AlcDjyxP+JalXEw3pVNWd3eSx3Trf\nAzYDO7r5O4DzuulzgUuq6q6q2gvsATatVWBJ0spMVPhJ7pfkRuAAMKyqXcCGqpoHqKoDwPHd4icA\nt42tvr+bJ0nq0bpJFqqqu4GnJvk54JNJBsDCIRqHbCRphk1U+IdU1f8m+RjwNGA+yYaqmk8yB3yr\nW2w/8Oix1TZ28/6fJP6AkKQVqKqsZL1JztJ55KEzcJI8EPh14EbgCuAl3WIvBi7vpq8ALkhyTJLH\nAicD199L6Jl7bNu2rfcMZjJTi7nMNNljNSY5wv95YEeSMPoB8b6q+nQ3pn9pkpcB+4DzuxLfleRS\nYBdwEHhlrTalJGnVliz8qroFOH2R+d8Ffu1e1nkz8OZVp5MkrRmvtF1gMBj0HeFnmGkyZprcLOYy\n0/RNdOHVVHacONIjScuUhJrWh7aSpKODhS9JjbDwJakRFr4kNcLCl6RGWPiS1AgLX5IaYeFLUiMs\nfElqhIUvSY2w8CWpERa+JDXCwpekRlj4ktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREWviQ1wsKX\npEZY+JLUCAtfkhph4UtSIyx8SWqEhS9JjViy8JNsTHJVkq8kuSXJq7v525J8I8kN3ePssXW2JtmT\nZHeSs6b5D5AkTSZVdd8LJHPAXFXdlOQhwBeBzcBvAz+sqosWLH8K8AHgDGAjcCXw+FqwoyQLZ0mS\nlpCEqspK1l3yCL+qDlTVTd30HcBu4IRD+15klc3AJVV1V1XtBfYAm1YSTpK0dpY1hp/kJOA04Lpu\n1quS3JTknUnWd/NOAG4bW20/P/0BIR1R5uZOIkmvj7m5k/p+GXSUmLjwu+GcDwKv7Y70LwZ+sapO\nAw4Ab5tORKk/8/P7gOr1Mcogrd66SRZKso5R2b+vqi4HqKpvjy3yDuDD3fR+4NFjX9vYzfsZ27dv\nv2d6MBgwGAwmjC1JbRgOhwyHwzXZ1pIf2gIkeS/wnap63di8uao60E3/IXBGVb0wyanA+4GnMxrK\n+RR+aKsjVBJGR9q9psD3ig5ZzYe2Sx7hJ3kG8CLgliQ3Mvq//03AC5OcBtwN7AVeAVBVu5JcCuwC\nDgKvtNklqX8THeFPZcce4esI4BG+Zs1UT8uUJB0dLHxJaoSFL0mNsPAlqREWviQ1wsKXpEZY+JLU\nCAtfkhph4UtSIyx8SWqEhS9JjbDwJakRFr4kNcLCl6RGWPiS1AgLX5IaYeFLUiMsfElqhIUvSY2w\n8CWpERa+JDXCwpekRlj4ktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREWviQ1YsnCT7IxyVVJvpLk\nliSv6eYfl2RnkluTfDLJ+rF1tibZk2R3krOm+Q+QJE0mVXXfCyRzwFxV3ZTkIcAXgc3AS4Hbq+qt\nSd4IHFdVW5KcCrwfOAPYCFwJPL4W7CjJwlnSzEkC9P3/afC9okOSUFVZybpLHuFX1YGquqmbvgPY\nzajINwM7usV2AOd10+cCl1TVXVW1F9gDbFpJOEnS2lnWGH6Sk
4DTgGuBDVU1D6MfCsDx3WInALeN\nrba/mydJ6tG6SRfshnM+CLy2qu5IsvB3zGX/zrl9+/Z7pgeDAYPBYLmbkKSj2nA4ZDgcrsm2lhzD\nB0iyDvgI8PGqens3bzcwqKr5bpz/M1V1SpItQFXVhd1ynwC2VdV1C7bpGL5mnmP4mjVTHcPvvBvY\ndajsO1cAL+mmXwxcPjb/giTHJHkscDJw/UrCSZLWziRn6TwD+CxwC6NDnQLexKjELwUeDewDzq+q\n73frbAV+FzjIaAho5yLb9QhfM88jfM2a1RzhTzSkMw0Wvo4EFr5mzeEY0pEkHeEsfElqhIUvSY2w\n8CWpERNfeCUdbnNzJzE/v6/vGNJRw7N0NLNm5QyZWcjge0WHeJaOJGlJFr4kNcLCl6RGWPiS1AgL\nX5IaYeFLUiMsfElqhIUvSY3wSltp5h3bXYTWnw0bTuTAgb29ZtDqeaWtZpZX2s5WBt+vs8ErbSVJ\nS7LwJakRFr4kNcLCl6RGWPiS1AgLX5IaYeFLUiMsfElqhIUvSY2w8CWpERa+JDXCwpekRixZ+Ene\nlWQ+yZfG5m1L8o0kN3SPs8e+tjXJniS7k5w1reCSpOWZ5Aj/PcDzFpl/UVWd3j0+AZDkFOB84BTg\nHODi9P13XSVJwASFX1VXA99b5EuLFflm4JKququq9gJ7gE2rSihJWhOrGcN/VZKbkrwzyfpu3gnA\nbWPL7O/mSZJ6ttI7Xl0M/FlVVZK/AN4GvHy5G9m+ffs904PBgMFgsMI4knR0Gg6HDIfDNdnWRHe8\nSnIi8OGqesp9fS3JFqCq6sLua58AtlXVdYus5x2vdJ+849VsZfD9OhsOxx2vwtiYfZK5sa89H/hy\nN30FcEGSY5I8FjgZuH4lwSRJa2vJIZ0kHwAGwCOSfB3YBjw7yWnA3cBe4BUAVbUryaXALuAg8EoP\n4yVpNngTc80sh3RmK4Pv19ngTcwlSUuy8CWpERa+JDXCwpekRlj4ktSIlV5pq6Pc3NxJzM/v6zuG\npDXkaZlalKdEmmFhBt+vs8HTMiVJS7LwJakRFr4kNcLCl6RGWPiS1AhPy5Q0gWPp+/bUGzacyIED\ne3vNcKTztEwtytMyzTCLGewMT8uUJE3AwpekRlj4ktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREW\nviQ1wsKXpEZY+JLUCAtfkhph4UtSIyx8SWrEkoWf5F1J5pN8aWzecUl2Jrk1ySeTrB/72tYke5Ls\nTnLWtIJLkpZnkiP89wDPWzBvC3BlVT0BuArYCpDkVOB84BTgHODi9H3XBEkSMEHhV9XVwPcWzN4M\n7OimdwDnddPnApdU1V1VtRfYA2xam6iSpNVY6Rj+8VU1D1BVB4Dju/knALeNLbe/mydJ6tla3dN2\nRfcd2759+z3Tg8GAwWCwRnEk6egwHA4ZDodrsq2J7mmb5ETgw1X1lO75bmBQVfNJ5oDPVNUpSbYA\nVVUXdst9AthWVdctsk3vaTvDvKetGWYxg51xeO5pm+5xyBXAS7rpFwOXj82/IMkxSR4LnAxcv5Jg\nkqS1teSQTpIPAAPgEUm+DmwD3gL8c5KXAfsYnZlDVe1KcimwCzgIvNLDeEmaDRMN6Uxlxw7pzDSH\ndMwwixnsjMMzpCNJOsJZ+JLUCAtfkhph4UtSI9bqwitJmrJj6ftPc23YcCIHDuztNcNqeJaOFuVZ\nOmYww+IZ+u4tz9KRJC3JwpekRlj4ktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREWviQ1wsKXpEZY\n+JLUCAtfkhph4UtSIyx8SWqEhS9JjbDwJakRFr4kNcLCl6RGWPiS1AgLX5IaYeFLUiMsfElqxLrV\nrJxkL/AD4G7gYFVtSnIc8E/AicBe4Pyq+sEqc0qSVmm1R/h3A4OqempVbermbQGurKonAFcBW1e5\nD0nSGlht4WeRbWwGdnTTO
4DzVrkPSdIaWG3hF/CpJJ9P8vJu3oaqmgeoqgPA8avchyRpDaxqDB94\nRlV9M8mjgJ1JbmX0Q2DcwueSpB6sqvCr6pvdf7+d5DJgEzCfZENVzSeZA751b+tv3779nunBYMBg\nMFhNHEk66gyHQ4bD4ZpsK1UrOwBP8iDgflV1R5IHAzuBPwWeC3y3qi5M8kbguKrassj6tdJ9a/qS\n0P8vZ2Yww+xl6Lu3klBVWcm6qznC3wD8a5LqtvP+qtqZ5AvApUleBuwDzl/FPiRJa2TFR/ir3rFH\n+DPNI3wzmGHxDH331mqO8L3SVpIaYeFLUiMsfElqhIUvSY2w8CWpERa+JDXCwpekRlj4ktSI1f7x\nNE3B3NxJzM/v6zuGpKOMV9rOIK9yNYMZZjdD373llbaSpCVZ+JLUCAtfkhph4UtSIzxLR5Imdmx3\nUsWRycKXpIn9mFk4U2ilHNKRpEZY+JLUCAtfkhph4UtSIyx8SWqEhS9JjbDwJakRFr4kNcLCl6RG\nWPiS1AgLX5IaYeFLUiOmVvhJzk7yH0n+M8kbp7UfSdJkplL4Se4H/C3wPOBJwAuSPHEa+1prw+Gw\n7wiLGPYdYBHDvgMsYth3gEUM+w5wL4Z9B1jEsO8Aixj2HWBNTesIfxOwp6r2VdVB4BJg85T2taYs\n/EkN+w6wiGHfARYx7DvAvRj2HWARw74DLGLYd4A1Na2/h38CcNvY828w+iEw0y677DKuueYaLrro\not4yPOpRj+pt35KObt4AZcwFF7yIH//4Tnbu3Nl3FElac6la+7u3JDkT2F5VZ3fPtwBVVReOLdP3\nbWMk6YhUVSu67dW0Cv/+wK3Ac4FvAtcDL6iq3Wu+M0nSRKYypFNVP0nyKmAnow+G32XZS1K/pnKE\nL0maPVO/0naSC7CS/E2SPUluSnJa35mSPCHJ55L8KMnrpp1nwkwvTHJz97g6yZNnJNe5XaYbk3wh\nyXP6zjS23BlJDiZ5ft+ZkjwryfeT3NA9/rjvTN0yg+579+Ukn+k7U5I3dHluSHJLkruSPGwGcj0i\nyce7jrolyUtmINPDkvxL9/67NsmpS260qqb2YPQD5avAicADgJuAJy5Y5hzgo93004FrZyDTI4Ff\nAf4ceN008ywj05nA+m767Gm/TsvI9aCx6ScDX+0709hynwY+Ajy/70zAs4Arpv09W2am9cBXgBO6\n54/sO9OC5X8TuHJGXqttwJsPvU7A7cC6njO9FfiTbvoJk7xW0z7Cn+QCrM3AewGq6jpgfZINfWaq\nqu9U1ReBu6aYY7mZrq2qH3RPr2V0rcMs5Lpz7OlDgO/0nanzauCDwLemnGc5mVZ0ZsUUM70Q+FBV\n7YfR//czkGncC4B/nHKmSXMdAB7aTT8UuL2qptkPk2Q6FbgKoKpuBU5Kcp8X8ky78Be7AGthUS1c\nZv8iyxzuTIfbcjO9HPj4VBONTJQryXlJdgMfA17Td6YkvwCcV1V/x+Ep2Um/f7/aDQl8dKJfv6ef\n6ZeAhyf5TJLPJ/mdGcgEQJIHMvpN9kNTzjRprncAT0ryP8DNwGtnINPNwPMBkmwCHgNsvK+NeuHV\nESbJs4GXAs/sO8shVXUZcFmSZwLvY/TrZZ/+Ghgf8zycR9b35ovAY6rqziTnAJcxKtw+rQNOB54D\nPBi4Jsk1VfXVfmMB8FvA1VX1/b6DdLYCN1fVs5M8DvhUkqdU1R09ZnoL8PYkNwC3ADcCP7mvFaZd\n+PsZ/dQ5ZGM3b+Eyj15imcOd6XCbKFOSpwB/D5xdVd+blVyHVNXVSdYleURV3d5jpqcBlyQJo/HW\nc5IcrKor+so0XgxV9fEkFyd5eFV9t69MjI4av1NVPwJ+lOSzwC8zGjvuK9MhF3B4hnNgslzPAP4S\noKq+luS/gScCX+grU1X9EHjZoeddpv+6z61O+cOQ+/PTDx6OYfTBwykLlvkNfvqh7ZlM/0P
bJTON\nLbsNeP008yzjdXoMsAc4c9p5lpnrcWPTpwNf6zvTguXfw/Q/tJ3kddowNr0J2DsDmZ4IfKpb9kGM\njhJP7ft7x+jD5NuBB07zNVrma/U2YNuh7yWj4ZaH95xpPfCAbvr3gH9YcruH4cU8m9FVt3uALd28\nVwC/P7bM33b/uJuB0/vONPYN/T7wXeDrwEN6zvSO7k1wA6Nf3a6f9us0Ya4/Ar7c5fo34Gl9Z1qw\n7LuZcuFP+Dr9Qfc63Qh8Dnh635m6529gdKbOl4BXz0imFwMfmHaWZX7/Hgl8uOuoLzH6ywF9Zzqz\n+/puRicorF9qm154JUmN8BaHktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREWviQ1wsKXpEb8H8B6\n+LYT4HUJAAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "max rating 0.9 \tmin rating 0.1\n" ] } ], "source": [ "n_doc = len(corpus)\n", "n_voca = voca.size\n", "print('num doc', n_doc, 'num_voca', n_voca)\n", "plt.hist(ratings, bins=9)\n", "plt.show()\n", "print('max rating', np.max(ratings), '\\tmin rating', np.min(ratings))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Infer topics with SupervisedLDA" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2016-02-10 19:43:04 INFO:GibbsSupervisedLDA:[ITER] 0,\tMAE:0.07,\tlog_likelihood:-1200104.10\n", "2016-02-10 19:43:13 INFO:GibbsSupervisedLDA:[ITER] 1,\tMAE:0.07,\tlog_likelihood:-1115555.47\n", "2016-02-10 19:43:22 INFO:GibbsSupervisedLDA:[ITER] 2,\tMAE:0.07,\tlog_likelihood:-1073563.29\n", "2016-02-10 19:43:32 INFO:GibbsSupervisedLDA:[ITER] 3,\tMAE:0.07,\tlog_likelihood:-1048468.36\n", "2016-02-10 19:43:41 INFO:GibbsSupervisedLDA:[ITER] 4,\tMAE:0.07,\tlog_likelihood:-1032117.31\n", "2016-02-10 19:43:49 INFO:GibbsSupervisedLDA:[ITER] 5,\tMAE:0.08,\tlog_likelihood:-1020609.06\n", "2016-02-10 19:43:58 INFO:GibbsSupervisedLDA:[ITER] 6,\tMAE:0.08,\tlog_likelihood:-1012588.62\n", "2016-02-10 19:44:07 INFO:GibbsSupervisedLDA:[ITER] 7,\tMAE:0.08,\tlog_likelihood:-1006078.19\n", "2016-02-10 19:44:17 INFO:GibbsSupervisedLDA:[ITER] 8,\tMAE:0.08,\tlog_likelihood:-1000468.34\n", "2016-02-10 19:44:26 
INFO:GibbsSupervisedLDA:[ITER] 9,\tMAE:0.08,\tlog_likelihood:-996064.31\n", "2016-02-10 19:44:35 INFO:GibbsSupervisedLDA:[ITER] 10,\tMAE:0.08,\tlog_likelihood:-991448.64\n", "2016-02-10 19:44:44 INFO:GibbsSupervisedLDA:[ITER] 11,\tMAE:0.09,\tlog_likelihood:-989309.68\n", "2016-02-10 19:44:53 INFO:GibbsSupervisedLDA:[ITER] 12,\tMAE:0.08,\tlog_likelihood:-986026.11\n", "2016-02-10 19:45:02 INFO:GibbsSupervisedLDA:[ITER] 13,\tMAE:0.08,\tlog_likelihood:-983278.74\n", "2016-02-10 19:45:12 INFO:GibbsSupervisedLDA:[ITER] 14,\tMAE:0.08,\tlog_likelihood:-980648.03\n", "2016-02-10 19:45:20 INFO:GibbsSupervisedLDA:[ITER] 15,\tMAE:0.09,\tlog_likelihood:-978593.27\n", "2016-02-10 19:45:29 INFO:GibbsSupervisedLDA:[ITER] 16,\tMAE:0.09,\tlog_likelihood:-977028.47\n", "2016-02-10 19:45:38 INFO:GibbsSupervisedLDA:[ITER] 17,\tMAE:0.09,\tlog_likelihood:-975142.51\n", "2016-02-10 19:45:47 INFO:GibbsSupervisedLDA:[ITER] 18,\tMAE:0.09,\tlog_likelihood:-974758.06\n", "2016-02-10 19:45:57 INFO:GibbsSupervisedLDA:[ITER] 19,\tMAE:0.08,\tlog_likelihood:-972879.10\n", "2016-02-10 19:46:05 INFO:GibbsSupervisedLDA:[ITER] 20,\tMAE:0.09,\tlog_likelihood:-971601.43\n", "2016-02-10 19:46:15 INFO:GibbsSupervisedLDA:[ITER] 21,\tMAE:0.09,\tlog_likelihood:-970929.30\n", "2016-02-10 19:46:24 INFO:GibbsSupervisedLDA:[ITER] 22,\tMAE:0.08,\tlog_likelihood:-969826.58\n", "2016-02-10 19:46:34 INFO:GibbsSupervisedLDA:[ITER] 23,\tMAE:0.08,\tlog_likelihood:-968402.00\n", "2016-02-10 19:46:43 INFO:GibbsSupervisedLDA:[ITER] 24,\tMAE:0.08,\tlog_likelihood:-968001.73\n", "2016-02-10 19:46:52 INFO:GibbsSupervisedLDA:[ITER] 25,\tMAE:0.08,\tlog_likelihood:-967423.17\n", "2016-02-10 19:47:02 INFO:GibbsSupervisedLDA:[ITER] 26,\tMAE:0.08,\tlog_likelihood:-966159.14\n", "2016-02-10 19:47:12 INFO:GibbsSupervisedLDA:[ITER] 27,\tMAE:0.08,\tlog_likelihood:-965307.04\n", "2016-02-10 19:47:21 INFO:GibbsSupervisedLDA:[ITER] 28,\tMAE:0.08,\tlog_likelihood:-964563.86\n", "2016-02-10 19:47:31 INFO:GibbsSupervisedLDA:[ITER] 
29,\tMAE:0.08,\tlog_likelihood:-963570.42\n", "2016-02-10 19:47:42 INFO:GibbsSupervisedLDA:[ITER] 30,\tMAE:0.08,\tlog_likelihood:-963055.59\n", "2016-02-10 19:47:52 INFO:GibbsSupervisedLDA:[ITER] 31,\tMAE:0.08,\tlog_likelihood:-962735.30\n", "2016-02-10 19:48:02 INFO:GibbsSupervisedLDA:[ITER] 32,\tMAE:0.08,\tlog_likelihood:-961139.44\n", "2016-02-10 19:48:12 INFO:GibbsSupervisedLDA:[ITER] 33,\tMAE:0.08,\tlog_likelihood:-960502.03\n", "2016-02-10 19:48:22 INFO:GibbsSupervisedLDA:[ITER] 34,\tMAE:0.08,\tlog_likelihood:-959892.16\n", "2016-02-10 19:48:32 INFO:GibbsSupervisedLDA:[ITER] 35,\tMAE:0.08,\tlog_likelihood:-959321.29\n", "2016-02-10 19:48:42 INFO:GibbsSupervisedLDA:[ITER] 36,\tMAE:0.08,\tlog_likelihood:-959294.60\n", "2016-02-10 19:48:51 INFO:GibbsSupervisedLDA:[ITER] 37,\tMAE:0.08,\tlog_likelihood:-958416.57\n", "2016-02-10 19:49:00 INFO:GibbsSupervisedLDA:[ITER] 38,\tMAE:0.08,\tlog_likelihood:-958708.81\n", "2016-02-10 19:49:09 INFO:GibbsSupervisedLDA:[ITER] 39,\tMAE:0.08,\tlog_likelihood:-958331.49\n", "2016-02-10 19:49:18 INFO:GibbsSupervisedLDA:[ITER] 40,\tMAE:0.08,\tlog_likelihood:-957697.51\n", "2016-02-10 19:49:28 INFO:GibbsSupervisedLDA:[ITER] 41,\tMAE:0.08,\tlog_likelihood:-956916.19\n", "2016-02-10 19:49:37 INFO:GibbsSupervisedLDA:[ITER] 42,\tMAE:0.08,\tlog_likelihood:-955973.23\n", "2016-02-10 19:49:46 INFO:GibbsSupervisedLDA:[ITER] 43,\tMAE:0.08,\tlog_likelihood:-955332.56\n", "2016-02-10 19:49:54 INFO:GibbsSupervisedLDA:[ITER] 44,\tMAE:0.08,\tlog_likelihood:-955296.00\n", "2016-02-10 19:50:03 INFO:GibbsSupervisedLDA:[ITER] 45,\tMAE:0.08,\tlog_likelihood:-955303.42\n", "2016-02-10 19:50:12 INFO:GibbsSupervisedLDA:[ITER] 46,\tMAE:0.08,\tlog_likelihood:-954540.14\n", "2016-02-10 19:50:22 INFO:GibbsSupervisedLDA:[ITER] 47,\tMAE:0.08,\tlog_likelihood:-954232.94\n", "2016-02-10 19:50:32 INFO:GibbsSupervisedLDA:[ITER] 48,\tMAE:0.08,\tlog_likelihood:-952474.63\n", "2016-02-10 19:50:41 INFO:GibbsSupervisedLDA:[ITER] 
49,\tMAE:0.08,\tlog_likelihood:-952360.92\n", "2016-02-10 19:50:51 INFO:GibbsSupervisedLDA:[ITER] 50,\tMAE:0.08,\tlog_likelihood:-953415.19\n", "2016-02-10 19:51:00 INFO:GibbsSupervisedLDA:[ITER] 51,\tMAE:0.08,\tlog_likelihood:-952345.91\n", "2016-02-10 19:51:10 INFO:GibbsSupervisedLDA:[ITER] 52,\tMAE:0.08,\tlog_likelihood:-952259.97\n", "2016-02-10 19:51:20 INFO:GibbsSupervisedLDA:[ITER] 53,\tMAE:0.08,\tlog_likelihood:-952232.09\n", "2016-02-10 19:51:30 INFO:GibbsSupervisedLDA:[ITER] 54,\tMAE:0.08,\tlog_likelihood:-952488.02\n", "2016-02-10 19:51:40 INFO:GibbsSupervisedLDA:[ITER] 55,\tMAE:0.08,\tlog_likelihood:-951400.56\n", "2016-02-10 19:51:49 INFO:GibbsSupervisedLDA:[ITER] 56,\tMAE:0.08,\tlog_likelihood:-951612.91\n", "2016-02-10 19:51:59 INFO:GibbsSupervisedLDA:[ITER] 57,\tMAE:0.08,\tlog_likelihood:-951843.57\n", "2016-02-10 19:52:08 INFO:GibbsSupervisedLDA:[ITER] 58,\tMAE:0.08,\tlog_likelihood:-951312.42\n", "2016-02-10 19:52:18 INFO:GibbsSupervisedLDA:[ITER] 59,\tMAE:0.08,\tlog_likelihood:-951363.82\n", "2016-02-10 19:52:28 INFO:GibbsSupervisedLDA:[ITER] 60,\tMAE:0.08,\tlog_likelihood:-950682.99\n", "2016-02-10 19:52:38 INFO:GibbsSupervisedLDA:[ITER] 61,\tMAE:0.08,\tlog_likelihood:-950734.54\n", "2016-02-10 19:52:47 INFO:GibbsSupervisedLDA:[ITER] 62,\tMAE:0.08,\tlog_likelihood:-950539.13\n", "2016-02-10 19:52:57 INFO:GibbsSupervisedLDA:[ITER] 63,\tMAE:0.08,\tlog_likelihood:-950733.91\n", "2016-02-10 19:53:07 INFO:GibbsSupervisedLDA:[ITER] 64,\tMAE:0.08,\tlog_likelihood:-949927.04\n", "2016-02-10 19:53:16 INFO:GibbsSupervisedLDA:[ITER] 65,\tMAE:0.08,\tlog_likelihood:-949374.16\n", "2016-02-10 19:53:25 INFO:GibbsSupervisedLDA:[ITER] 66,\tMAE:0.08,\tlog_likelihood:-949330.23\n", "2016-02-10 19:53:35 INFO:GibbsSupervisedLDA:[ITER] 67,\tMAE:0.08,\tlog_likelihood:-948267.36\n", "2016-02-10 19:53:44 INFO:GibbsSupervisedLDA:[ITER] 68,\tMAE:0.08,\tlog_likelihood:-949421.16\n", "2016-02-10 19:53:53 INFO:GibbsSupervisedLDA:[ITER] 
69,\tMAE:0.08,\tlog_likelihood:-948148.44\n", "2016-02-10 19:54:02 INFO:GibbsSupervisedLDA:[ITER] 70,\tMAE:0.08,\tlog_likelihood:-947131.47\n", "2016-02-10 19:54:11 INFO:GibbsSupervisedLDA:[ITER] 71,\tMAE:0.08,\tlog_likelihood:-947165.15\n", "2016-02-10 19:54:19 INFO:GibbsSupervisedLDA:[ITER] 72,\tMAE:0.08,\tlog_likelihood:-947004.44\n", "2016-02-10 19:54:28 INFO:GibbsSupervisedLDA:[ITER] 73,\tMAE:0.08,\tlog_likelihood:-947023.80\n", "2016-02-10 19:54:36 INFO:GibbsSupervisedLDA:[ITER] 74,\tMAE:0.08,\tlog_likelihood:-946379.40\n", "2016-02-10 19:54:45 INFO:GibbsSupervisedLDA:[ITER] 75,\tMAE:0.08,\tlog_likelihood:-946648.86\n", "2016-02-10 19:54:53 INFO:GibbsSupervisedLDA:[ITER] 76,\tMAE:0.08,\tlog_likelihood:-947049.11\n", "2016-02-10 19:55:01 INFO:GibbsSupervisedLDA:[ITER] 77,\tMAE:0.08,\tlog_likelihood:-946696.94\n", "2016-02-10 19:55:10 INFO:GibbsSupervisedLDA:[ITER] 78,\tMAE:0.08,\tlog_likelihood:-947052.36\n", "2016-02-10 19:55:18 INFO:GibbsSupervisedLDA:[ITER] 79,\tMAE:0.08,\tlog_likelihood:-945975.22\n", "2016-02-10 19:55:27 INFO:GibbsSupervisedLDA:[ITER] 80,\tMAE:0.08,\tlog_likelihood:-945828.06\n", "2016-02-10 19:55:35 INFO:GibbsSupervisedLDA:[ITER] 81,\tMAE:0.08,\tlog_likelihood:-945327.94\n", "2016-02-10 19:55:43 INFO:GibbsSupervisedLDA:[ITER] 82,\tMAE:0.08,\tlog_likelihood:-945460.64\n", "2016-02-10 19:55:52 INFO:GibbsSupervisedLDA:[ITER] 83,\tMAE:0.08,\tlog_likelihood:-944772.90\n", "2016-02-10 19:56:00 INFO:GibbsSupervisedLDA:[ITER] 84,\tMAE:0.08,\tlog_likelihood:-944241.21\n", "2016-02-10 19:56:09 INFO:GibbsSupervisedLDA:[ITER] 85,\tMAE:0.08,\tlog_likelihood:-944988.86\n", "2016-02-10 19:56:17 INFO:GibbsSupervisedLDA:[ITER] 86,\tMAE:0.08,\tlog_likelihood:-944814.22\n", "2016-02-10 19:56:26 INFO:GibbsSupervisedLDA:[ITER] 87,\tMAE:0.08,\tlog_likelihood:-945325.78\n", "2016-02-10 19:56:34 INFO:GibbsSupervisedLDA:[ITER] 88,\tMAE:0.08,\tlog_likelihood:-945067.91\n", "2016-02-10 19:56:43 INFO:GibbsSupervisedLDA:[ITER] 
89,\tMAE:0.08,\tlog_likelihood:-944895.82\n", "2016-02-10 19:56:51 INFO:GibbsSupervisedLDA:[ITER] 90,\tMAE:0.08,\tlog_likelihood:-944184.63\n", "2016-02-10 19:56:59 INFO:GibbsSupervisedLDA:[ITER] 91,\tMAE:0.08,\tlog_likelihood:-944688.10\n", "2016-02-10 19:57:08 INFO:GibbsSupervisedLDA:[ITER] 92,\tMAE:0.08,\tlog_likelihood:-944621.01\n", "2016-02-10 19:57:16 INFO:GibbsSupervisedLDA:[ITER] 93,\tMAE:0.08,\tlog_likelihood:-944162.71\n", "2016-02-10 19:57:25 INFO:GibbsSupervisedLDA:[ITER] 94,\tMAE:0.08,\tlog_likelihood:-943703.28\n", "2016-02-10 19:57:33 INFO:GibbsSupervisedLDA:[ITER] 95,\tMAE:0.08,\tlog_likelihood:-943792.98\n", "2016-02-10 19:57:42 INFO:GibbsSupervisedLDA:[ITER] 96,\tMAE:0.08,\tlog_likelihood:-944534.72\n", "2016-02-10 19:57:50 INFO:GibbsSupervisedLDA:[ITER] 97,\tMAE:0.08,\tlog_likelihood:-944806.54\n", "2016-02-10 19:57:59 INFO:GibbsSupervisedLDA:[ITER] 98,\tMAE:0.08,\tlog_likelihood:-944684.98\n", "2016-02-10 19:58:07 INFO:GibbsSupervisedLDA:[ITER] 99,\tMAE:0.08,\tlog_likelihood:-943800.65\n" ] } ], "source": [ "n_topic = 50\n", "r_var = 0.01\n", "\n", "model = GibbsSupervisedLDA(n_doc, n_voca, n_topic, sigma=r_var)\n", "model.fit(corpus, ratings)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Eta -0.234767820987 Topic 34 :\t bad,could,anything,get,never,movie,another,go,ca,script\n", "Eta -0.214572023979 Topic 41 :\t could,would,get,made,might,anything,many,good,look,never\n", "Eta -0.160506173159 Topic 38 :\t got,far,time,right,way,every,making,part,dull,guess\n", "Eta 0.0219171933193 Topic 49 :\t found,still,subject,intellectual,becomes,sense,tension,fine,talk,century\n", "Eta 0.0419401650241 Topic 10 :\t make,action,good,everything,acting,made,funny,dialogue,script,place\n", "Eta 0.0764470841209 Topic 27 :\t entertainment,value,lot,theme,violent,red,take,mostly,many,easy\n", "Eta 0.128975547132 Topic 21 :\t 
much,part,might,great,bad,getting,two,away,audience,least\n", "Eta 0.159533085017 Topic 36 :\t done,time,manner,dialogue,never,version,great,production,feeling,right\n", "Eta 0.184797336584 Topic 44 :\t opera,long,interesting,soap,go,telling,picture,chance,different,offering\n", "Eta 0.210652963064 Topic 31 :\t director,enough,done,many,shown,money,movie,must,give,blood\n", "Eta 0.232820782727 Topic 37 :\t good,look,everything,looking,also,romance,tale,new,thriller,find\n", "Eta 0.260212275042 Topic 11 :\t comedy,black,satire,comic,humor,love,going,cast,main,romantic\n", "Eta 0.339720354837 Topic 30 :\t mystery,thriller,suspense,plot,role,usual,give,horror,atmosphere,john\n", "Eta 0.346229992245 Topic 39 :\t role,case,watch,long,sense,back,problem,experience,well,type\n", "Eta 0.35836129898 Topic 28 :\t good,believe,without,go,god,church,back,interesting,still,much\n", "Eta 0.381319468906 Topic 23 :\t funny,also,way,think,humor,character,good,see,comedy,still\n", "Eta 0.384113637482 Topic 6 :\t special,make,movie,see,effects,would,first,something,though,horror\n", "Eta 0.396933018793 Topic 32 :\t director,scene,time,able,part,interesting,shot,interest,acting,set\n", "Eta 0.404554661869 Topic 5 :\t really,enough,way,could,something,much,movie,time,since,seem\n", "Eta 0.425037264045 Topic 33 :\t musical,music,best,french,dance,way,song,dancing,great,lively\n", "Eta 0.457287458236 Topic 22 :\t love,made,much,life,though,part,modern,times,acting,kind\n", "Eta 0.490046516159 Topic 40 :\t many,something,people,us,good,could,times,might,instead,small\n", "Eta 0.535690662439 Topic 14 :\t make,movie,great,would,work,comes,though,really,mark,audience\n", "Eta 0.544370355785 Topic 7 :\t get,really,anything,seeing,life,feel,difficult,see,enough,emotional\n", "Eta 0.545175280796 Topic 8 :\t killer,two,movie,together,couple,script,serial,though,best,ca\n", "Eta 0.565327709237 Topic 45 :\t almost,joan,making,sense,two,easily,care,lot,someone,person\n", "Eta 0.567068916427 Topic 
12 :\t would,made,still,main,audience,goes,music,man,subject,felt\n", "Eta 0.580004732236 Topic 24 :\t could,showing,see,house,come,life,say,psychological,going,city\n", "Eta 0.588440544275 Topic 25 :\t noir,character,dark,gave,role,hero,also,study,genre,protagonist\n", "Eta 0.589570607942 Topic 17 :\t much,also,best,camera,thought,together,two,work,added,every\n", "Eta 0.594475657781 Topic 16 :\t western,artist,good,man,scene,action,art,pleasing,plenty,still\n", "Eta 0.598417849919 Topic 26 :\t war,men,made,sides,place,show,battle,submarine,military,performance\n", "Eta 0.607731679912 Topic 35 :\t message,world,audience,political,people,point,showing,time,public,country\n", "Eta 0.635974893904 Topic 15 :\t sex,culture,first,seem,comes,sexual,play,performance,direction,sense\n", "Eta 0.64483388889 Topic 3 :\t noir,shot,dark,made,ending,feel,style,city,mood,little\n", "Eta 0.646089932446 Topic 19 :\t love,tale,director,hollywood,get,much,make,could,never,away\n", "Eta 0.65050684023 Topic 48 :\t way,make,see,though,life,know,family,might,someone,real\n", "Eta 0.66143777888 Topic 1 :\t hollywood,action,role,say,star,get,office,certain,original,come\n", "Eta 0.686787742783 Topic 46 :\t sense,think,thing,would,much,something,new,better,saying,feeling\n", "Eta 0.709956673591 Topic 9 :\t director,political,never,right,making,bad,without,taken,seen,plot\n", "Eta 0.712450862064 Topic 29 :\t something,violence,way,say,many,get,society,think,everyone,point\n", "Eta 0.735116092694 Topic 13 :\t character,performance,real,someone,also,without,director,work,well,interesting\n", "Eta 0.779470138429 Topic 42 :\t much,work,though,works,yet,well,rather,would,odd,make\n", "Eta 0.80049939286 Topic 18 :\t many,better,movie,look,way,last,white,directed,book,mood\n", "Eta 0.891479893452 Topic 47 :\t way,also,two,making,good,fun,much,scene,seem,romantic\n", "Eta 0.900176187326 Topic 0 :\t could,made,used,seen,violence,man,different,cast,scene,john\n", "Eta 1.0136094881 Topic 43 :\t 
people,us,human,see,history,part,right,point,message,place\n", "Eta 1.05242575044 Topic 2 :\t american,might,also,director,work,take,much,always,society,america\n", "Eta 1.50749544462 Topic 20 :\t screen,might,see,masterpiece,way,work,version,must,better,well\n", "Eta 1.8658611489 Topic 4 :\t many,great,life,performance,best,look,something,way,see,theme\n" ] } ], "source": [ "for ti in model.eta.argsort():\n", " top_words = get_top_words(model.TW, voca, ti, n_words=10)\n", " print('Eta', model.eta[ti] ,'Topic', ti ,':\\t', ','.join(top_words))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**The reviews are all about movies, so the topics do not seem to be clearly distinguishable. Nevertheless, the most negative topics contain words such as `bad`, `never`, and `dull`, and the most positive topics contain words like `great`, `best`, and `masterpiece`.**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" }, "toc": { "toc_cell": true, "toc_number_sections": true, "toc_threshold": 4, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 0 }