{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": "true" }, "source": [ "# Table of Contents\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Supervised Topic Model " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook shows an example of supervised topic model with the dataset provided by [Bo Pang](https://www.cs.cornell.edu/people/pabo/movie-review-data/)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "import logging\n", "\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from ptm import GibbsSupervisedLDA\n", "from ptm.nltk_corpus import get_ids_cnt\n", "from ptm.utils import convert_cnt_to_list, get_top_words\n", "\n", "%matplotlib inline \n", "\n", "logger = logging.getLogger('GibbsSupervisedLDA')\n", "logger.propagate = False" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read and tokenize moview review dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "datafolder = '../data/scaledata/Dennis+Schwartz/'\n", "rating_file = os.path.join(datafolder, 'rating.Dennis+Schwartz')\n", "review_file = os.path.join(datafolder, 'subj.Dennis+Schwartz')\n", "with open(rating_file, 'r') as f:\n", " ratings = np.array([float(line.strip()) for line in f.readlines()])\n", "with open(review_file, 'r') as f:\n", " reviews = [line for line in f.readlines()]\n", "\n", "voca, word_ids, word_cnt = get_ids_cnt(reviews)\n", "corpus = convert_cnt_to_list(word_ids, word_cnt)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "num doc 1027 num_voca 10526\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEACAYAAACwB81wAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEfZJREFUeJzt3XmsXGd9xvHvAyYRW01Y4tvGkFBCIUHQkBKTCiQGaENS\ntXHEH2kAVSylRaIsKqBio1Z2VwgSUamqVCpbDYKmKbRJ2E0II5qSBchCwG5qaG2Ciy8QlhJFIIf8\n+scch+nlJnfuMj5jv9+PNMqZc8/yeG7muee+c849qSokSUe/+/UdQJJ0eFj4ktQIC1+SGmHhS1Ij\nLHxJaoSFL0mNWLLwkxyb5LokNyb5SpK/6uYfl2RnkluTfDLJ+rF1tibZk2R3krOm+Q+QJE0mk5yH\nn+RBVXVnkvsD/w68HjgXuL2q3prkjcBxVbUlyanA+4EzgI3AlcDjyxP+JalXEw3pVNWd3eSx3Trf\nAzYDO7r5O4DzuulzgUuq6q6q2gvsATatVWBJ0spMVPhJ7pfkRuAAMKyqXcCGqpoHqKoDwPHd4icA\nt42tvr+bJ0nq0bpJFqqqu4GnJvk54JNJBsDCIRqHbCRphk1U+IdU1f8m+RjwNGA+yYaqmk8yB3yr\nW2w/8Oix1TZ28/6fJP6AkKQVqKqsZL1JztJ55KEzcJI8EPh14EbgCuAl3WIvBi7vpq8ALkhyTJLH\nAicD199L6Jl7bNu2rfcMZjJTi7nMNNljNSY5wv95YEeSMPoB8b6q+nQ3pn9pkpcB+4DzuxLfleRS\nYBdwEHhlrTalJGnVliz8qroFOH2R+d8Ffu1e1nkz8OZVp5MkrRmvtF1gMBj0HeFnmGkyZprcLOYy\n0/RNdOHVVHacONIjScuUhJrWh7aSpKODhS9JjbDwJakRFr4kNcLCl6RGWPiS1AgLX5IaYeFLUiMs\nfElqhIUvSY2w8CWpERa+JDXCwpekRlj4ktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREWviQ1wsKX\npEZY+JLUCAtfkhph4UtSIyx8SWqEhS9JjViy8JNsTHJVkq8kuSXJq7v525J8I8kN3ePssXW2JtmT\nZHeSs6b5D5AkTSZVdd8LJHPAXFXdlOQhwBeBzcBvAz+sqosWLH8K8AHgDGAjcCXw+FqwoyQLZ0mS\nlpCEqspK1l3yCL+qDlTVTd30HcBu4IRD+15klc3AJVV1V1XtBfYAm1YSTpK0dpY1hp/kJOA04Lpu\n1quS3JTknUnWd/NOAG4bW20/P/0BIR1R5uZOIkmvj7m5k/p+GXSUmLjwu+GcDwKv7Y70LwZ+sapO\nAw4Ab5tORKk/8/P7gOr1Mcogrd66SRZKso5R2b+vqi4HqKpvjy3yDuDD3fR+4NFjX9vYzfsZ27dv\nv2d6MBgwGAwmjC1JbRgOhwyHwzXZ1pIf2gIkeS/wnap63di8uao60E3/IXBGVb0wyanA+4GnMxrK\n+RR+aKsjVBJGR9q9psD3ig5ZzYe2Sx7hJ3kG8CLgliQ3Mvq//03AC5OcBtwN7AVeAVBVu5JcCuwC\nDgKvtNklqX8THeFPZcce4esI4BG+Zs1UT8uUJB0dLHxJaoSFL0mNsPAlqREWviQ1wsKXpEZY+JLU\nCAtfkhph4UtSIyx8SWqEhS9JjbDwJakRFr4kNcLCl6RGWPiS1AgLX5IaYeFLUiMsfElqhIUvSY2w\n8CWpERa+JDXCwpekRlj4ktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREWviQ1YsnCT7IxyVVJvpLk\nliSv6eYfl2RnkluTfDLJ+rF1tibZk2R3krOm+Q+QJE0mVXXfCyRzwFxV3ZTkIcAXgc3AS4Hbq+qt\nSd4IHFdVW5KcCrwfOAPYCFwJPL4W7CjJwlnSzEkC9P3/afC9okOSUFVZybpLHuFX1YGquqmbvgPY\nzajINwM7usV2AOd10+cCl1TVXVW1F9gDbFpJOEnS2lnWGH6Sk4DTgGuBDVU1D6MfCsDx3WInALeN\nrba/mydJ6tG6SRfshnM+CLy2qu5IsvB3zGX/zrl9+/Z7pgeDAYPBYLmbkKSj2nA4ZDgcrsm2lhzD\nB0iyDvgI8PGqens3bzcwqKr5bpz/M1V1SpItQFXVhd1ynwC2VdV1C7bpGL5mnmP4mjVTHcPvvBvY\ndajsO1cAL+mmXwxcPjb/giTHJHkscDJw/UrCSZLWziRn6TwD+CxwC6NDnQLexKjELwUeDewDzq+q\n73frbAV+FzjIaAho5yLb9QhfM88jfM2a1RzhTzSkMw0Wvo4EFr5mzeEY0pEkHeEsfElqhIUvSY2w\n8CWpERNfeCUdbnNzJzE/v6/vGNJRw7N0NLNm5QyZWcjge0WHeJaOJGlJFr4kNcLCl6RGWPiS1AgL\nX5IaYeFLUiMsfElqhIUvSY3wSltp5h3bXYTWnw0bTuTAgb29ZtDqeaWtZpZX2s5WBt+vs8ErbSVJ\nS7LwJakRFr4kNcLCl6RGWPiS1AgLX5IaYeFLUiMsfElqhIUvSY2w8CWpERa+JDXCwpekRixZ+Ene\nlWQ+yZfG5m1L8o0kN3SPs8e+tjXJniS7k5w1reCSpOWZ5Aj/PcDzFpl/UVWd3j0+AZDkFOB84BTg\nHODi9P13XSVJwASFX1VXA99b5EuLFflm4JKququq9gJ7gE2rSihJWhOrGcN/VZKbkrwzyfpu3gnA\nbWPL7O/mSZJ6ttI7Xl0M/FlVVZK/AN4GvHy5G9m+ffs904PBgMFgsMI4knR0Gg6HDIfDNdnWRHe8\nSnIi8OGqesp9fS3JFqCq6sLua58AtlXVdYus5x2vdJ+849VsZfD9OhsOxx2vwtiYfZK5sa89H/hy\nN30FcEGSY5I8FjgZuH4lwSRJa2vJIZ0kHwAGwCOSfB3YBjw7yWnA3cBe4BUAVbUryaXALuAg8EoP\n4yVpNngTc80sh3RmK4Pv19ngTcwlSUuy8CWpERa+JDXCwpekRlj4ktSIlV5pq6Pc3NxJzM/v6zuG\npDXkaZlalKdEmmFhBt+vs8HTMiVJS7LwJakRFr4kNcLCl6RGWPiS1AhPy5Q0gWPp+/bUGzacyIED\ne3vNcKTztEwtytMyzTCLGewMT8uUJE3AwpekRlj4ktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREW\nviQ1wsKXpEZY+JLUCAtfkhph4UtSIyx8SWrEkoWf5F1J5pN8aWzecUl2Jrk1ySeTrB/72tYke5Ls\nTnLWtIJLkpZnkiP89wDPWzBvC3BlVT0BuArYCpDkVOB84BTgHODi9H3XBEkSMEHhV9XVwPcWzN4M\n7OimdwDnddPnApdU1V1VtRfYA2xam6iSpNVY6Rj+8VU1D1BVB4Dju/knALeNLbe/mydJ6tla3dN2\nRfcd2759+z3Tg8GAwWCwRnEk6egwHA4ZDodrsq2J7mmb5ETgw1X1lO75bmBQVfNJ5oDPVNUpSbYA\nVVUXdst9AthWVdctsk3vaTvDvKetGWYxg51xeO5pm+5xyBXAS7rpFwOXj82/IMkxSR4LnAxcv5Jg\nkqS1teSQTpIPAAPgEUm+DmwD3gL8c5KXAfsYnZlDVe1KcimwCzgIvNLDeEmaDRMN6Uxlxw7pzDSH\ndMwwixnsjMMzpCNJOsJZ+JLUCAtfkhph4UtSI9bqwitJmrJj6ftPc23YcCIHDuztNcNqeJaOFuVZ\nOmYww+IZ+u4tz9KRJC3JwpekRlj4ktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREWviQ1wsKXpEZY\n+JLUCAtfkhph4UtSIyx8SWqEhS9JjbDwJakRFr4kNcLCl6RGWPiS1AgLX5IaYeFLUiMsfElqxLrV\nrJxkL/AD4G7gYFVtSnIc8E/AicBe4Pyq+sEqc0qSVmm1R/h3A4OqempVbermbQGurKonAFcBW1e5\nD0nSGlht4WeRbWwGdnTTO4DzVrkPSdIaWG3hF/CpJJ9P8vJu3oaqmgeoqgPA8avchyRpDaxqDB94\nRlV9M8mjgJ1JbmX0Q2DcwueSpB6sqvCr6pvdf7+d5DJgEzCfZENVzSeZA751b+tv3779nunBYMBg\nMFhNHEk66gyHQ4bD4ZpsK1UrOwBP8iDgflV1R5IHAzuBPwWeC3y3qi5M8kbguKrassj6tdJ9a/qS\n0P8vZ2Yww+xl6Lu3klBVWcm6qznC3wD8a5LqtvP+qtqZ5AvApUleBuwDzl/FPiRJa2TFR/ir3rFH\n+DPNI3wzmGHxDH331mqO8L3SVpIaYeFLUiMsfElqhIUvSY2w8CWpERa+JDXCwpekRlj4ktSI1f7x\nNE3B3NxJzM/v6zuGpKOMV9rOIK9yNYMZZjdD373llbaSpCVZ+JLUCAtfkhph4UtSIzxLR5Imdmx3\nUsWRycKXpIn9mFk4U2ilHNKRpEZY+JLUCAtfkhph4UtSIyx8SWqEhS9JjbDwJakRFr4kNcLCl6RG\nWPiS1AgLX5IaYeFLUiOmVvhJzk7yH0n+M8kbp7UfSdJkplL4Se4H/C3wPOBJwAuSPHEa+1prw+Gw\n7wiLGPYdYBHDvgMsYth3gEUM+w5wL4Z9B1jEsO8Aixj2HWBNTesIfxOwp6r2VdVB4BJg85T2taYs\n/EkN+w6wiGHfARYx7DvAvRj2HWARw74DLGLYd4A1Na2/h38CcNvY828w+iEw0y677DKuueYaLrro\not4yPOpRj+pt35KObt4AZcwFF7yIH//4Tnbu3Nl3FElac6la+7u3JDkT2F5VZ3fPtwBVVReOLdP3\nbWMk6YhUVSu67dW0Cv/+wK3Ac4FvAtcDL6iq3Wu+M0nSRKYypFNVP0nyKmAnow+G32XZS1K/pnKE\nL0maPVO/0naSC7CS/E2SPUluSnJa35mSPCHJ55L8KMnrpp1nwkwvTHJz97g6yZNnJNe5XaYbk3wh\nyXP6zjS23BlJDiZ5ft+ZkjwryfeT3NA9/rjvTN0yg+579+Ukn+k7U5I3dHluSHJLkruSPGwGcj0i\nyce7jrolyUtmINPDkvxL9/67NsmpS260qqb2YPQD5avAicADgJuAJy5Y5hzgo93004FrZyDTI4Ff\nAf4ceN008ywj05nA+m767Gm/TsvI9aCx6ScDX+0709hynwY+Ajy/70zAs4Arpv09W2am9cBXgBO6\n54/sO9OC5X8TuHJGXqttwJsPvU7A7cC6njO9FfiTbvoJk7xW0z7Cn+QCrM3AewGq6jpgfZINfWaq\nqu9U1ReBu6aYY7mZrq2qH3RPr2V0rcMs5Lpz7OlDgO/0nanzauCDwLemnGc5mVZ0ZsUUM70Q+FBV\n7YfR//czkGncC4B/nHKmSXMdAB7aTT8UuL2qptkPk2Q6FbgKoKpuBU5Kcp8X8ky78Be7AGthUS1c\nZv8iyxzuTIfbcjO9HPj4VBONTJQryXlJdgMfA17Td6YkvwCcV1V/x+Ep2Um/f7/aDQl8dKJfv6ef\n6ZeAhyf5TJLPJ/mdGcgEQJIHMvpN9kNTzjRprncAT0ryP8DNwGtnINPNwPMBkmwCHgNsvK+NeuHV\nESbJs4GXAs/sO8shVXUZcFmSZwLvY/TrZZ/+Ghgf8zycR9b35ovAY6rqziTnAJcxKtw+rQNOB54D\nPBi4Jsk1VfXVfmMB8FvA1VX1/b6DdLYCN1fVs5M8DvhUkqdU1R09ZnoL8PYkNwC3ADcCP7mvFaZd\n+PsZ/dQ5ZGM3b+Eyj15imcOd6XCbKFOSpwB/D5xdVd+blVyHVNXVSdYleURV3d5jpqcBlyQJo/HW\nc5IcrKor+so0XgxV9fEkFyd5eFV9t69MjI4av1NVPwJ+lOSzwC8zGjvuK9MhF3B4hnNgslzPAP4S\noKq+luS/gScCX+grU1X9EHjZoeddpv+6z61O+cOQ+/PTDx6OYfTBwykLlvkNfvqh7ZlM/0PbJTON\nLbsNeP008yzjdXoMsAc4c9p5lpnrcWPTpwNf6zvTguXfw/Q/tJ3kddowNr0J2DsDmZ4IfKpb9kGM\njhJP7ft7x+jD5NuBB07zNVrma/U2YNuh7yWj4ZaH95xpPfCAbvr3gH9YcruH4cU8m9FVt3uALd28\nVwC/P7bM33b/uJuB0/vONPYN/T7wXeDrwEN6zvSO7k1wA6Nf3a6f9us0Ya4/Ar7c5fo34Gl9Z1qw\n7LuZcuFP+Dr9Qfc63Qh8Dnh635m6529gdKbOl4BXz0imFwMfmHaWZX7/Hgl8uOuoLzH6ywF9Zzqz\n+/puRicorF9qm154JUmN8BaHktQIC1+SGmHhS1IjLHxJaoSFL0mNsPAlqREWviQ1wsKXpEb8H8B6\n+LYT4HUJAAAAAElFTkSuQmCC\n", "text/plain": [ "