This notebook shows an example of a supervised topic model, using the dataset provided by Bo Pang.
import os
import logging
import numpy as np
import matplotlib.pyplot as plt
from ptm import GibbsSupervisedLDA
from ptm.nltk_corpus import get_ids_cnt
from ptm.utils import convert_cnt_to_list, get_top_words
%matplotlib inline
# Fetch the logger named 'GibbsSupervisedLDA' and stop its records from
# propagating to ancestor loggers.
# NOTE(review): presumably this is the logger the sampler writes its [ITER]
# progress lines to -- confirm against the ptm package.
logger = logging.getLogger('GibbsSupervisedLDA')
logger.propagate = False

# Paths to the rating file and the review file inside the Dennis+Schwartz
# data folder (relative to the notebook's working directory).
datafolder = '../data/scaledata/Dennis+Schwartz/'
rating_file = os.path.join(datafolder, 'rating.Dennis+Schwartz')
review_file = os.path.join(datafolder, 'subj.Dennis+Schwartz')

# Each line of the rating file is parsed as one float.
with open(rating_file, 'r') as f:
    ratings = np.array([float(line.strip()) for line in f])

# Each line of the review file is kept verbatim (trailing newline included).
with open(review_file, 'r') as f:
    reviews = f.readlines()

# NOTE(review): presumably get_ids_cnt tokenises the reviews into a vocabulary
# plus per-document word ids/counts, and convert_cnt_to_list expands those
# into per-document word lists -- confirm against ptm.
voca, word_ids, word_cnt = get_ids_cnt(reviews)
corpus = convert_cnt_to_list(word_ids, word_cnt)
n_doc = len(corpus)
n_voca = voca.size
print('num doc', n_doc, 'num_voca', n_voca)

# Show how the ratings are distributed, then report their range.
plt.hist(ratings, bins=9)
plt.show()
print('max rating', np.max(ratings), '\tmin rating', np.min(ratings))
num doc 1027 num_voca 10526
max rating 0.9 min rating 0.1
# Number of topics handed to the model constructor.
n_topic = 50
# Value passed as the model's `sigma` keyword argument.
# NOTE(review): the name suggests this is the variance of the rating
# response -- confirm against GibbsSupervisedLDA.
r_var = 0.01
# Build the sampler from the corpus dimensions, then fit it on the documents
# together with their ratings.
model = GibbsSupervisedLDA(n_doc, n_voca, n_topic, sigma=r_var)
model.fit(corpus, ratings)
2016-02-10 19:43:04 INFO:GibbsSupervisedLDA:[ITER] 0, MAE:0.07, log_likelihood:-1200104.10 2016-02-10 19:43:13 INFO:GibbsSupervisedLDA:[ITER] 1, MAE:0.07, log_likelihood:-1115555.47 2016-02-10 19:43:22 INFO:GibbsSupervisedLDA:[ITER] 2, MAE:0.07, log_likelihood:-1073563.29 2016-02-10 19:43:32 INFO:GibbsSupervisedLDA:[ITER] 3, MAE:0.07, log_likelihood:-1048468.36 2016-02-10 19:43:41 INFO:GibbsSupervisedLDA:[ITER] 4, MAE:0.07, log_likelihood:-1032117.31 2016-02-10 19:43:49 INFO:GibbsSupervisedLDA:[ITER] 5, MAE:0.08, log_likelihood:-1020609.06 2016-02-10 19:43:58 INFO:GibbsSupervisedLDA:[ITER] 6, MAE:0.08, log_likelihood:-1012588.62 2016-02-10 19:44:07 INFO:GibbsSupervisedLDA:[ITER] 7, MAE:0.08, log_likelihood:-1006078.19 2016-02-10 19:44:17 INFO:GibbsSupervisedLDA:[ITER] 8, MAE:0.08, log_likelihood:-1000468.34 2016-02-10 19:44:26 INFO:GibbsSupervisedLDA:[ITER] 9, MAE:0.08, log_likelihood:-996064.31 2016-02-10 19:44:35 INFO:GibbsSupervisedLDA:[ITER] 10, MAE:0.08, log_likelihood:-991448.64 2016-02-10 19:44:44 INFO:GibbsSupervisedLDA:[ITER] 11, MAE:0.09, log_likelihood:-989309.68 2016-02-10 19:44:53 INFO:GibbsSupervisedLDA:[ITER] 12, MAE:0.08, log_likelihood:-986026.11 2016-02-10 19:45:02 INFO:GibbsSupervisedLDA:[ITER] 13, MAE:0.08, log_likelihood:-983278.74 2016-02-10 19:45:12 INFO:GibbsSupervisedLDA:[ITER] 14, MAE:0.08, log_likelihood:-980648.03 2016-02-10 19:45:20 INFO:GibbsSupervisedLDA:[ITER] 15, MAE:0.09, log_likelihood:-978593.27 2016-02-10 19:45:29 INFO:GibbsSupervisedLDA:[ITER] 16, MAE:0.09, log_likelihood:-977028.47 2016-02-10 19:45:38 INFO:GibbsSupervisedLDA:[ITER] 17, MAE:0.09, log_likelihood:-975142.51 2016-02-10 19:45:47 INFO:GibbsSupervisedLDA:[ITER] 18, MAE:0.09, log_likelihood:-974758.06 2016-02-10 19:45:57 INFO:GibbsSupervisedLDA:[ITER] 19, MAE:0.08, log_likelihood:-972879.10 2016-02-10 19:46:05 INFO:GibbsSupervisedLDA:[ITER] 20, MAE:0.09, log_likelihood:-971601.43 2016-02-10 19:46:15 INFO:GibbsSupervisedLDA:[ITER] 21, MAE:0.09, 
log_likelihood:-970929.30 2016-02-10 19:46:24 INFO:GibbsSupervisedLDA:[ITER] 22, MAE:0.08, log_likelihood:-969826.58 2016-02-10 19:46:34 INFO:GibbsSupervisedLDA:[ITER] 23, MAE:0.08, log_likelihood:-968402.00 2016-02-10 19:46:43 INFO:GibbsSupervisedLDA:[ITER] 24, MAE:0.08, log_likelihood:-968001.73 2016-02-10 19:46:52 INFO:GibbsSupervisedLDA:[ITER] 25, MAE:0.08, log_likelihood:-967423.17 2016-02-10 19:47:02 INFO:GibbsSupervisedLDA:[ITER] 26, MAE:0.08, log_likelihood:-966159.14 2016-02-10 19:47:12 INFO:GibbsSupervisedLDA:[ITER] 27, MAE:0.08, log_likelihood:-965307.04 2016-02-10 19:47:21 INFO:GibbsSupervisedLDA:[ITER] 28, MAE:0.08, log_likelihood:-964563.86 2016-02-10 19:47:31 INFO:GibbsSupervisedLDA:[ITER] 29, MAE:0.08, log_likelihood:-963570.42 2016-02-10 19:47:42 INFO:GibbsSupervisedLDA:[ITER] 30, MAE:0.08, log_likelihood:-963055.59 2016-02-10 19:47:52 INFO:GibbsSupervisedLDA:[ITER] 31, MAE:0.08, log_likelihood:-962735.30 2016-02-10 19:48:02 INFO:GibbsSupervisedLDA:[ITER] 32, MAE:0.08, log_likelihood:-961139.44 2016-02-10 19:48:12 INFO:GibbsSupervisedLDA:[ITER] 33, MAE:0.08, log_likelihood:-960502.03 2016-02-10 19:48:22 INFO:GibbsSupervisedLDA:[ITER] 34, MAE:0.08, log_likelihood:-959892.16 2016-02-10 19:48:32 INFO:GibbsSupervisedLDA:[ITER] 35, MAE:0.08, log_likelihood:-959321.29 2016-02-10 19:48:42 INFO:GibbsSupervisedLDA:[ITER] 36, MAE:0.08, log_likelihood:-959294.60 2016-02-10 19:48:51 INFO:GibbsSupervisedLDA:[ITER] 37, MAE:0.08, log_likelihood:-958416.57 2016-02-10 19:49:00 INFO:GibbsSupervisedLDA:[ITER] 38, MAE:0.08, log_likelihood:-958708.81 2016-02-10 19:49:09 INFO:GibbsSupervisedLDA:[ITER] 39, MAE:0.08, log_likelihood:-958331.49 2016-02-10 19:49:18 INFO:GibbsSupervisedLDA:[ITER] 40, MAE:0.08, log_likelihood:-957697.51 2016-02-10 19:49:28 INFO:GibbsSupervisedLDA:[ITER] 41, MAE:0.08, log_likelihood:-956916.19 2016-02-10 19:49:37 INFO:GibbsSupervisedLDA:[ITER] 42, MAE:0.08, log_likelihood:-955973.23 2016-02-10 19:49:46 INFO:GibbsSupervisedLDA:[ITER] 43, 
MAE:0.08, log_likelihood:-955332.56 2016-02-10 19:49:54 INFO:GibbsSupervisedLDA:[ITER] 44, MAE:0.08, log_likelihood:-955296.00 2016-02-10 19:50:03 INFO:GibbsSupervisedLDA:[ITER] 45, MAE:0.08, log_likelihood:-955303.42 2016-02-10 19:50:12 INFO:GibbsSupervisedLDA:[ITER] 46, MAE:0.08, log_likelihood:-954540.14 2016-02-10 19:50:22 INFO:GibbsSupervisedLDA:[ITER] 47, MAE:0.08, log_likelihood:-954232.94 2016-02-10 19:50:32 INFO:GibbsSupervisedLDA:[ITER] 48, MAE:0.08, log_likelihood:-952474.63 2016-02-10 19:50:41 INFO:GibbsSupervisedLDA:[ITER] 49, MAE:0.08, log_likelihood:-952360.92 2016-02-10 19:50:51 INFO:GibbsSupervisedLDA:[ITER] 50, MAE:0.08, log_likelihood:-953415.19 2016-02-10 19:51:00 INFO:GibbsSupervisedLDA:[ITER] 51, MAE:0.08, log_likelihood:-952345.91 2016-02-10 19:51:10 INFO:GibbsSupervisedLDA:[ITER] 52, MAE:0.08, log_likelihood:-952259.97 2016-02-10 19:51:20 INFO:GibbsSupervisedLDA:[ITER] 53, MAE:0.08, log_likelihood:-952232.09 2016-02-10 19:51:30 INFO:GibbsSupervisedLDA:[ITER] 54, MAE:0.08, log_likelihood:-952488.02 2016-02-10 19:51:40 INFO:GibbsSupervisedLDA:[ITER] 55, MAE:0.08, log_likelihood:-951400.56 2016-02-10 19:51:49 INFO:GibbsSupervisedLDA:[ITER] 56, MAE:0.08, log_likelihood:-951612.91 2016-02-10 19:51:59 INFO:GibbsSupervisedLDA:[ITER] 57, MAE:0.08, log_likelihood:-951843.57 2016-02-10 19:52:08 INFO:GibbsSupervisedLDA:[ITER] 58, MAE:0.08, log_likelihood:-951312.42 2016-02-10 19:52:18 INFO:GibbsSupervisedLDA:[ITER] 59, MAE:0.08, log_likelihood:-951363.82 2016-02-10 19:52:28 INFO:GibbsSupervisedLDA:[ITER] 60, MAE:0.08, log_likelihood:-950682.99 2016-02-10 19:52:38 INFO:GibbsSupervisedLDA:[ITER] 61, MAE:0.08, log_likelihood:-950734.54 2016-02-10 19:52:47 INFO:GibbsSupervisedLDA:[ITER] 62, MAE:0.08, log_likelihood:-950539.13 2016-02-10 19:52:57 INFO:GibbsSupervisedLDA:[ITER] 63, MAE:0.08, log_likelihood:-950733.91 2016-02-10 19:53:07 INFO:GibbsSupervisedLDA:[ITER] 64, MAE:0.08, log_likelihood:-949927.04 2016-02-10 19:53:16 INFO:GibbsSupervisedLDA:[ITER] 
65, MAE:0.08, log_likelihood:-949374.16 2016-02-10 19:53:25 INFO:GibbsSupervisedLDA:[ITER] 66, MAE:0.08, log_likelihood:-949330.23 2016-02-10 19:53:35 INFO:GibbsSupervisedLDA:[ITER] 67, MAE:0.08, log_likelihood:-948267.36 2016-02-10 19:53:44 INFO:GibbsSupervisedLDA:[ITER] 68, MAE:0.08, log_likelihood:-949421.16 2016-02-10 19:53:53 INFO:GibbsSupervisedLDA:[ITER] 69, MAE:0.08, log_likelihood:-948148.44 2016-02-10 19:54:02 INFO:GibbsSupervisedLDA:[ITER] 70, MAE:0.08, log_likelihood:-947131.47 2016-02-10 19:54:11 INFO:GibbsSupervisedLDA:[ITER] 71, MAE:0.08, log_likelihood:-947165.15 2016-02-10 19:54:19 INFO:GibbsSupervisedLDA:[ITER] 72, MAE:0.08, log_likelihood:-947004.44 2016-02-10 19:54:28 INFO:GibbsSupervisedLDA:[ITER] 73, MAE:0.08, log_likelihood:-947023.80 2016-02-10 19:54:36 INFO:GibbsSupervisedLDA:[ITER] 74, MAE:0.08, log_likelihood:-946379.40 2016-02-10 19:54:45 INFO:GibbsSupervisedLDA:[ITER] 75, MAE:0.08, log_likelihood:-946648.86 2016-02-10 19:54:53 INFO:GibbsSupervisedLDA:[ITER] 76, MAE:0.08, log_likelihood:-947049.11 2016-02-10 19:55:01 INFO:GibbsSupervisedLDA:[ITER] 77, MAE:0.08, log_likelihood:-946696.94 2016-02-10 19:55:10 INFO:GibbsSupervisedLDA:[ITER] 78, MAE:0.08, log_likelihood:-947052.36 2016-02-10 19:55:18 INFO:GibbsSupervisedLDA:[ITER] 79, MAE:0.08, log_likelihood:-945975.22 2016-02-10 19:55:27 INFO:GibbsSupervisedLDA:[ITER] 80, MAE:0.08, log_likelihood:-945828.06 2016-02-10 19:55:35 INFO:GibbsSupervisedLDA:[ITER] 81, MAE:0.08, log_likelihood:-945327.94 2016-02-10 19:55:43 INFO:GibbsSupervisedLDA:[ITER] 82, MAE:0.08, log_likelihood:-945460.64 2016-02-10 19:55:52 INFO:GibbsSupervisedLDA:[ITER] 83, MAE:0.08, log_likelihood:-944772.90 2016-02-10 19:56:00 INFO:GibbsSupervisedLDA:[ITER] 84, MAE:0.08, log_likelihood:-944241.21 2016-02-10 19:56:09 INFO:GibbsSupervisedLDA:[ITER] 85, MAE:0.08, log_likelihood:-944988.86 2016-02-10 19:56:17 INFO:GibbsSupervisedLDA:[ITER] 86, MAE:0.08, log_likelihood:-944814.22 2016-02-10 19:56:26 
INFO:GibbsSupervisedLDA:[ITER] 87, MAE:0.08, log_likelihood:-945325.78 2016-02-10 19:56:34 INFO:GibbsSupervisedLDA:[ITER] 88, MAE:0.08, log_likelihood:-945067.91 2016-02-10 19:56:43 INFO:GibbsSupervisedLDA:[ITER] 89, MAE:0.08, log_likelihood:-944895.82 2016-02-10 19:56:51 INFO:GibbsSupervisedLDA:[ITER] 90, MAE:0.08, log_likelihood:-944184.63 2016-02-10 19:56:59 INFO:GibbsSupervisedLDA:[ITER] 91, MAE:0.08, log_likelihood:-944688.10 2016-02-10 19:57:08 INFO:GibbsSupervisedLDA:[ITER] 92, MAE:0.08, log_likelihood:-944621.01 2016-02-10 19:57:16 INFO:GibbsSupervisedLDA:[ITER] 93, MAE:0.08, log_likelihood:-944162.71 2016-02-10 19:57:25 INFO:GibbsSupervisedLDA:[ITER] 94, MAE:0.08, log_likelihood:-943703.28 2016-02-10 19:57:33 INFO:GibbsSupervisedLDA:[ITER] 95, MAE:0.08, log_likelihood:-943792.98 2016-02-10 19:57:42 INFO:GibbsSupervisedLDA:[ITER] 96, MAE:0.08, log_likelihood:-944534.72 2016-02-10 19:57:50 INFO:GibbsSupervisedLDA:[ITER] 97, MAE:0.08, log_likelihood:-944806.54 2016-02-10 19:57:59 INFO:GibbsSupervisedLDA:[ITER] 98, MAE:0.08, log_likelihood:-944684.98 2016-02-10 19:58:07 INFO:GibbsSupervisedLDA:[ITER] 99, MAE:0.08, log_likelihood:-943800.65
# Visit the topics in ascending order of their eta value and print, for each
# one, its eta, its index, and its ten top words joined by commas.
for ti in model.eta.argsort():
    top_words = get_top_words(model.TW, voca, ti, n_words=10)
    print('Eta', model.eta[ti], 'Topic', ti, ':\t', ','.join(top_words))
Eta -0.234767820987 Topic 34 : bad,could,anything,get,never,movie,another,go,ca,script Eta -0.214572023979 Topic 41 : could,would,get,made,might,anything,many,good,look,never Eta -0.160506173159 Topic 38 : got,far,time,right,way,every,making,part,dull,guess Eta 0.0219171933193 Topic 49 : found,still,subject,intellectual,becomes,sense,tension,fine,talk,century Eta 0.0419401650241 Topic 10 : make,action,good,everything,acting,made,funny,dialogue,script,place Eta 0.0764470841209 Topic 27 : entertainment,value,lot,theme,violent,red,take,mostly,many,easy Eta 0.128975547132 Topic 21 : much,part,might,great,bad,getting,two,away,audience,least Eta 0.159533085017 Topic 36 : done,time,manner,dialogue,never,version,great,production,feeling,right Eta 0.184797336584 Topic 44 : opera,long,interesting,soap,go,telling,picture,chance,different,offering Eta 0.210652963064 Topic 31 : director,enough,done,many,shown,money,movie,must,give,blood Eta 0.232820782727 Topic 37 : good,look,everything,looking,also,romance,tale,new,thriller,find Eta 0.260212275042 Topic 11 : comedy,black,satire,comic,humor,love,going,cast,main,romantic Eta 0.339720354837 Topic 30 : mystery,thriller,suspense,plot,role,usual,give,horror,atmosphere,john Eta 0.346229992245 Topic 39 : role,case,watch,long,sense,back,problem,experience,well,type Eta 0.35836129898 Topic 28 : good,believe,without,go,god,church,back,interesting,still,much Eta 0.381319468906 Topic 23 : funny,also,way,think,humor,character,good,see,comedy,still Eta 0.384113637482 Topic 6 : special,make,movie,see,effects,would,first,something,though,horror Eta 0.396933018793 Topic 32 : director,scene,time,able,part,interesting,shot,interest,acting,set Eta 0.404554661869 Topic 5 : really,enough,way,could,something,much,movie,time,since,seem Eta 0.425037264045 Topic 33 : musical,music,best,french,dance,way,song,dancing,great,lively Eta 0.457287458236 Topic 22 : love,made,much,life,though,part,modern,times,acting,kind Eta 0.490046516159 Topic 40 : 
many,something,people,us,good,could,times,might,instead,small Eta 0.535690662439 Topic 14 : make,movie,great,would,work,comes,though,really,mark,audience Eta 0.544370355785 Topic 7 : get,really,anything,seeing,life,feel,difficult,see,enough,emotional Eta 0.545175280796 Topic 8 : killer,two,movie,together,couple,script,serial,though,best,ca Eta 0.565327709237 Topic 45 : almost,joan,making,sense,two,easily,care,lot,someone,person Eta 0.567068916427 Topic 12 : would,made,still,main,audience,goes,music,man,subject,felt Eta 0.580004732236 Topic 24 : could,showing,see,house,come,life,say,psychological,going,city Eta 0.588440544275 Topic 25 : noir,character,dark,gave,role,hero,also,study,genre,protagonist Eta 0.589570607942 Topic 17 : much,also,best,camera,thought,together,two,work,added,every Eta 0.594475657781 Topic 16 : western,artist,good,man,scene,action,art,pleasing,plenty,still Eta 0.598417849919 Topic 26 : war,men,made,sides,place,show,battle,submarine,military,performance Eta 0.607731679912 Topic 35 : message,world,audience,political,people,point,showing,time,public,country Eta 0.635974893904 Topic 15 : sex,culture,first,seem,comes,sexual,play,performance,direction,sense Eta 0.64483388889 Topic 3 : noir,shot,dark,made,ending,feel,style,city,mood,little Eta 0.646089932446 Topic 19 : love,tale,director,hollywood,get,much,make,could,never,away Eta 0.65050684023 Topic 48 : way,make,see,though,life,know,family,might,someone,real Eta 0.66143777888 Topic 1 : hollywood,action,role,say,star,get,office,certain,original,come Eta 0.686787742783 Topic 46 : sense,think,thing,would,much,something,new,better,saying,feeling Eta 0.709956673591 Topic 9 : director,political,never,right,making,bad,without,taken,seen,plot Eta 0.712450862064 Topic 29 : something,violence,way,say,many,get,society,think,everyone,point Eta 0.735116092694 Topic 13 : character,performance,real,someone,also,without,director,work,well,interesting Eta 0.779470138429 Topic 42 : 
much,work,though,works,yet,well,rather,would,odd,make Eta 0.80049939286 Topic 18 : many,better,movie,look,way,last,white,directed,book,mood Eta 0.891479893452 Topic 47 : way,also,two,making,good,fun,much,scene,seem,romantic Eta 0.900176187326 Topic 0 : could,made,used,seen,violence,man,different,cast,scene,john Eta 1.0136094881 Topic 43 : people,us,human,see,history,part,right,point,message,place Eta 1.05242575044 Topic 2 : american,might,also,director,work,take,much,always,society,america Eta 1.50749544462 Topic 20 : screen,might,see,masterpiece,way,work,version,must,better,well Eta 1.8658611489 Topic 4 : many,great,life,performance,best,look,something,way,see,theme
Each review is about a single movie, so the topics do not seem to be clearly distinguishable. At the very least, however, the most negative topics contain words such as `bad`, `never`, and `dull`, and the most positive topics contain words like `great`, `best`, and `masterpiece`.