# "The Author-Topic Model for Authors and Documents" by Rosen-Zvi, et al. (UAI 2004)
import pickle
import logging
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from ptm import AuthorTopicModel
from ptm.utils import convert_cnt_to_list, get_top_words
# Silence the model's training log from bubbling up to the root logger;
# the per-iteration likelihood output is pasted below instead.
logger = logging.getLogger('AuthorTopicModel')
logger.propagate = False
%matplotlib inline
# The original dataset is from: https://people.cs.umass.edu/~mccallum/data.html
def _load_pickle(path):
    """Load a single pickled object from *path*, closing the file afterwards.

    The original code used ``pickle.load(open(path, 'rb'))``, which leaks
    the file handle; a ``with`` block guarantees it is closed.
    NOTE(review): ``pickle.load`` is unsafe on untrusted files — these are
    assumed to be locally preprocessed, trusted data files.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Preprocessed Cora corpus: per-document word ids and counts, plus
# document-author and author-id-to-name mappings and the vocabulary.
doc_ids = _load_pickle('../data/cora/doc_ids.pkl')
doc_cnt = _load_pickle('../data/cora/doc_cnt.pkl')
doc_author = _load_pickle('../data/cora/doc_authorid.pkl')
author_name = _load_pickle('../data/cora/authorid_authorname.pkl')
voca = _load_pickle('../data/cora/voca.pkl')

# Expand (word id, count) pairs into flat word lists per document.
corpus = convert_cnt_to_list(doc_ids, doc_cnt)

# Model dimensions and training budget.
n_doc = len(corpus)
n_topic = 10
n_author = len(author_name)
n_voca = len(voca)
max_iter = 50

model = AuthorTopicModel(n_doc, n_voca, n_topic, n_author)
model.fit(corpus, doc_author, max_iter=max_iter)
2016-02-14 22:04:27 INFO:AuthorTopicModel:[INIT] 0 elapsed_time:63.54 log_likelihood:-10863554.38 2016-02-14 22:05:30 INFO:AuthorTopicModel:[INIT] 1 elapsed_time:63.58 log_likelihood:-10647481.99 2016-02-14 22:06:34 INFO:AuthorTopicModel:[INIT] 2 elapsed_time:63.74 log_likelihood:-10492422.12 2016-02-14 22:07:38 INFO:AuthorTopicModel:[INIT] 3 elapsed_time:63.77 log_likelihood:-10357087.07 2016-02-14 22:08:40 INFO:AuthorTopicModel:[INIT] 4 elapsed_time:62.19 log_likelihood:-10229123.70 2016-02-14 22:09:35 INFO:AuthorTopicModel:[INIT] 5 elapsed_time:54.96 log_likelihood:-10096179.15 2016-02-14 22:10:30 INFO:AuthorTopicModel:[INIT] 6 elapsed_time:54.89 log_likelihood:-9943646.09 2016-02-14 22:11:25 INFO:AuthorTopicModel:[INIT] 7 elapsed_time:54.84 log_likelihood:-9769853.39 2016-02-14 22:12:22 INFO:AuthorTopicModel:[INIT] 8 elapsed_time:57.85 log_likelihood:-9598314.53 2016-02-14 22:13:23 INFO:AuthorTopicModel:[INIT] 9 elapsed_time:60.43 log_likelihood:-9453899.31 2016-02-14 22:14:22 INFO:AuthorTopicModel:[INIT] 10 elapsed_time:59.53 log_likelihood:-9338106.69 2016-02-14 22:15:21 INFO:AuthorTopicModel:[INIT] 11 elapsed_time:58.89 log_likelihood:-9244523.47 2016-02-14 22:16:21 INFO:AuthorTopicModel:[INIT] 12 elapsed_time:59.88 log_likelihood:-9173893.80 2016-02-14 22:17:23 INFO:AuthorTopicModel:[INIT] 13 elapsed_time:61.79 log_likelihood:-9116831.15 2016-02-14 22:18:24 INFO:AuthorTopicModel:[INIT] 14 elapsed_time:61.34 log_likelihood:-9068511.26 2016-02-14 22:19:24 INFO:AuthorTopicModel:[INIT] 15 elapsed_time:59.51 log_likelihood:-9030260.41 2016-02-14 22:20:26 INFO:AuthorTopicModel:[INIT] 16 elapsed_time:61.86 log_likelihood:-8996108.24 2016-02-14 22:21:27 INFO:AuthorTopicModel:[INIT] 17 elapsed_time:61.55 log_likelihood:-8964674.92 2016-02-14 22:22:31 INFO:AuthorTopicModel:[INIT] 18 elapsed_time:63.73 log_likelihood:-8941120.13 2016-02-14 22:23:34 INFO:AuthorTopicModel:[INIT] 19 elapsed_time:63.34 log_likelihood:-8921381.73 2016-02-14 22:24:36 
INFO:AuthorTopicModel:[INIT] 20 elapsed_time:61.47 log_likelihood:-8903072.00 2016-02-14 22:25:37 INFO:AuthorTopicModel:[INIT] 21 elapsed_time:60.91 log_likelihood:-8886887.71 2016-02-14 22:26:39 INFO:AuthorTopicModel:[INIT] 22 elapsed_time:62.48 log_likelihood:-8872823.62 2016-02-14 22:27:43 INFO:AuthorTopicModel:[INIT] 23 elapsed_time:63.59 log_likelihood:-8856336.04 2016-02-14 22:28:47 INFO:AuthorTopicModel:[INIT] 24 elapsed_time:63.87 log_likelihood:-8845108.89 2016-02-14 22:29:49 INFO:AuthorTopicModel:[INIT] 25 elapsed_time:62.69 log_likelihood:-8834276.61 2016-02-14 22:30:49 INFO:AuthorTopicModel:[INIT] 26 elapsed_time:59.47 log_likelihood:-8823068.52 2016-02-14 22:31:50 INFO:AuthorTopicModel:[INIT] 27 elapsed_time:61.48 log_likelihood:-8814344.53 2016-02-14 22:32:50 INFO:AuthorTopicModel:[INIT] 28 elapsed_time:59.72 log_likelihood:-8806725.65 2016-02-14 22:33:49 INFO:AuthorTopicModel:[INIT] 29 elapsed_time:58.68 log_likelihood:-8799515.99 2016-02-14 22:34:50 INFO:AuthorTopicModel:[INIT] 30 elapsed_time:61.03 log_likelihood:-8792988.33 2016-02-14 22:35:50 INFO:AuthorTopicModel:[INIT] 31 elapsed_time:59.95 log_likelihood:-8787366.00 2016-02-14 22:36:52 INFO:AuthorTopicModel:[INIT] 32 elapsed_time:62.10 log_likelihood:-8780941.95 2016-02-14 22:37:55 INFO:AuthorTopicModel:[INIT] 33 elapsed_time:62.92 log_likelihood:-8776050.13 2016-02-14 22:38:57 INFO:AuthorTopicModel:[INIT] 34 elapsed_time:62.04 log_likelihood:-8771034.29 2016-02-14 22:39:56 INFO:AuthorTopicModel:[INIT] 35 elapsed_time:59.79 log_likelihood:-8763705.60 2016-02-14 22:40:57 INFO:AuthorTopicModel:[INIT] 36 elapsed_time:60.35 log_likelihood:-8759335.53 2016-02-14 22:41:56 INFO:AuthorTopicModel:[INIT] 37 elapsed_time:58.78 log_likelihood:-8755129.16 2016-02-14 22:42:54 INFO:AuthorTopicModel:[INIT] 38 elapsed_time:58.65 log_likelihood:-8754418.15 2016-02-14 22:43:51 INFO:AuthorTopicModel:[INIT] 39 elapsed_time:56.47 log_likelihood:-8747837.15 2016-02-14 22:44:49 INFO:AuthorTopicModel:[INIT] 40 
elapsed_time:58.43 log_likelihood:-8743544.53 2016-02-14 22:45:50 INFO:AuthorTopicModel:[INIT] 41 elapsed_time:60.66 log_likelihood:-8738763.21 2016-02-14 22:46:51 INFO:AuthorTopicModel:[INIT] 42 elapsed_time:61.46 log_likelihood:-8733850.16 2016-02-14 22:47:54 INFO:AuthorTopicModel:[INIT] 43 elapsed_time:63.01 log_likelihood:-8733093.74 2016-02-14 22:48:51 INFO:AuthorTopicModel:[INIT] 44 elapsed_time:56.68 log_likelihood:-8732169.09 2016-02-14 22:49:49 INFO:AuthorTopicModel:[INIT] 45 elapsed_time:58.09 log_likelihood:-8728986.80 2016-02-14 22:50:49 INFO:AuthorTopicModel:[INIT] 46 elapsed_time:60.09 log_likelihood:-8727756.31 2016-02-14 22:51:51 INFO:AuthorTopicModel:[INIT] 47 elapsed_time:61.90 log_likelihood:-8726765.65 2016-02-14 22:52:53 INFO:AuthorTopicModel:[INIT] 48 elapsed_time:61.55 log_likelihood:-8720959.99 2016-02-14 22:53:54 INFO:AuthorTopicModel:[INIT] 49 elapsed_time:60.93 log_likelihood:-8718195.57
# Show the ten highest-probability words for each learned topic.
for topic_idx in range(n_topic):
    words = get_top_words(model.TW, voca, topic_idx, 10)
    print('topic ', topic_idx, ','.join(words))
topic 0 algorithm,problem,time,model,function,bound,show,result,optimal,complexity topic 1 network,service,realtime,control,performance,application,paper,routing,traffic,packet topic 2 data,query,database,information,algorithm,rule,view,technique,document,structure topic 3 system,distributed,protocol,communication,application,message,file,paper,performance,network topic 4 learning,network,system,method,approach,task,paper,problem,model,neural topic 5 image,model,object,using,surface,motion,robot,algorithm,method,visual topic 6 parallel,program,performance,memory,data,processor,analysis,application,compiler,machine topic 7 system,design,software,language,application,paper,research,tool,object,support topic 8 agent,system,model,language,planning,logic,constraint,plan,action,paper topic 9 problem,method,algorithm,linear,function,result,paper,solution,technique,matrix
# Per-topic weight of author 7, normalized to a probability distribution
# and drawn as a bar chart; each bar is labelled with its topic's top words.
author_id = 7
fig = plt.figure(figsize=(12, 6))
author_weights = model.AT[author_id]
plt.bar(range(n_topic), author_weights / np.sum(author_weights))
plt.title(author_name[author_id])
# Ticks are shifted by 0.5 to sit under the bar centers (old-matplotlib bar
# convention); labels stack each topic's top ten words vertically.
tick_labels = ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)]
plt.xticks(np.arange(n_topic) + 0.5, tick_labels)
plt.show()
# Same visualization for author 32: normalized topic distribution with
# top-word tick labels for each topic.
author_id = 32
fig = plt.figure(figsize=(12, 6))
author_weights = model.AT[author_id]
plt.bar(range(n_topic), author_weights / np.sum(author_weights))
plt.title(author_name[author_id])
# Ticks are shifted by 0.5 to sit under the bar centers (old-matplotlib bar
# convention); labels stack each topic's top ten words vertically.
tick_labels = ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)]
plt.xticks(np.arange(n_topic) + 0.5, tick_labels)
plt.show()