Example of AuthorTopicModel

"The Author-Topic Model for Authors and Documents" by Rosen-Zvi, et al. (UAI 2004)


In [1]:
import pickle
import logging

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from ptm import AuthorTopicModel
from ptm.utils import convert_cnt_to_list, get_top_words

logger = logging.getLogger('AuthorTopicModel')
logger.propagate = False  # the model attaches its own handler; avoid duplicate records via the root logger

%matplotlib inline

Load the CORA dataset


In [2]:
with open('../data/cora/doc_ids.pkl', 'rb') as f:
    doc_ids = pickle.load(f)
with open('../data/cora/doc_cnt.pkl', 'rb') as f:
    doc_cnt = pickle.load(f)
with open('../data/cora/doc_authorid.pkl', 'rb') as f:
    doc_author = pickle.load(f)
with open('../data/cora/authorid_authorname.pkl', 'rb') as f:
    author_name = pickle.load(f)
with open('../data/cora/voca.pkl', 'rb') as f:
    voca = pickle.load(f)
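
A quick sanity check of the loaded objects (a sketch; the shapes are assumptions, inferred from how convert_cnt_to_list and the model consume them below):


In [ ]:
# doc_ids and doc_cnt are assumed to hold, per document, the distinct word ids
# and their counts; doc_author maps each document to its list of author ids.
print(len(doc_ids), len(doc_cnt), len(doc_author))
print(doc_author[0])  # author ids of the first document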

In [3]:
corpus = convert_cnt_to_list(doc_ids, doc_cnt)  # expand (word id, count) pairs into per-document word lists
n_doc = len(corpus)
n_topic = 10    # number of topics to learn
n_author = len(author_name)
n_voca = len(voca)
max_iter = 50   # training iterations

Fit the author-topic model


In [4]:
model = AuthorTopicModel(n_doc, n_voca, n_topic, n_author)
model.fit(corpus, doc_author, max_iter=max_iter)


2016-02-14 22:04:27 INFO:AuthorTopicModel:[INIT] 0	elapsed_time:63.54	log_likelihood:-10863554.38
2016-02-14 22:05:30 INFO:AuthorTopicModel:[INIT] 1	elapsed_time:63.58	log_likelihood:-10647481.99
2016-02-14 22:06:34 INFO:AuthorTopicModel:[INIT] 2	elapsed_time:63.74	log_likelihood:-10492422.12
2016-02-14 22:07:38 INFO:AuthorTopicModel:[INIT] 3	elapsed_time:63.77	log_likelihood:-10357087.07
2016-02-14 22:08:40 INFO:AuthorTopicModel:[INIT] 4	elapsed_time:62.19	log_likelihood:-10229123.70
2016-02-14 22:09:35 INFO:AuthorTopicModel:[INIT] 5	elapsed_time:54.96	log_likelihood:-10096179.15
2016-02-14 22:10:30 INFO:AuthorTopicModel:[INIT] 6	elapsed_time:54.89	log_likelihood:-9943646.09
2016-02-14 22:11:25 INFO:AuthorTopicModel:[INIT] 7	elapsed_time:54.84	log_likelihood:-9769853.39
2016-02-14 22:12:22 INFO:AuthorTopicModel:[INIT] 8	elapsed_time:57.85	log_likelihood:-9598314.53
2016-02-14 22:13:23 INFO:AuthorTopicModel:[INIT] 9	elapsed_time:60.43	log_likelihood:-9453899.31
2016-02-14 22:14:22 INFO:AuthorTopicModel:[INIT] 10	elapsed_time:59.53	log_likelihood:-9338106.69
2016-02-14 22:15:21 INFO:AuthorTopicModel:[INIT] 11	elapsed_time:58.89	log_likelihood:-9244523.47
2016-02-14 22:16:21 INFO:AuthorTopicModel:[INIT] 12	elapsed_time:59.88	log_likelihood:-9173893.80
2016-02-14 22:17:23 INFO:AuthorTopicModel:[INIT] 13	elapsed_time:61.79	log_likelihood:-9116831.15
2016-02-14 22:18:24 INFO:AuthorTopicModel:[INIT] 14	elapsed_time:61.34	log_likelihood:-9068511.26
2016-02-14 22:19:24 INFO:AuthorTopicModel:[INIT] 15	elapsed_time:59.51	log_likelihood:-9030260.41
2016-02-14 22:20:26 INFO:AuthorTopicModel:[INIT] 16	elapsed_time:61.86	log_likelihood:-8996108.24
2016-02-14 22:21:27 INFO:AuthorTopicModel:[INIT] 17	elapsed_time:61.55	log_likelihood:-8964674.92
2016-02-14 22:22:31 INFO:AuthorTopicModel:[INIT] 18	elapsed_time:63.73	log_likelihood:-8941120.13
2016-02-14 22:23:34 INFO:AuthorTopicModel:[INIT] 19	elapsed_time:63.34	log_likelihood:-8921381.73
2016-02-14 22:24:36 INFO:AuthorTopicModel:[INIT] 20	elapsed_time:61.47	log_likelihood:-8903072.00
2016-02-14 22:25:37 INFO:AuthorTopicModel:[INIT] 21	elapsed_time:60.91	log_likelihood:-8886887.71
2016-02-14 22:26:39 INFO:AuthorTopicModel:[INIT] 22	elapsed_time:62.48	log_likelihood:-8872823.62
2016-02-14 22:27:43 INFO:AuthorTopicModel:[INIT] 23	elapsed_time:63.59	log_likelihood:-8856336.04
2016-02-14 22:28:47 INFO:AuthorTopicModel:[INIT] 24	elapsed_time:63.87	log_likelihood:-8845108.89
2016-02-14 22:29:49 INFO:AuthorTopicModel:[INIT] 25	elapsed_time:62.69	log_likelihood:-8834276.61
2016-02-14 22:30:49 INFO:AuthorTopicModel:[INIT] 26	elapsed_time:59.47	log_likelihood:-8823068.52
2016-02-14 22:31:50 INFO:AuthorTopicModel:[INIT] 27	elapsed_time:61.48	log_likelihood:-8814344.53
2016-02-14 22:32:50 INFO:AuthorTopicModel:[INIT] 28	elapsed_time:59.72	log_likelihood:-8806725.65
2016-02-14 22:33:49 INFO:AuthorTopicModel:[INIT] 29	elapsed_time:58.68	log_likelihood:-8799515.99
2016-02-14 22:34:50 INFO:AuthorTopicModel:[INIT] 30	elapsed_time:61.03	log_likelihood:-8792988.33
2016-02-14 22:35:50 INFO:AuthorTopicModel:[INIT] 31	elapsed_time:59.95	log_likelihood:-8787366.00
2016-02-14 22:36:52 INFO:AuthorTopicModel:[INIT] 32	elapsed_time:62.10	log_likelihood:-8780941.95
2016-02-14 22:37:55 INFO:AuthorTopicModel:[INIT] 33	elapsed_time:62.92	log_likelihood:-8776050.13
2016-02-14 22:38:57 INFO:AuthorTopicModel:[INIT] 34	elapsed_time:62.04	log_likelihood:-8771034.29
2016-02-14 22:39:56 INFO:AuthorTopicModel:[INIT] 35	elapsed_time:59.79	log_likelihood:-8763705.60
2016-02-14 22:40:57 INFO:AuthorTopicModel:[INIT] 36	elapsed_time:60.35	log_likelihood:-8759335.53
2016-02-14 22:41:56 INFO:AuthorTopicModel:[INIT] 37	elapsed_time:58.78	log_likelihood:-8755129.16
2016-02-14 22:42:54 INFO:AuthorTopicModel:[INIT] 38	elapsed_time:58.65	log_likelihood:-8754418.15
2016-02-14 22:43:51 INFO:AuthorTopicModel:[INIT] 39	elapsed_time:56.47	log_likelihood:-8747837.15
2016-02-14 22:44:49 INFO:AuthorTopicModel:[INIT] 40	elapsed_time:58.43	log_likelihood:-8743544.53
2016-02-14 22:45:50 INFO:AuthorTopicModel:[INIT] 41	elapsed_time:60.66	log_likelihood:-8738763.21
2016-02-14 22:46:51 INFO:AuthorTopicModel:[INIT] 42	elapsed_time:61.46	log_likelihood:-8733850.16
2016-02-14 22:47:54 INFO:AuthorTopicModel:[INIT] 43	elapsed_time:63.01	log_likelihood:-8733093.74
2016-02-14 22:48:51 INFO:AuthorTopicModel:[INIT] 44	elapsed_time:56.68	log_likelihood:-8732169.09
2016-02-14 22:49:49 INFO:AuthorTopicModel:[INIT] 45	elapsed_time:58.09	log_likelihood:-8728986.80
2016-02-14 22:50:49 INFO:AuthorTopicModel:[INIT] 46	elapsed_time:60.09	log_likelihood:-8727756.31
2016-02-14 22:51:51 INFO:AuthorTopicModel:[INIT] 47	elapsed_time:61.90	log_likelihood:-8726765.65
2016-02-14 22:52:53 INFO:AuthorTopicModel:[INIT] 48	elapsed_time:61.55	log_likelihood:-8720959.99
2016-02-14 22:53:54 INFO:AuthorTopicModel:[INIT] 49	elapsed_time:60.93	log_likelihood:-8718195.57
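
The log-likelihood improves monotonically and is still creeping upward at iteration 49, so more iterations could help. To plot the trace rather than read it off the log, a standard logging handler can be attached before calling fit (a minimal sketch; it assumes only the record format shown above):


In [ ]:
import re

class LikelihoodTrace(logging.Handler):
    """Collect the log_likelihood value logged at each iteration."""
    def __init__(self):
        super().__init__()
        self.values = []

    def emit(self, record):
        m = re.search(r'log_likelihood:(-?[\d.]+)', record.getMessage())
        if m:
            self.values.append(float(m.group(1)))

trace = LikelihoodTrace()
logger.addHandler(trace)
# refit with the handler attached, then:
# plt.plot(trace.values)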

In [5]:
# Print the ten most probable words for each topic
for k in range(n_topic):
    top_words = get_top_words(model.TW, voca, k, 10)
    print('topic ', k, ','.join(top_words))


topic  0 algorithm,problem,time,model,function,bound,show,result,optimal,complexity
topic  1 network,service,realtime,control,performance,application,paper,routing,traffic,packet
topic  2 data,query,database,information,algorithm,rule,view,technique,document,structure
topic  3 system,distributed,protocol,communication,application,message,file,paper,performance,network
topic  4 learning,network,system,method,approach,task,paper,problem,model,neural
topic  5 image,model,object,using,surface,motion,robot,algorithm,method,visual
topic  6 parallel,program,performance,memory,data,processor,analysis,application,compiler,machine
topic  7 system,design,software,language,application,paper,research,tool,object,support
topic  8 agent,system,model,language,planning,logic,constraint,plan,action,paper
topic  9 problem,method,algorithm,linear,function,result,paper,solution,technique,matrix
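
The converse view is also useful: which authors load most heavily on a given topic. A minimal sketch, assuming model.AT is an (n_author x n_topic) array of author-topic counts, consistent with how it is indexed in the plotting cells below:


In [ ]:
topic_id = 0
theta = model.AT / model.AT.sum(axis=1, keepdims=True)  # row-normalize to distributions
for a in np.argsort(theta[:, topic_id])[::-1][:10]:     # ten highest-loading authors
    print(author_name[a], theta[a, topic_id])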

Plot topic distributions of selected authors


In [6]:
author_id = 7
fig = plt.figure(figsize=(12, 6))
# Normalize the author's topic counts into a probability distribution
plt.bar(range(n_topic), model.AT[author_id] / np.sum(model.AT[author_id]))
plt.title(author_name[author_id])
# Label each bar with the ten most probable words of its topic
plt.xticks(np.arange(n_topic) + 0.5,
           ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)])
plt.show()



In [7]:
author_id = 32
fig = plt.figure(figsize=(12, 6))
plt.bar(range(n_topic), model.AT[author_id] / np.sum(model.AT[author_id]))
plt.title(author_name[author_id])
plt.xticks(np.arange(n_topic) + 0.5,
           ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)])
plt.show()
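
The two plotting cells differ only in the author id; a small helper (a pure refactor of the code above, no new model API) makes it easy to inspect other authors:


In [ ]:
def plot_author_topics(author_id):
    """Bar chart of one author's topic proportions, labeled with top topic words."""
    plt.figure(figsize=(12, 6))
    plt.bar(range(n_topic), model.AT[author_id] / np.sum(model.AT[author_id]))
    plt.title(author_name[author_id])
    plt.xticks(np.arange(n_topic) + 0.5,
               ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)])
    plt.show()

plot_author_topics(7)
plot_author_topics(32)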


