"The Author-Topic Model for Authors and Documents" by Rosen-Zvi, et al. (UAI 2004)
In [1]:
import pickle
import logging
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from ptm import AuthorTopicModel
from ptm.utils import convert_cnt_to_list, get_top_words
# Grab the logger named 'AuthorTopicModel' and stop its records from being
# handed on to the handlers of ancestor loggers (e.g. the root logger).
logger = logging.getLogger('AuthorTopicModel')
logger.propagate = False
%matplotlib inline
The original dataset is available from: https://people.cs.umass.edu/~mccallum/data.html
In [2]:
def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes, even if unpickling raises.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on
    trusted, locally generated files.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Pre-processed Cora files; each variable keeps the name the later cells use.
doc_ids = _load_pickle('../data/cora/doc_ids.pkl')
doc_cnt = _load_pickle('../data/cora/doc_cnt.pkl')
doc_author = _load_pickle('../data/cora/doc_authorid.pkl')
author_name = _load_pickle('../data/cora/authorid_authorname.pkl')
voca = _load_pickle('../data/cora/voca.pkl')
In [3]:
# Tunable settings for this run.
n_topic = 10
max_iter = 50

# Combine the loaded ids and counts into the corpus passed to the model
# (presumably one token list per document -- confirm against ptm.utils).
corpus = convert_cnt_to_list(doc_ids, doc_cnt)

# Sizes derived from the loaded data.
n_doc, n_author, n_voca = len(corpus), len(author_name), len(voca)
In [4]:
# Create the model from the corpus dimensions, then fit it for `max_iter`
# iterations on the corpus and the per-document author ids.
# NOTE(review): no random seed is set in this notebook; if fitting is
# stochastic, results will differ between runs -- confirm.
model = AuthorTopicModel(
    n_doc,
    n_voca,
    n_topic,
    n_author,
)
model.fit(
    corpus,
    doc_author,
    max_iter=max_iter,
)
In [5]:
# Print one line per topic: the topic index followed by the comma-joined
# result of get_top_words(..., 10) for that topic.
for topic_idx in range(n_topic):
    words_joined = ','.join(get_top_words(model.TW, voca, topic_idx, 10))
    print('topic ', topic_idx, words_joined)
In [6]:
# Bar chart of the normalised topic weights for a single author.
author_id = 7

# Divide the author's row of model.AT by its sum so the bar heights sum to 1.
author_topic = model.AT[author_id]
topic_share = author_topic / np.sum(author_topic)

# One multi-line tick label per topic, built from get_top_words(..., 10).
topic_positions = np.arange(n_topic)
topic_labels = ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)]

fig, ax = plt.subplots(figsize=(12, 6))
# Centre the bars on their x positions explicitly and place the ticks at the
# same positions, so every label sits directly under its own bar regardless
# of the installed matplotlib version's default bar alignment.
ax.bar(topic_positions, topic_share, align='center')
ax.set_xticks(topic_positions)
ax.set_xticklabels(topic_labels)
ax.set_title(author_name[author_id])
plt.show()
In [7]:
# Bar chart of the normalised topic weights for a single author.
author_id = 32

# Divide the author's row of model.AT by its sum so the bar heights sum to 1.
author_topic = model.AT[author_id]
topic_share = author_topic / np.sum(author_topic)

# One multi-line tick label per topic, built from get_top_words(..., 10).
topic_positions = np.arange(n_topic)
topic_labels = ['\n'.join(get_top_words(model.TW, voca, k, 10)) for k in range(n_topic)]

fig, ax = plt.subplots(figsize=(12, 6))
# Centre the bars on their x positions explicitly and place the ticks at the
# same positions, so every label sits directly under its own bar regardless
# of the installed matplotlib version's default bar alignment.
ax.bar(topic_positions, topic_share, align='center')
ax.set_xticks(topic_positions)
ax.set_xticklabels(topic_labels)
ax.set_title(author_name[author_id])
plt.show()
In [ ]: