LDA's generative process
In [1]:
import numpy as np

ntopic = 10
nvoca = 20
alpha = np.ones(ntopic) * 0.1                  # symmetric Dirichlet prior over topics
beta = np.ones(nvoca) * 0.01                   # symmetric Dirichlet prior over words
theta = np.random.dirichlet(alpha)             # topic proportions for one document
phi = np.random.dirichlet(beta, size=ntopic)   # word distribution of each topic
In [2]:
_z = np.random.multinomial(1, theta)  # one-hot draw of a topic from the document's topic proportions
In [3]:
_z
Out[3]:
In [4]:
z = _z.argmax()
In [5]:
_w = np.random.multinomial(1, phi[z, :])  # one-hot draw of a word from topic z's word distribution
w = _w.argmax()
In [6]:
w
Out[6]:
In [7]:
# read a sample corpus from nltk.corpus.brown
# install the nltk package, import nltk, and run nltk.download() to get the corpora provided by NLTK
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
from scipy.special import gammaln

alpha = 0.1
beta = 0.01
ndoc = 500
ntopic = 10

# stopword list, extended with punctuation tokens that occur in the Brown corpus
st = set(stopwords.words())
st.update([u'.', u',', u"''", u'``', u':', u'--'])

# each Brown sentence is treated as a short document; lowercase words and drop stopwords
_docs = brown.sents()
docs = list()
for di in xrange(ndoc):
    doc = _docs[di]
    new_doc = list()
    for word in doc:
        if word.lower() not in st:
            new_doc.append(word.lower())
    docs.append(new_doc)
In [8]:
# construct the vocabulary
_voca = set()
for doc in docs:
    _voca = _voca.union(set(doc))
nvoca = len(_voca)
voca = dict()
for word in _voca:
    voca[word] = len(voca)
voca_list = np.array(list(_voca))
In [9]:
# set up the count statistics used by the collapsed Gibbs sampler
word_topic = np.zeros([nvoca, ntopic])            # word-topic counts
topic_sum = np.zeros(ntopic)                      # total number of words assigned to each topic
doc_sum = np.zeros([ndoc, ntopic])                # document-topic counts
assigned_topics = [list() for i in xrange(ndoc)]  # current topic assignment of every token
In [10]:
# initial sampling process
for di in xrange(ndoc):
    doc = docs[di]
    for word in doc:
        w_no = voca[word]
        prob = np.zeros(ntopic)
        for topic in xrange(ntopic):
            prob[topic] = (word_topic[w_no, topic] + beta) / (topic_sum[topic] + beta * nvoca) * (alpha + doc_sum[di, topic])
        prob /= np.sum(prob)
        # draw random sample
        new_topic = np.random.multinomial(1, prob).argmax()
        assigned_topics[di].append(new_topic)
        doc_sum[di, new_topic] += 1
        topic_sum[new_topic] += 1
        word_topic[w_no, new_topic] += 1
In [11]:
# iterate the collapsed Gibbs sampling process
niter = 100
for it in xrange(niter):
    for di in xrange(ndoc):
        doc = docs[di]
        for wi in xrange(len(doc)):
            word = doc[wi]
            w_no = voca[word]
            # remove the current assignment of this token from the counts
            prev_topic = assigned_topics[di][wi]
            doc_sum[di, prev_topic] -= 1
            topic_sum[prev_topic] -= 1
            word_topic[w_no, prev_topic] -= 1
            prob = np.zeros(ntopic)
            for topic in xrange(ntopic):
                prob[topic] = (word_topic[w_no, topic] + beta) / (topic_sum[topic] + beta * nvoca) * (alpha + doc_sum[di, topic])
            prob /= np.sum(prob)
            # draw a new topic and update the counts
            new_topic = np.random.multinomial(1, prob).argmax()
            assigned_topics[di][wi] = new_topic
            doc_sum[di, new_topic] += 1
            topic_sum[new_topic] += 1
            word_topic[w_no, new_topic] += 1
In [12]:
# print the highest-probability words for each topic
for topic in xrange(ntopic):
    print 'topic %d : %s' % (topic, ' '.join(voca_list[np.argsort(word_topic[:, topic])[::-1][0:10]]))
Once you have inferred topics from a corpus, you need to validate the performance of the model. Computing heldout likelihood is the easiest way to compare your model with others. Typically, it consists of four steps: (1) set aside test documents that were not used for training, (2) split each test document into two halves, (3) infer the document-topic proportions of each test document from its first half while keeping the learned topics fixed, and (4) compute the likelihood of the second half under the inferred proportions and the learned topics.
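As a rough reference for step (4), the per-word predictive probability used in the code below can be written as

$$p(w \mid d) \approx \sum_{k=1}^{K} \frac{\alpha + n_{dk}}{K\alpha + n_{d}} \cdot \frac{\beta + n_{kw}}{V\beta + n_{k}},$$

where $K$ is ntopic, $V$ is nvoca, $n_{dk}$ is the number of first-half tokens of test document $d$ assigned to topic $k$ (t_doc_sum), $n_{d}$ is the length of the first half, $n_{kw}$ is the training count of word $w$ under topic $k$ (word_topic), and $n_{k}$ is the total training count of topic $k$ (topic_sum). The heldout log likelihood then averages $\log p(w \mid d)$ over all second-half tokens.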
In [13]:
test_ndoc = 100
test_docs = list()
for di in xrange(test_ndoc):
    doc = _docs[ndoc + di]  # use documents that were not part of the training set
    new_doc = list()
    for word in doc:
        if word.lower() not in st:
            new_doc.append(word.lower())
    test_docs.append(new_doc)
In [14]:
test_first = list()
test_second = list()
for doc in test_docs:
    first_half = list()
    second_half = list()
    # split it your way!
    for wi in xrange(len(doc)):
        word = doc[wi]
        if word in voca:
            if wi % 2 == 0:
                first_half.append(doc[wi])
            else:
                second_half.append(doc[wi])
    test_first.append(first_half)
    test_second.append(second_half)
In [15]:
# sample topic assignments for the first half of each test document
# note that this time we do not update word_topic or topic_sum: the topics learned
# from the training corpus are kept fixed, and only the per-document counts of the
# test documents are sampled
t_doc_sum = np.zeros([test_ndoc, ntopic])
t_assigned_topics = [[0 for j in xrange(len(test_first[i]))] for i in xrange(test_ndoc)]
niter = 100
for it in xrange(niter):
    for di in xrange(test_ndoc):
        doc = test_first[di]
        for wi in xrange(len(doc)):
            word = doc[wi]
            w_no = voca[word]
            prev_topic = t_assigned_topics[di][wi]
            if it != 0:
                # remove the previous assignment (skipped on the first pass, when t_doc_sum is still empty)
                t_doc_sum[di, prev_topic] -= 1
            prob = np.zeros(ntopic)
            for topic in xrange(ntopic):
                prob[topic] = (word_topic[w_no, topic] + beta) / (topic_sum[topic] + beta * nvoca) * (alpha + t_doc_sum[di, topic])
            prob /= np.sum(prob)
            # draw a new topic and update only the test-document counts
            new_topic = np.random.multinomial(1, prob).argmax()
            t_assigned_topics[di][wi] = new_topic
            t_doc_sum[di, new_topic] += 1
In [16]:
heldout_ll = 0.
cnt = 0.
for di in xrange(test_ndoc):
    doc = test_second[di]
    for wi in xrange(len(doc)):
        word = doc[wi]
        w_no = voca[word]
        cnt += 1
        # see "A Collapsed Variational Bayesian Inference Algorithm for
        # Latent Dirichlet Allocation" by Teh et al. (2006)
        # for how to compute the heldout likelihood
        ll = 0
        for topic in xrange(ntopic):
            # use the topic proportions inferred from the first half (t_doc_sum)
            ll += ((alpha + t_doc_sum[di, topic]) / (alpha * ntopic + len(test_first[di]))) \
                * ((word_topic[w_no, topic] + beta) / (topic_sum[topic] + beta * nvoca))
        heldout_ll += np.log(ll)
print cnt
print heldout_ll / cnt
Usually, with a general corpus, the per-word heldout log likelihood ranges between -5 and -8.
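Equivalently, a per-word heldout log likelihood $\ell$ can be reported as a perplexity of $\exp(-\ell)$, so this range corresponds roughly to perplexities between $e^{5} \approx 150$ and $e^{8} \approx 3000$.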
Typically, an MCMC method written in naive Python is more than 100 times slower than the same implementation in a compiled language such as C or Java. In this section, we look at the performance of the PyMC package as an alternative approach.
I tested LDA with PyMC below, but it was extremely slow; it did not even work with 500 documents.
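Before switching to PyMC, note that most of the cost of the pure-Python sampler above sits in the inner loop over topics. As a minimal, untested sketch (reusing the word_topic, topic_sum, doc_sum, alpha, beta, and nvoca variables defined earlier), that loop could be replaced with vectorized NumPy operations:
In [ ]:
# sketch only: a vectorized topic draw for one token, not part of the original sampler
def sample_topic(w_no, di):
    # compute the unnormalized conditional distribution over all topics at once
    prob = (word_topic[w_no, :] + beta) / (topic_sum + beta * nvoca) * (alpha + doc_sum[di, :])
    prob /= prob.sum()
    return np.random.multinomial(1, prob).argmax()
Dropping the explicit topic loop this way usually gives a noticeable speedup, although the per-token Python loop itself remains.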
In [17]:
# convert each word list to fixed-length vectors
word_ids = list()  # binary word-appearance vectors
word_cnt = list()  # word-count vectors
for doc in docs:
    ids = np.zeros(nvoca, dtype=int)
    cnt = np.zeros(nvoca, dtype=int)
    for word in doc:
        ids[voca[word]] = 1
        cnt[voca[word]] += 1
    word_ids.append(ids)
    word_cnt.append(cnt)
In [18]:
# implementation of LDA with pymc
# original source code from http://stats.stackexchange.com/questions/104771/latent-dirichlet-allocation-in-pymc
import numpy as np
import pymc as pm

K = ntopic  # number of topics
V = nvoca   # number of words
D = 10      # number of documents (only a small subset, since this model is very slow)
data = np.array(word_cnt)
# caution: data holds per-document word-count vectors, so Wd[d] below equals the
# vocabulary size rather than the document length, and data[d][i] is a word count
# rather than a word id, unlike the token matrix used in the original StackExchange code
alpha = np.ones(K)
beta = np.ones(V)
theta = pm.Container([pm.CompletedDirichlet("theta_%s" % i, pm.Dirichlet("ptheta_%s" % i, theta=alpha)) for i in range(D)])
phi = pm.Container([pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k, theta=beta)) for k in range(K)])
Wd = [len(doc) for doc in data]

z = pm.Container([pm.Categorical('z_%i' % d,
                                 p=theta[d],
                                 size=Wd[d],
                                 value=np.random.randint(K, size=Wd[d]))
                  for d in range(D)])
# cannot use p=phi[z[d][i]] here since phi is an ordinary list while z[d][i] is stochastic
w = pm.Container([pm.Categorical("w_%i_%i" % (d, i),
                                 p=pm.Lambda('phi_z_%i_%i' % (d, i),
                                             lambda z=z[d][i], phi=phi: phi[z]),
                                 value=data[d][i],
                                 observed=True)
                  for d in range(D) for i in range(Wd[d])])
model = pm.Model([theta, phi, z, w])
mcmc = pm.MCMC(model)
mcmc.sample(100)