LDA's generative processs


In [1]:
ntopic = 10
nvoca = 20
alpha = np.ones(ntopic)*0.1
beta = np.ones(nvoca)*0.01
theta = np.random.dirichlet(alpha)
phi = np.random.dirichlet(beta, size=ntopic)

In [2]:
_z = np.random.multinomial(1, theta)

In [3]:
_z


Out[3]:
array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [4]:
z = _z.argmax()

In [5]:
_w = np.random.multinomial(1, phi[z,:])
w = _w.argmax()

In [6]:
w


Out[6]:
5

Collapsed Gibbs Sampling


In [7]:
# read sample corpus from nltk.corpus.brown corpus
# install nltk package, import nltk, and run nltk.download() to get corpora provided by nltk
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
from scipy.special import gammaln

alpha = 0.1
beta = 0.01

ndoc = 500
ntopics = 10

st = set(stopwords.words())
st.add(u'.')
st.add(u',')
st.add(u'\'\'')
st.add(u'``')
st.add(u':')
st.add(u'--')

_docs = brown.sents()
docs = list()
for di in xrange(ndoc):
    doc = _docs[di]
    new_doc = list()
    for word in doc:
        if word.lower() not in st:
            new_doc.append(word.lower())
    docs.append(new_doc)

In [8]:
# construct vocabulary
_voca = set()
for doc in docs:
    _voca = _voca.union(set(doc))
    
nvoca = len(_voca)
voca = dict()

for word in _voca:
    voca[word] = len(voca)
voca_list = np.array(list(_voca))

In [9]:
# set sampling variables

word_topic = np.zeros([nvoca,ntopic])
topic_sum = np.zeros(ntopic)
doc_sum = np.zeros([ndoc, ntopic])

assigned_topics = [list() for i in xrange(ndoc)]

In [10]:
#initial sampling process

for di in xrange(ndoc):
    doc = docs[di]
    for word in doc:
        w_no = voca[word]
        prob = np.zeros(ntopic)
        for topic in xrange(ntopic):
            prob[topic] = (word_topic[w_no,topic] + beta)/(topic_sum[topic] + beta*nvoca)*(alpha + doc_sum[di,topic])
        prob /= np.sum(prob)
        
        # draw random sample
        new_topic = np.random.multinomial(1, prob).argmax()
        
        assigned_topics[di].append(new_topic)
        doc_sum[di,new_topic] += 1
        topic_sum[new_topic] += 1
        word_topic[w_no,new_topic] += 1

In [11]:
# iterate sampling process

niter = 100
for it in xrange(niter):
    for di in xrange(ndoc):
        doc = docs[di]
        for wi in xrange(len(doc)):
            word = doc[wi]
            w_no = voca[word]
            prev_topic = assigned_topics[di][wi]

            doc_sum[di,prev_topic] -= 1
            topic_sum[prev_topic] -= 1
            word_topic[w_no,prev_topic] -= 1

            prob = np.zeros(ntopic)
            for topic in xrange(ntopic):
                prob[topic] = (word_topic[w_no,topic] + beta)/(topic_sum[topic] + beta*nvoca)*(alpha + doc_sum[di,topic])
            prob /= np.sum(prob)
            
            # draw random sample
            new_topic = np.random.multinomial(1, prob).argmax()

            assigned_topics[di][wi] = new_topic
            doc_sum[di,new_topic] += 1
            topic_sum[new_topic] += 1
            word_topic[w_no,new_topic] += 1

In [12]:
# print top probability words for each topic
for topic in xrange(ntopic):
    print 'topic %d : %s' % (topic, ' '.join(voca_list[np.argsort(word_topic[:,topic])[::-1][0:10]]))


topic 0 : would year president pay medical ) ( care schools texas
topic 1 : said jury federal county program fulton election funds health president
topic 2 : mr. charter last campaign council night town election yesterday group
topic 3 : administration laos cases persons policy another ward policies karns involved
topic 4 : million state dallas texas 1 new dollars department bonds hospital
topic 5 : said committee bill senate party public house republican passed state
topic 6 : made court general said governor two highway work expected laws
topic 7 : states united state since would nato taken secretary get property
topic 8 : said resolution house home days vote day sunday adc board
topic 9 : city said would mr. hawksley college one dr. aid back

Computing Heldout Likelihood

Once you infer topics from corpus, you need to validate the performance of our model. Computing heldout likelihood is the easiest way to compare your model with others. Typically, it consists of four steps:

  1. Prepare test set documents
  2. Split each test document into two parts
  3. Infer topic proportion of test documents from the first half by using sampling
  4. Compute heldout likelihood on the second half

Step 1

Prepare test set


In [13]:
test_ndoc = 100
test_docs = list()

for di in xrange(test_ndoc):
    doc = _docs[di]
    new_doc = list()
    for word in doc:
        if word.lower() not in st:
            new_doc.append(word.lower())
    test_docs.append(new_doc)

Step 2

Split each test document into two parts


In [14]:
test_first = list()
test_second = list()
for doc in test_docs:
    first_half = list()
    second_half = list()
    #split it your way!
    for wi in xrange(len(doc)):
        word = doc[wi]
        if voca.has_key(word):
            if wi%2 == 0:
                first_half.append(doc[wi])
            else:
                second_half.append(doc[wi])
    test_first.append(first_half)
    test_second.append(second_half)

Step 3

Infer topic proportion of test documents from the first half by using sampling


In [15]:
# iterate sampling process
# note that this time we do not assign new words to word_topic matrix

#topic_sum = np.zeros(ntopic) # Jooyeon : topic_sum is identical for training and testing
t_doc_sum = np.zeros([test_ndoc, ntopic])

t_assigned_topics = [[0 for j in xrange(len(test_first[i]))]for i in xrange(test_ndoc)]

# Jooyeon : t_doc_sum and t_assigned_topics need to be initialized.

niter = 100
for it in xrange(niter):
    for di in xrange(test_ndoc):
        doc = test_first[di]
        for wi in xrange(len(doc)):
            word = doc[wi]
            w_no = voca[word]
            #prev_topic = assigned_topics[di][wi]
            prev_topic = t_assigned_topics[di][wi]  # Jooyeon : changed to t_assigned_topics

            if it != 0:
                t_doc_sum[di,prev_topic] -= 1
            #topic_sum[prev_topic] -= 1
            #word_topic[w_no,prev_topic] -= 1

            prob = np.zeros(ntopic)
            for topic in xrange(ntopic):
                prob[topic] = (word_topic[w_no,topic] + beta)/(topic_sum[topic] + beta*nvoca)*(alpha + t_doc_sum[di,topic])
            prob /= np.sum(prob)

            # draw random sample
            new_topic = np.random.multinomial(1, prob).argmax()

            #assigned_topics[di][wi] = new_topic
            t_assigned_topics[di][wi] = new_topic # Jooyeon : changed to t_assigned_topics
            t_doc_sum[di,new_topic] += 1
            #topic_sum[new_topic] += 1
            #word_topic[w_no,new_topic] += 1

Step 4

Compute per-word heldout log likelihood on the second half


In [16]:
heldout_ll = 0.
cnt = 0.

for di in xrange(test_ndoc):
    doc = test_second[di]
    for wi in xrange(len(doc)):
        word = doc[wi]
        w_no = voca[word]
        cnt += 1

        # see A Collapsed Variational Bayesian Inference
        # Algorithm for Latent Dirichlet Allocation by Teh & et al (2006)
        # for how to compute the heldout likelihood
        ll = 0
        for topic in xrange(ntopic):
            ll += ((alpha + doc_sum[di,topic])/(alpha * ntopic + len(test_first[di]))) \
                * ((word_topic[w_no,topic] + beta)/(topic_sum[topic] + beta*nvoca))
        
        heldout_ll += np.log(ll)
            
print cnt
print heldout_ll/cnt


571.0
-5.65557797348

Usually, with a general corpus, per-word heldout log likelihood ranges between -5 to -8.

Gibbs Sampling with PyMC

Typically, MCMC method with naive python is more than 100 times slower than the same implementation written in compiled language languages such as c, and java. In this section, we will see the performance of pymc package as an alternative approach.

I tested LDA with PyMC below, but it was extremely slow. Even it didn't work with 500 documents.


In [17]:
# convert word list to vector
word_ids = list()   # word appearance
word_cnt = list()   # word count
for doc in docs:
    ids = np.zeros(nvoca, dtype=int)
    cnt = np.zeros(nvoca, dtype=int)
    for word in doc:
        ids[voca[word]] = 1
        cnt[voca[word]] += 1
    word_ids.append(ids)
    word_cnt.append(cnt)

In [18]:
# implementation of LDA with pymc
# original source code from http://stats.stackexchange.com/questions/104771/latent-dirichlet-allocation-in-pymc

import numpy as np
import pymc as pm

K = ntopic # number of topics
V = nvoca # number of words
D = 10 # number of documents

data = np.array(word_cnt)

alpha = np.ones(K)
beta = np.ones(V)

theta = pm.Container([pm.CompletedDirichlet("theta_%s" % i, pm.Dirichlet("ptheta_%s" % i, theta=alpha)) for i in range(D)])
phi = pm.Container([pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k, theta=beta)) for k in range(K)])
Wd = [len(doc) for doc in data]

z = pm.Container([pm.Categorical('z_%i' % d, 
                     p = theta[d], 
                     size=Wd[d],
                     value=np.random.randint(K, size=Wd[d]))
                  for d in range(D)])

# cannot use p=phi[z[d][i]] here since phi is an ordinary list while z[d][i] is stochastic
w = pm.Container([pm.Categorical("w_%i_%i" % (d,i),
                    p = pm.Lambda('phi_z_%i_%i' % (d,i), 
                              lambda z=z[d][i], phi=phi: phi[z]),
                    value=data[d][i], 
                    observed=True)
                  for d in range(D) for i in range(Wd[d])])

model = pm.Model([theta, phi, z, w])
mcmc = pm.MCMC(model)
mcmc.sample(100)


Couldn't import dot_parser, loading of dot files will not be possible.
 [-----------------101%-----------------] 101 of 100 complete in 63.8 sec

In [18]: