In this notebook I implement and analyze the Latent Dirichlet Allocation (LDA) algorithm using the PyMC package. The implementation is a replication of the code provided at: http://stats.stackexchange.com/questions/104771/latent-dirichlet-allocation-in-pymc

The LDA model is described below:
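
The plate-diagram figure does not appear to be included in this export, so here is the generative process that the code below encodes, in the usual notation (the code uses symmetric priors, alpha = beta = 1):

    phi_k    ~ Dirichlet(beta),    k = 1, ..., K     (word distribution of topic k)
    theta_d  ~ Dirichlet(alpha),   d = 1, ..., D     (topic distribution of document d)
    z_{d,i}  ~ Categorical(theta_d)                  (topic assignment of word slot i in document d)
    w_{d,i}  ~ Categorical(phi_{z_{d,i}})            (observed word id)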


In [1]:
import numpy as np
import pymc as pm
#K, V, D = 2, 4, 3 # number of topics, words, documents
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster")
%matplotlib inline

In [31]:
K, V, D = 5, 10, 20 # number of topics, vocabulary words, documents

#data = np.array([[1, 1, 1, 1], [1, 1, 1, 1], [0, 0, 0, 0]])

data_temp = np.random.randint(0, 10, size=(D, V))  # synthetic document-word matrix for a first test

In [42]:
class LDA(object):
    
    def __init__(self, data, topics=K, vocab=V):
        """
        Takes the document-word data matrix and builds the PyMC model.
        """
        self.data = data
        self.topics = topics
        self.vocab = vocab + 1  # pad the vocabulary by one id, as in the original StackExchange code
        self.docs = len(self.data)
        self.alpha = np.ones(self.topics)  # symmetric Dirichlet prior over topics per document
        self.beta = np.ones(self.vocab)    # symmetric Dirichlet prior over words per topic

        # theta_d: per-document topic distribution. pm.Dirichlet keeps only the first
        # K-1 proportions, so CompletedDirichlet appends the last one to give a full
        # probability vector (see the sketch after the class definition).
        self.theta = pm.Container([pm.CompletedDirichlet("theta_%s" % i,
                                                         pm.Dirichlet("ptheta_%s" % i, theta=self.alpha))
                                   for i in range(self.docs)])
        # phi_k: per-topic word distribution.
        self.phi = pm.Container([pm.CompletedDirichlet("phi_%s" % i,
                                                       pm.Dirichlet("pphi_%s" % i, theta=self.beta))
                                 for i in range(self.topics)])
        self.Wd = [len(doc) for doc in self.data]  # number of word slots per document

        # z_{d,i}: topic assignment for every word slot, drawn from theta_d.
        self.Z = pm.Container([pm.Categorical("z_%s" % d,
                                              p=self.theta[d],
                                              size=self.Wd[d],
                                              value=np.random.randint(self.topics, size=self.Wd[d]))
                               for d in range(self.docs)])
        # w_{d,i}: observed words. Each entry of data[d] is treated as the word id at
        # position i, and its distribution is phi indexed by the assigned topic.
        self.W = pm.Container([pm.Categorical("w_%s,%s" % (d, i),
                                              p=pm.Lambda("phi_z_%s_%s" % (d, i),
                                                          lambda z=self.Z[d][i], phi=self.phi: phi[z]),
                                              value=self.data[d][i],
                                              observed=True)
                               for d in range(self.docs) for i in range(self.Wd[d])])

        self.model = pm.Model([self.theta, self.phi, self.Z, self.W])
        self.mcmc = pm.MCMC(self.model)
    
    def fit(self, iterations=1000, burn_in=10):
        # Fit the model by drawing `iterations` MCMC samples with a burn-in of `burn_in`.
        self.mcmc.sample(iterations, burn=burn_in)

    def show_topics(self):
        # Current per-topic word distributions (phi).
        return self.phi.value

    def show_words(self):
        # Current values of the observed word nodes.
        return self.W.value
    
    def KLDiv(self, p, q):
        # Kullback-Leibler divergence KL(p || q) using base-10 logs; note it is asymmetric.
        return np.sum(p * np.log10(p / q))

    def cosine_sim(self, x, y):
        # Cosine similarity between two count vectors.
        return np.dot(x, y) / np.sqrt(np.dot(x, x) * np.dot(y, y))

    def sorted_docs_sim(self):
        # For every ordered pair of documents, compute the KL divergence between their
        # topic distributions and the cosine similarity of their raw word counts,
        # then sort by cosine similarity (descending).
        kldivs_docs = [(i, j, self.KLDiv(self.theta[i].value, self.theta[j].value),
                        self.cosine_sim(self.data[i], self.data[j]))
                       for i in range(len(self.theta)) for j in range(len(self.theta))
                       if i != j]
        return sorted(kldivs_docs, key=lambda x: x[3], reverse=True)
    
    def show_topic_words(self, idwords, n=10):
        # Print the n most probable vocabulary words per topic (the filter guards against the padded word id).
        for i, t in enumerate(self.phi.value):
            print "Topic %i : " % i, ", ".join(idwords[w_] for w_ in np.argsort(t[0])[-n:] if w_ < self.vocab - 2)
    
    def plot_data(self):
        # Heatmap of the document-word count matrix.
        plt.clf()
        plt.matshow(self.data, fignum=1000, cmap=plt.cm.Reds)
        plt.gca().set_aspect('auto')
        plt.xlabel("Words")
        plt.ylabel("Documents")
    
    def plot_words_per_topic(self, ax=None):
        # Bar chart of how many word slots are currently assigned to each topic.
        if ax is None:
            plt.clf()
            fig, ax = plt.subplots(1, 1)
        words = self.Z.value
        topic_dist = dict()
        for k_i in words:
            for k in k_i:
                if k not in topic_dist:
                    topic_dist[k] = 0
                topic_dist[k] += 1
        ax.bar(topic_dist.keys(), topic_dist.values())
        ax.set_xlabel("Topics")
        ax.set_ylabel("Counts")
        ax.set_title("Document words per topics")
        plt.show()
        
    def plot_word_dist(self, ax=None):
        # One bar chart per topic showing its word-probability vector.
        topics = self.phi.value
        if ax is None:
            plt.clf()
            fig, ax = plt.subplots((len(topics)+1)/2, 2, figsize=(10,10))
        for i, t in enumerate(topics):
            ax[i/2][i%2].bar(range(len(t[0])), t[0])
            ax[i/2][i%2].set_title("Topic %s" % i)
        plt.suptitle("Vocab word proportions per topic")
        plt.gcf().subplots_adjust(hspace=0.5, wspace=0.5)

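A quick aside on why every Dirichlet above is wrapped in CompletedDirichlet: in PyMC 2 a Dirichlet node only stores the first K-1 proportions, and CompletedDirichlet appends the remaining one so the vector sums to 1; the completed value comes back as a 2-D array, which is why the plotting code indexes t[0]. A minimal sketch illustrating this (PyMC 2, reusing the np and pm imports from the first cell):

p = pm.Dirichlet("p_demo", theta=np.ones(3))   # free parameters: only the first 2 proportions
cp = pm.CompletedDirichlet("cp_demo", p)       # full probability vector over all 3 categories
print cp.value, cp.value.sum()                 # the completed vector sums to 1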

  File "<ipython-input-42-f08bbc70fdc3>", line 60
    def show_topic_words(self, n=10, idwords):
SyntaxError: non-default argument follows default argument

In [34]:
lda = LDA(data_temp)
lda.fit()
lda.plot_words_per_topic()


 [-----------------100%-----------------] 1000 of 1000 complete in 39.4 sec
<matplotlib.figure.Figure at 0x13151b4a8>
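
Note that 1,000 samples with a burn-in of only 10 is a very short run; for anything beyond a quick smoke test one would typically increase both, e.g. lda.fit(iterations=5000, burn_in=500).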

In [5]:
lda.plot_word_dist()


<matplotlib.figure.Figure at 0x20602a58>

In [6]:
kld_sorted = lda.sorted_docs_sim()
kld_sorted[:10]


Out[6]:
[(11, 12, 0.12794402850656339, 0.91550703216514051),
 (12, 11, 0.23233086565032021, 0.91550703216514051),
 (7, 16, 0.069081961101030681, 0.91529433139300531),
 (16, 7, 0.084241263235092026, 0.91529433139300531),
 (11, 14, 0.11389710569477, 0.89897158850740189),
 (14, 11, 0.11807633003998763, 0.89897158850740189),
 (0, 15, 0.27583826339271433, 0.89552039314012999),
 (15, 0, 0.22658878049302303, 0.89552039314012999),
 (8, 16, 0.20735934380211171, 0.88181818181818183),
 (16, 8, 0.28884846182266505, 0.88181818181818183)]
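
Each pair appears twice with the same cosine similarity but different KL values, because cosine similarity is symmetric while KL divergence is not (compare the third column for (11, 12) and (12, 11)). If a symmetric measure is preferred, here is a small sketch of a symmetrised variant, keeping the base-10 logs used in KLDiv above:

def sym_kl(p, q):
    # average of the two directed KL divergences (half the Jeffreys divergence)
    return 0.5 * (np.sum(p * np.log10(p / q)) + np.sum(q * np.log10(q / p)))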

REAL DATA

Now let's move on to trying some real-world data. I will use the NLTK inaugural address corpus.


In [7]:
from nltk.corpus import inaugural
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

In [8]:
# Create a vocabulary from the corpus
vocab = dict()
for fileid in inaugural.fileids():
    for word in inaugural.words(fileid):
        word = word.lower()
        if word not in stops and word.isalpha():
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1
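
The same counts can be collected more compactly with collections.Counter (a sketch assuming the same stops set; Counter is a dict subclass, so the rest of the notebook works unchanged):

from collections import Counter

vocab = Counter(word.lower()
                for fileid in inaugural.fileids()
                for word in inaugural.words(fileid)
                if word.lower() not in stops and word.isalpha())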

In [12]:
"""
Sort the vocab keep only words which occur more than 50 times
Then Create word to id and id to word dictionaries
"""
vocab_sorted = filter(lambda x: x[1] > 50, sorted(vocab.items(), key=lambda x: x[1], reverse=True))
wordids = {v[0]: i for i, v in enumerate(vocab_sorted)}
idwords = {i: v[0] for i, v in enumerate(vocab_sorted)}
vocab_size = len(wordids)
print vocab_size


191

In [13]:
# Generate corpus document vectors
data = []
for fileid in inaugural.fileids():
    data.append([0]*vocab_size)
    for word in inaugural.words(fileid):
        word = word.lower()
        if word in wordids:
            data[-1][wordids[word]] += 1

len(data)


Out[13]:
56
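
So the document-term matrix has 56 documents (one per inaugural address) over the 191-word vocabulary.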

In [14]:
# Plot the document word matrix
print data[0][:10]
data = np.array(data)
plt.clf()
plt.matshow(data, fignum=1000, cmap=plt.cm.Reds)
plt.gca().set_aspect('auto')
plt.xlabel("Words")
plt.ylabel("Documents")


[8, 4, 1, 0, 3, 6, 3, 1, 2, 3]
Out[14]:
<matplotlib.text.Text at 0x2378a710>
<matplotlib.figure.Figure at 0x2382e2b0>

In [15]:
inaugural_lda = LDA(data, topics=10, vocab=vocab_size)

In [16]:
inaugural_lda.fit()
inaugural_lda.plot_words_per_topic()


 [-----------------100%-----------------] 1001 of 1000 complete in 756.2 sec
<matplotlib.figure.Figure at 0x47367c88>

In [17]:
inaugural_lda.plot_word_dist()


<matplotlib.figure.Figure at 0x2301fc18>

In [29]:
# The "Document words per topics" diagram above came out wrong, so redraw it here
# (this repeats the body of plot_words_per_topic)

plt.clf()
fig, ax = plt.subplots(1,1)
words = inaugural_lda.Z.value
topic_dist = dict()
for k_i in words:
    for k in k_i:
        if k not in topic_dist:
            topic_dist[k] = 0
        topic_dist[k] += 1
ax.bar(topic_dist.keys(), topic_dist.values())
ax.set_xlabel("Topics")
ax.set_ylabel("Counts")
ax.set_title("Document words per topics")
plt.show()


<matplotlib.figure.Figure at 0x131965cc0>

In [41]:
# Print the ten most probable vocabulary words for each topic
for i, t in enumerate(inaugural_lda.phi.value):
    print "Topic %i : " % i, ", ".join(idwords[w_] for w_ in np.argsort(t[0])[-10:] if w_ < vocab_size - 1)


Topic 0 :  must, without, subject, best, true, institutions, help, beyond, state
Topic 1 :  make, home, much, strong, spirit, interest, know, high, commerce, long
Topic 2 :  future, federal, duties, congress, seek, purpose, public, time, nations, administration
Topic 3 :  united, people, authority, power, constitution, free, administration, state, become, trust
Topic 4 :  whose, liberty, service, right, national, trust, proper, secure, help, state
Topic 5 :  far, character, security, fellow, administration, together, subject, seek, faith, things
Topic 6 :  responsibility, old, future, every, nation, called, party, public, interests
Topic 7 :  united, union, character, general, subject, always, century, americans, future, people
Topic 8 :  live, justice, might, fellow, action, duties, give, high, done, ought
Topic 9 :  done, full, peace, secure, president, god, administration, home, may, way
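
This loop mirrors the show_topic_words method defined in the class, so the same listing could also be produced with inaugural_lda.show_topic_words(idwords).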

In [ ]: