In [190]:
import numpy as np
from scipy.special import gammaln


# Vocabulary: word ids
W = np.array([0, 1, 2, 3, 4])

# X := documents, each a list of word ids (ragged, hence dtype=object)
X = np.array([
    [0, 0, 0, 0, 0],
    [0, 0, 0, 2, 1],
    [4],
    [2, 3, 2, 3],
    [2, 3, 2, 3, 2],
], dtype=object)
W.shape


Out[190]:
(5,)

In [191]:
n_topics = 3
alpha = 0.1
beta = 0.1
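
alpha and beta are the symmetric Dirichlet hyperparameters: alpha is the prior on each document's topic mixture, beta the prior on each topic's word distribution. A minimal sketch of what they parameterize (theta_d and phi_t are illustrative names only; the collapsed Gibbs sampler below never draws them explicitly):

In [ ]:
# Illustration only: one draw from each prior.
theta_d = np.random.dirichlet(alpha * np.ones(n_topics))  # topic mixture for one document
phi_t = np.random.dirichlet(beta * np.ones(len(W)))       # word distribution for one topic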

Initialize


In [192]:
n_docs = len(X)
n_words = len(W)

In [193]:
doc_topic = np.zeros((n_docs, n_topics))  # nmz: per-document topic counts
word_topic = np.zeros((n_topics, n_words))  # nzw: per-topic word counts
doc = np.zeros(n_docs)  # nm: number of words in each document
topic = np.zeros(n_topics)  # nz: number of words assigned to each topic
topics_peridx = {}  # topic assigned to each word position, keyed (d, idx)
topics_perw = {}  # last topic assigned to word w in document d, keyed (d, w)

for d in range(n_docs):
    for idx, w in enumerate(X[d]):
        t = np.random.randint(n_topics)  # random initial topic assignment
        doc_topic[d, t] += 1
        doc[d] += 1
        word_topic[t, w] += 1
        topic[t] += 1
        topics_peridx[(d, idx)] = t
        topics_perw[(d, w)] = t
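
A quick sanity check on the bookkeeping (a sketch; the assertions just restate how the counts were built):

In [ ]:
# All four count structures must agree on totals and marginals.
assert doc_topic.sum() == word_topic.sum() == doc.sum() == topic.sum()
assert np.allclose(doc_topic.sum(axis=1), doc)     # per-document totals
assert np.allclose(word_topic.sum(axis=1), topic)  # per-topic totals
assert all(doc[d] == len(X[d]) for d in range(n_docs))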

In [198]:
def conditional_distrib(d, w):
    """Collapsed Gibbs full conditional P(z = t | rest) for word w in doc d."""
    left = (word_topic[:, w] + beta) / (topic + beta * n_words)      # P(w | t)
    right = (doc_topic[d, :] + alpha) / (doc[d] + alpha * n_topics)  # P(t | d)
    p_t = left * right
    p_t /= np.sum(p_t)  # normalize over topics
    return p_t

def log_multi_beta(alpha, K=None):
    """
    Logarithm of the multinomial beta function.
    """
    if K is None:
        # alpha is assumed to be a vector
        return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))
    else:
        # alpha is assumed to be a scalar
        return K * gammaln(alpha) - gammaln(K*alpha)

def loglikelihood():
    """
    Compute the log-likelihood that the model generated the data
    (the collapsed joint of words and topic assignments, up to constants).
    """

    loglike = 0

    for t in range(n_topics):
        loglike += log_multi_beta(word_topic[t,:]+beta)
        loglike -= log_multi_beta(beta, n_words)

    for d in range(n_docs):
        loglike += log_multi_beta(doc_topic[d,:]+alpha)
        loglike -= log_multi_beta(alpha, n_topics)
    
    return loglike
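
Two quick spot checks of these helpers (a hypothetical cell; the indices are chosen arbitrarily): the full conditional must be a proper distribution over topics, and the scalar form of log_multi_beta must agree with its vector form.

In [ ]:
# conditional_distrib returns n_topics probabilities summing to one.
p = conditional_distrib(0, 0)
assert p.shape == (n_topics,) and np.isclose(p.sum(), 1.0)

# K * gammaln(a) - gammaln(K * a) == the vector form on a constant vector.
assert np.isclose(log_multi_beta(beta, n_words),
                  log_multi_beta(beta * np.ones(n_words)))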

In [201]:
for iteration in range(10):
    for d in range(n_docs):
        for idx, w in enumerate(X[d]):
            # Remove the current assignment of this word from all counts
            t = topics_peridx[(d, idx)]
            doc_topic[d, t] -= 1
            doc[d] -= 1
            word_topic[t, w] -= 1
            topic[t] -= 1

            # Resample a topic from the full conditional
            p_t = conditional_distrib(d, w)
            t = np.random.multinomial(1, p_t).argmax()

            # Add the new assignment back into the counts
            doc_topic[d, t] += 1
            doc[d] += 1
            word_topic[t, w] += 1
            topic[t] += 1
            topics_peridx[(d, idx)] = t
            topics_perw[(d, w)] = t

    print(loglikelihood())


-46.72724496251951
-42.66358559382525
-41.24732156301417
-33.06367036013793
-36.4649343594338
-34.23925836087318
-33.06367036013793
-34.23925836087318
-33.06367036013793
-33.06367036013793
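
Note that the trace is not monotone: Gibbs sampling draws from the posterior rather than maximizing it, so once the chain reaches a high-probability region the log-likelihood fluctuates (here it settles around -33 on this tiny corpus).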

In [202]:
word_topic


Out[202]:
array([[0., 0., 5., 4., 0.],
       [5., 0., 0., 0., 0.],
       [3., 1., 1., 0., 1.]])
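
The raw counts convert to smoothed point estimates of the topic-word and document-topic distributions (phi and theta are names introduced here for the standard LDA estimates; they are not computed in the cells above):

In [ ]:
# phi[t, w]   estimates P(word w | topic t)
# theta[d, t] estimates P(topic t | document d)
phi = (word_topic + beta) / (word_topic + beta).sum(axis=1, keepdims=True)
theta = (doc_topic + alpha) / (doc_topic + alpha).sum(axis=1, keepdims=True)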

In [188]:
X


Out[188]:
array([list([0, 0, 0, 0, 0]), list([0, 0, 0, 2, 1]), list([4]),
       list([2, 3, 2, 3]), list([2, 3, 2, 3, 2])], dtype=object)
