In [190]:
import numpy as np
import scipy as sp
from scipy.special import gammaln
import numpy as np
# Vocabulary: word ids 0..4
W = np.array([0, 1, 2, 3, 4])
# X := documents, each a list of word ids drawn from W.
# The rows have different lengths (ragged), so dtype=object is required:
# on modern NumPy (>= 1.24) np.array over ragged nested lists without an
# explicit object dtype raises ValueError instead of building an object array.
X = np.array([
    [0, 0, 0, 0, 0],
    [0, 0, 0, 2, 1],
    [4],
    [2, 3, 2, 3],
    [2, 3, 2, 3, 2]
], dtype=object)
W.shape
Out[190]:
In [191]:
n_topics = 3  # number of latent topics K
alpha = 0.1   # symmetric Dirichlet prior; smooths the doc_topic counts below
beta = 0.1    # symmetric Dirichlet prior; smooths the word_topic counts below
In [192]:
n_docs = len(X)   # number of documents in the corpus
n_words = len(W)  # vocabulary size
In [193]:
# Count matrices for collapsed Gibbs sampling, initialized by assigning
# every word position a uniformly random topic.
doc_topic = np.zeros((n_docs, n_topics))    # nmz: words per topic, per document
word_topic = np.zeros((n_topics, n_words))  # nzw: count of each word, per topic
doc = np.zeros(n_docs)                      # nm: total words in each document
topic = np.zeros(n_topics)                  # nz: total words assigned to each topic
topics_peridx = {}  # (doc, position) -> currently assigned topic
topics_perw = {}    # (doc, word id) -> topic last assigned to that word
for d in range(n_docs):
    for idx, w in enumerate(X[d,]):
        t = np.random.randint(n_topics)
        doc_topic[d, t] += 1
        doc[d] += 1
        word_topic[t, w] += 1
        topic[t] += 1
        topics_peridx[(d, idx)] = t
        topics_perw[(d, w)] = t
In [198]:
def conditional_distrib(d, w):
    """
    Full conditional p(topic | d, w) for collapsed Gibbs sampling,
    computed from the current global count matrices and normalized
    to sum to 1 over the topics.
    """
    # Smoothed probability of word w under each topic.
    word_term = (word_topic[:, w] + beta) / (topic + beta * n_words)
    # Smoothed topic proportions within document d.
    doc_term = (doc_topic[d, :] + alpha) / (doc[d] + alpha * n_topics)
    weights = word_term * doc_term
    return weights / np.sum(weights)
def log_multi_beta(alpha, K=None):
"""
Logarithm of the multinomial beta function.
"""
if K is None:
# alpha is assumed to be a vector
return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))
else:
# alpha is assumed to be a scalar
return K * gammaln(alpha) - gammaln(K*alpha)
def loglikelihood():
    """
    Log-likelihood that the model generated the data: the sum of
    Dirichlet-multinomial terms over the topic-word counts and the
    document-topic counts, each normalized by its prior's beta function.
    """
    ll = 0
    # Topic-word contribution.
    for t in range(n_topics):
        ll += log_multi_beta(word_topic[t, :] + beta)
        ll -= log_multi_beta(beta, n_words)
    # Document-topic contribution.
    for d in range(n_docs):
        ll += log_multi_beta(doc_topic[d, :] + alpha)
        ll -= log_multi_beta(alpha, n_topics)
    return ll
In [201]:
# Collapsed Gibbs sampling: for every word position, remove its current
# assignment from the counts, draw a new topic from the full conditional,
# and add the new assignment back. The log-likelihood is printed once per
# sweep to monitor convergence.
for iteration in range(10):
    for d in range(n_docs):
        for idx, w in enumerate(X[d,]):
            t = topics_peridx[(d, idx)]
            # Exclude this position from all counts.
            doc_topic[d, t] -= 1
            doc[d] -= 1
            word_topic[t, w] -= 1
            topic[t] -= 1
            # Sample a new topic from p(t | d, w).
            p_t = conditional_distrib(d, w)
            t = np.random.multinomial(1, p_t).argmax()
            # Restore the counts under the new assignment.
            doc_topic[d, t] += 1
            doc[d] += 1
            word_topic[t, w] += 1
            topic[t] += 1
            topics_peridx[(d, idx)] = t
            topics_perw[(d, w)] = t
    print(loglikelihood())
In [202]:
# Inspect the topic-word count matrix after sampling.
word_topic
Out[202]:
In [188]:
# Re-display the corpus for reference.
X
Out[188]:
In [ ]: