In [1]:
import numpy as np
from scipy.special import gamma
import random
from collections import Counter
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
In [2]:
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop = get_stop_words('en')
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
# create sample documents
doc_a = "Batman became popular soon after his introduction and gained his own comic book title, Batman, in 1940."
doc_b = "In 1971, Trump moved to Manhattan, where he became involved in larger construction projects, and used attractive architectural design to win public recognition."
doc_c = "Batman is, in his everyday identity, Bruce Wayne, a wealthy American business magnate living in Gotham City."
doc_d = "In 2001, Trump completed Trump World Tower, a 72-story residential tower across from the United Nations Headquarters."
doc_e = " Unlike most superheroes, Batman does not possess any superpowers; rather, he relies on his genius intellect, physical prowess, martial arts abilities, detective skills, science and technology, vast wealth, intimidation, and indomitable will. "
# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]
# list for tokenized documents in loop
texts = []
# loop through document list
for doc in doc_set:
    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)
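As a quick check of the preprocessing step, printing the first entry of texts shows the lower-cased, stop-word-filtered, Porter-stemmed tokens of doc_a; this matches the hard-coded corpus listed further down in the notebook.

print(texts[0])
# ['batman', 'becam', 'popular', 'soon', 'introduct', 'gain', 'comic', 'book', 'titl', 'batman', '1940']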
In [3]:
def CRP(topic, gamma):
    '''CRP gives the probability of topic assignment for a specific vocabulary term.
    Returns a (j + 1) * 1 vector, where j is the current number of topics:
    the first entry is the probability of opening a new topic, the remaining
    entries are the probabilities of joining each existing topic.'''
    cm = []
    m = sum([len(x) for x in topic])
    p = gamma / (gamma + m)  # probability of a new topic
    cm.append(p)
    for j in range(len(topic)):
        p = len(topic[j]) / (gamma + m)  # probability of existing topic j
        cm.append(p)
    return np.array(cm)
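A small sanity check of CRP (illustrative values, not part of the original run): with two existing topics holding 3 and 1 words and gamma = 1, the returned vector puts 1/5 on a new topic, 3/5 on the first topic, and 1/5 on the second.

CRP([['batman', 'comic', 'book'], ['trump']], 1.0)
# array([0.2, 0.6, 0.2])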
In [4]:
def node_sampling(corpus_s, gamma):
    '''Node sampling samples the number of topics for the next level'''
    topic = []
    for corpus in corpus_s:
        for doc in corpus:
            cm = CRP(topic, gamma)
            theta = np.random.multinomial(1, (cm / sum(cm))).argmax()
            if theta == 0:
                # create new topic
                topic.append([doc])
            else:
                # existing topic
                topic[theta - 1].append(doc)
    return topic
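node_sampling can be exercised directly on the toy corpus; since every assignment is drawn from a multinomial, the number of sampled topics varies from run to run (a larger gamma favours opening new topics). An illustrative call:

candidate_topics = node_sampling(texts, gamma=10)
print(len(candidate_topics))    # number of candidate topics, run-dependent
print(candidate_topics[0][:5])  # first few words assigned to the first topic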
In [5]:
def Z(corpus_s, topic, alpha, beta):
    '''Z distributes each vocabulary term to a topic in one Gibbs sweep.
    t_zm is an n * 1 assignment vector (n = number of word tokens);
    the function returns the non-empty topics as lists of assigned words.'''
    n_vocab = sum([len(x) for x in corpus_s])
    # t_zm: n * 1, topic index assigned to each word token
    t_zm = np.zeros(n_vocab).astype('int')
    # z_assigned: j * 1, one sublist of assigned vocabularies per topic
    z_assigned = [[] for _ in topic]
    z_doc = [[] for _ in topic]
    z_tmp = np.zeros((n_vocab, len(topic)))
    assigned = np.zeros((len(corpus_s), len(topic)))
    n = 0
    for i in range(len(corpus_s)):
        for d in range(len(corpus_s[i])):
            wi = corpus_s[i][d]
            for j in range(len(topic)):
                # likelihood of word wi under topic j, prior of topic j in document i
                lik = (z_assigned[j].count(wi) + beta) / (assigned[i, j] + n_vocab * beta)
                pri = (len(z_assigned[j]) + alpha) / ((len(corpus_s[i]) - 1) + len(topic) * alpha)
                z_tmp[n, j] = lik * pri
            # sample this word's topic from the normalized weights
            t_zm[n] = np.random.multinomial(1, (z_tmp[n, :] / sum(z_tmp[n, :]))).argmax()
            z_assigned[t_zm[n]].append(wi)
            z_doc[t_zm[n]].append(i)
            assigned[i, t_zm[n]] += 1
            n += 1
    # drop topics that received no words
    z_assigned = [x for x in z_assigned if x != []]
    z_doc = [x for x in z_doc if x != []]
    # dtype=object keeps the ragged per-topic word lists in a single array
    return np.array(z_assigned, dtype=object)
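An illustrative call to Z (the output is random, so only the shapes are stable); candidate_topics is the node_sampling result from the sketch above, and each element of the returned array is the list of words currently assigned to one surviving topic.

candidate_topics = node_sampling(texts, gamma=10)
z_topic = Z(texts, candidate_topics, alpha=0.1, beta=0.01)
print(len(z_topic))    # number of non-empty topics
print(z_topic[0][:5])  # a few words from the first topic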
In [6]:
def C(corpus_s, topic, gamma):
    '''C samples, for each word token, a probability vector over the current
    topics (the path distribution), from a Dirichlet whose weights combine the
    word's per-topic counts with a gamma-scaled random prior.'''
    cm = []
    for corpus in corpus_s:
        for word in corpus:
            # per-topic counts of this word
            y = np.zeros(len(topic))
            for j, t in enumerate(topic):
                if type(t) == list:
                    y[j] = t.count(word)
                else:
                    y[j] = t.tolist().count(word)
            H = np.random.poisson(lam=2, size=len(topic))
            alpha = gamma * H
            # small constant keeps every Dirichlet parameter strictly positive
            temp = np.random.dirichlet(y + alpha + 1e-6).transpose()
            cm.append((temp / sum(temp)).tolist())
    return np.array(cm)
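Each row of the matrix returned by C is a normalized probability vector over the current topics, one row per word token; a quick shape check (illustrative, reusing z_topic from the previous sketch):

c_m = C(texts, z_topic, gamma=10)
print(c_m.shape)     # (total number of word tokens, number of topics)
print(c_m[0].sum())  # each row is normalized, so this is 1.0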
In [7]:
most_common = lambda x: Counter(x).most_common(1)[0][0]

def wn(c_m, corpus_s, topic):
    '''Draw one topic assignment for every word token from the path probabilities.'''
    wn_topic = []
    for i, corpus in enumerate(corpus_s):
        for word in corpus:
            theta = np.random.multinomial(1, c_m[i]).argmax()
            wn_topic.append(theta)
    return np.array(wn_topic)

def gibbs_wn(c_m, corpus_s, topic, ite):
    '''Repeat wn() ite times and keep each word's most frequent assignment.'''
    n_vocab = sum([len(x) for x in corpus_s])
    wn_gibbs = np.empty((n_vocab, ite)).astype('int')
    for i in range(ite):
        wn_gibbs[:, i] = wn(c_m, corpus_s, topic)
    # drop the first 1/10 of the samples as burn-in
    wn_gibbs = wn_gibbs[:, int(ite / 10):]
    theta = [most_common(wn_gibbs[x]) for x in range(n_vocab)]
    wn_topic = [[] for _ in topic]
    wn_doc_topic = [[] for _ in topic]
    doc = 0
    n = 0
    for i, corpus in enumerate(corpus_s):  # avoid shadowing corpus_s itself
        if doc == i:
            for word in corpus:
                wn_doc_topic[theta[n]].append(word)
                n += 1
            # store, per topic, the words of this document (grouped by document)
            for j in range(len(topic)):
                if wn_doc_topic[j] != []:
                    wn_topic[j].append(wn_doc_topic[j])
            wn_doc_topic = [[] for _ in topic]
            doc += 1
    wn_topic = [x for x in wn_topic if x != []]
    return wn_topic
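gibbs_wn repeats the word-level sampling ite times, discards the first tenth as burn-in, and keeps each word's most frequent assignment; a small illustrative run (far fewer iterations than the final call, so it finishes quickly, reusing c_m and z_topic from the sketches above):

wn_topic = gibbs_wn(c_m, texts, z_topic, ite=100)
print(len(wn_topic))   # number of non-empty topics at this level
print(wn_topic[0][0])  # one document's words that landed in the first surviving topic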
In [8]:
def hLDA(corpus_s, gamma, alpha, beta, ite, level):
    # 1. Node sampling: sample candidate topics (max level L) via a CRP
    topic = node_sampling(corpus_s, gamma)

    def dis(corpus_s, gamma, alpha, beta, ite):
        # 2. z_m: sample a topic from L for each word
        z_topic = Z(corpus_s, topic, alpha, beta)
        # 3. c_m: sample the path probabilities
        c_m = C(corpus_s, z_topic, gamma)
        # 4. w_n: distribute words into topics
        wn_topic = gibbs_wn(c_m, corpus_s, z_topic, ite)
        return wn_topic

    hLDA_tree = [[] for _ in range(level)]
    tmp_tree = []
    node = [[] for _ in range(level + 1)]
    node[0].append(1)
    for i in range(level):
        if i == 0:
            # root level: run on the full corpus
            wn_topic = dis(corpus_s, gamma, alpha, beta, ite)
            topic = set([x for lst in wn_topic[0] for x in lst])
            hLDA_tree[0].append(topic)
            tmp_tree.append(wn_topic[1:])
            tmp_tree = tmp_tree[0]
            node[1].append(len(wn_topic[1:]))
        else:
            # deeper levels: run on each child grouping produced above
            for j in range(sum(node[i])):
                if tmp_tree == []:
                    break
                wn_topic = dis(tmp_tree[0], gamma, alpha, beta, ite)
                topic = set([x for lst in wn_topic[0] for x in lst])
                hLDA_tree[i].append(topic)
                tmp_tree.remove(tmp_tree[0])
                if wn_topic[1:] != []:
                    tmp_tree.extend(wn_topic[1:])
                node[i + 1].append(len(wn_topic[1:]))
    return hLDA_tree, node[:level]
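hLDA returns a pair: the topic tree, whose level-l entry is a list of word sets (one set per topic found at that level), and the per-level node counts. Every stage is sampled, so the tree changes between runs; a small illustrative call (fewer iterations than the run below):

tree, nodes = hLDA(texts, gamma=10, alpha=0.1, beta=0.01, ite=1000, level=4)
for lvl, topics in enumerate(tree):
    print('level', lvl, ':', len(topics), 'topic(s)')
print(nodes)  # how many child nodes each level produced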
In [10]:
# tokenized, stop-worded, and stemmed corpus (the output of the loop in In [2]),
# hard-coded here so the model can be run without repeating the preprocessing
texts = [['batman',
'becam',
'popular',
'soon',
'introduct',
'gain',
'comic',
'book',
'titl',
'batman',
'1940'],
['1971',
'trump',
'move',
'manhattan',
'becam',
'involv',
'larger',
'construct',
'project',
'use',
'attract',
'architectur',
'design',
'win',
'public',
'recognit'],
['batman',
'everyday',
'ident',
'bruce',
'wayn',
'wealthi',
'american',
'busi',
'magnat',
'live',
'gotham',
'citi'],
['2001',
'trump',
'complet',
'trump',
'world',
'tower',
'72',
'stori',
'residenti',
'tower',
'across',
'unit',
'nation',
'headquart'],
['unlik',
'superhero',
'batman',
'possess',
'superpow',
'rather',
'reli',
'geniu',
'intellect',
'physic',
'prowess',
'martial',
'art',
'abil',
'detect',
'skill',
'scienc',
'technolog',
'vast',
'wealth',
'intimid',
'indomit',
'will']]
In [12]:
hLDA(texts, 10, 0.1, 0.01, 10000, 4)
Out[12]: