In [1]:
import numpy as np
from scipy.special import gammaln
import random
from collections import Counter
import pydot

0. Generate Corpus

Each row of corpuss represents a document


In [133]:
n_rows = 10
n_cols = 10
corpuss = np.zeros((n_rows, n_cols), dtype=object)
word_count = 0
for i in range(n_rows):
    for j in range(n_cols):
        corpuss[i, j] = 'w%s' % word_count
        word_count += 1
        
print(corpuss)


[['w0' 'w1' 'w2' 'w3' 'w4' 'w5' 'w6' 'w7' 'w8' 'w9']
 ['w10' 'w11' 'w12' 'w13' 'w14' 'w15' 'w16' 'w17' 'w18' 'w19']
 ['w20' 'w21' 'w22' 'w23' 'w24' 'w25' 'w26' 'w27' 'w28' 'w29']
 ['w30' 'w31' 'w32' 'w33' 'w34' 'w35' 'w36' 'w37' 'w38' 'w39']
 ['w40' 'w41' 'w42' 'w43' 'w44' 'w45' 'w46' 'w47' 'w48' 'w49']
 ['w50' 'w51' 'w52' 'w53' 'w54' 'w55' 'w56' 'w57' 'w58' 'w59']
 ['w60' 'w61' 'w62' 'w63' 'w64' 'w65' 'w66' 'w67' 'w68' 'w69']
 ['w70' 'w71' 'w72' 'w73' 'w74' 'w75' 'w76' 'w77' 'w78' 'w79']
 ['w80' 'w81' 'w82' 'w83' 'w84' 'w85' 'w86' 'w87' 'w88' 'w89']
 ['w90' 'w91' 'w92' 'w93' 'w94' 'w95' 'w96' 'w97' 'w98' 'w99']]

In [185]:
sim = hLDA(corpuss, 10, 0.01, 0.5, 1, 1000, 3)


/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:4: RuntimeWarning: invalid value encountered in true_divide
  after removing the cwd from sys.path.

In [186]:
sim2 = hLDA(corpuss, 10, 0.01, 0.5, 1, 1000, 4)


/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:4: RuntimeWarning: invalid value encountered in true_divide
  after removing the cwd from sys.path.

In [183]:
sim3 = hLDA(corpuss, 10, 0.01, 0.5, 1, 1000, 5)


/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:4: RuntimeWarning: invalid value encountered in true_divide
  after removing the cwd from sys.path.

In [189]:
HLDA_plot(sim,8,True)



In [190]:
HLDA_plot(sim2,8,True)



In [192]:
HLDA_plot(sim3)


hLDA


In [3]:
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# create sample documents
doc_a = "Batman became popular soon after his introduction and gained his own comic book title, Batman, in 1940."

doc_b = "In 1971, Trump moved to Manhattan, where he became involved in larger construction projects, and used attractive architectural design to win public recognition."

doc_c = "Batman is, in his everyday identity, Bruce Wayne, a wealthy American business magnate living in Gotham City."

doc_d = "In 2001, Trump completed Trump World Tower, a 72-story residential tower across from the United Nations Headquarters."

doc_e = " Unlike most superheroes, Batman does not possess any superpowers; rather, he relies on his genius intellect, physical prowess, martial arts abilities, detective skills, science and technology, vast wealth, intimidation, and indomitable will. "

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:

    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if i not in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [4]:
def CRP(topic, phi):
    '''Chinese restaurant process: probability of assigning a word to each topic.
    Returns a (j+1)-vector, where j is the number of existing topics; the first
    entry is the probability of opening a new topic.'''
    cm = np.empty(len(topic)+1)
    m = sum([len(x) for x in topic])
    cm[0] = phi / (phi + m) # prob for new topic
    for i, word in enumerate(topic):
        cm[i+1] = len(word) / (phi + m) # prob for existing topics
    return cm
  
def node_sampling(corpus_s, phi):
    '''Node sampling samples the number of topics for the next level'''
    topic = []    
    for corpus in corpus_s:
        for word in corpus:
            cm = CRP(topic, phi)
            theta = np.random.multinomial(1, (cm/sum(cm))).argmax()
            if theta == 0:
                # create new topic
                topic.append([word])
            else:
                # existing topic
                topic[theta-1].append(word)
    return topic
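
A quick illustrative check (hypothetical inputs, not part of the original run): for two existing topics holding 3 and 1 words and phi = 0.5, CRP returns three probabilities, the first for opening a new topic and the rest proportional to the topic sizes.

In [ ]:
toy_topics = [['w0', 'w1', 'w2'], ['w3']]   # hypothetical topics with 3 and 1 words
cm = CRP(toy_topics, 0.5)
print(cm)          # approximately [0.111, 0.667, 0.222]
print(cm.sum())    # sums to 1, since phi + m is the common denominator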

In [7]:
def Z(corpus_s, topic, alpha, beta):
    '''Z distributes each vocabulary word to a topic.
    Returns the words assigned to each topic and, for each assignment,
    the index of the document the word came from.'''
    n_vocab = sum([len(x) for x in corpus_s])
    # t_zm: n * 1 vector holding the topic assignment of each word token
    t_zm = np.zeros(n_vocab).astype('int')
    # z_assigned: one sublist per topic, storing the words assigned to that topic
    z_assigned = [[] for _ in topic]
    # z_doc: one sublist per topic, storing the source document of each assigned word
    z_doc = [[] for _ in topic]
    z_tmp = np.zeros((n_vocab, len(topic)))
    assigned = np.zeros((len(corpus_s), len(topic)))
    n = 0
    for i in range(len(corpus_s)):
        for d in range(len(corpus_s[i])): 
            wi = corpus_s[i][d]   
            for j in range(len(topic)):
                lik = (z_assigned[j].count(wi) + beta) / (assigned[i, j] + n_vocab * beta)
                pri = (len(z_assigned[j]) + alpha) / ((len(corpus_s[i]) - 1) + len(topic) * alpha)
                z_tmp[n, j] = lik * pri
            # sample the assignment once the probabilities for all topics are filled in
            t_zm[n] = np.random.multinomial(1, (z_tmp[n, :] / sum(z_tmp[n, :]))).argmax()
            z_assigned[t_zm[n]].append(wi)
            z_doc[t_zm[n]].append(i)
            assigned[i, t_zm[n]] += 1
            n += 1
    z_assigned = [x for x in z_assigned if x != []]
    z_doc = [x for x in z_doc if x != []]
    return z_assigned, z_doc
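
A small sanity check (not part of the original run) chains node sampling and Z on the toy corpus; Z returns only the topics that actually received words, together with the document index of every assignment. The alpha and beta values mirror the hLDA calls above.

In [ ]:
np.random.seed(0)                          # assumption: seed fixed only so the sketch is reproducible
init_topic = node_sampling(corpuss, 0.5)   # initial CRP draw over the toy corpus
z_topic, z_doc = Z(corpuss, init_topic, 10, 0.01)
print(len(init_topic), 'topics from node sampling ->', len(z_topic), 'non-empty topics after Z')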

In [10]:
def CRP_prior(corpus_s, doc, phi):
    cp = np.empty((len(corpus_s), len(doc)))
    for i, corpus in enumerate(corpus_s):
        p_topic = [[x for x in doc[j] if x != i] for j in range(len(doc))]
        tmp = CRP(p_topic, phi)
        cp[i,:] = tmp[1:]
    return cp
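
A small illustration with hypothetical inputs: for each document, CRP_prior re-evaluates the CRP over the existing topics with that document's own assignments removed, so row i holds the prior weight of every topic for document i.

In [ ]:
toy_docs = [['a', 'b'], ['c', 'd']]
toy_doc_assign = [[0, 0, 1], [1]]   # hypothetical: source document of each word currently in topic 0 and topic 1
print(CRP_prior(toy_docs, toy_doc_assign, 0.5))   # one row per document, one column per topic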

In [193]:
def likelihood(corpus_s, topic, eta):
    '''Likelihood of each document's words under each topic'''
    w_m = np.empty((len(corpus_s), len(topic)))
    n_vocab = sum([len(x) for x in corpus_s])
    for i, corpus in enumerate(corpus_s):
        for j in range(len(topic)):
            current_topic = topic[j]
            prev_dominator = 1
            later_numerator = 1

            # words of this document that already occur in the topic
            overlap = [val for val in set(corpus) if val in current_topic]

            prev_numerator = gammaln(len(current_topic) - len(overlap) + n_vocab * eta)
            later_dominator = gammaln(len(current_topic) + n_vocab * eta)
            for word in corpus:
                corpus_list = list(corpus)  # ensure .count works for NumPy rows as well as lists
                if current_topic.count(word) - corpus_list.count(word) < 0:
                    a = 0
                else:
                    a = current_topic.count(word) - corpus_list.count(word)

                prev_dominator += gammaln(a + 1)
                later_numerator += gammaln(current_topic.count(word) + 1)

            prev = prev_numerator - prev_dominator
            later = later_numerator - later_dominator

            like = prev + later
            w_m[i, j] = np.exp(like)
    return w_m

In [17]:
def post(w_m, c_p):
    c_m_nume = (w_m * c_p)
    c_m_deno = c_m_nume.sum(axis = 1)
    c_m = c_m_nume/c_m_deno[:, np.newaxis]
    return np.array(c_m)
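
The RuntimeWarning about an invalid value in true_divide seen in the runs above most likely originates in this normalization: if a document's likelihood times prior is zero (or underflows) for every topic, the row sum is zero and the division produces NaNs. A minimal reproduction (not part of the original notebook; the exact warning wording depends on the NumPy version):

In [ ]:
row = np.array([[0.0, 0.0]])
print(row / row.sum(axis=1)[:, np.newaxis])   # [[nan nan]] plus a RuntimeWarning from the 0/0 division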

In [19]:
most_common = lambda x: Counter(x).most_common(1)[0][0]

In [13]:
def wn(c_m, corpus_s, topic):
    wn_ass = []
    wn_topic = [[] for _ in range(len(topic))]
    for i, corpus in enumerate(corpus_s):
        for word in corpus:
            theta = np.random.multinomial(1, c_m[i]).argmax()
            wn_ass.append(theta)
            wn_topic[theta].append(word)
    return np.array(wn_ass), wn_topic
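
A toy example with hypothetical inputs: given a fixed per-document posterior over two topics, wn draws a topic for every word and groups the words by the topic they were assigned to.

In [ ]:
toy_docs = [['a', 'b'], ['c']]                 # hypothetical documents
toy_cm = np.array([[0.9, 0.1], [0.2, 0.8]])    # hypothetical posterior: doc 0 favours topic 0, doc 1 favours topic 1
assignments, words_per_topic = wn(toy_cm, toy_docs, [[], []])
print(assignments, words_per_topic)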

In [181]:
def gibbs(corpus_s, topic, alpha, beta, phi, eta, ite):
    n_vocab = sum([len(x) for x in corpus_s])
    gibbs = np.empty((n_vocab, ite)).astype('int')
   
    for i in range(ite):
        z_topic, z_doc = Z(corpus_s, topic, alpha, beta)
        c_p = CRP_prior(corpus_s, z_doc, phi)
        w_m = likelihood(corpus_s, z_topic, eta)
        c_m = post(w_m, c_p)
        gibbs[:, i], w_topic = wn(c_m, corpus_s, z_topic) 
    # drop first 1/10 data
    gibbs = gibbs[:, int(ite/10):]
    theta = [most_common(gibbs[x]) for x in range(n_vocab)]
    
    n_topic = max(theta)+1
    
    wn_topic = [[] for _ in range(n_topic)]
    wn_doc_topic = [[] for _ in range(n_topic)]

    doc = 0
    n = 0
    for i, doc_words in enumerate(corpus_s):   # avoid rebinding corpus_s inside the loop
        if doc == i:
            for word in doc_words:
                wn_doc_topic[theta[n]].append(word)
                n += 1
            for j in range(n_topic):
                if wn_doc_topic[j] != []:
                    wn_topic[j].append(wn_doc_topic[j])
        wn_doc_topic = [[] for _ in range(n_topic)]        
        doc += 1
    wn_topic = [x for x in wn_topic if x != []]
    return wn_topic

In [182]:
def hLDA(corpus_s, alpha, beta, phi, eta, ite, level):
    
    topic = node_sampling(corpus_s, phi)
    
    hLDA_tree = [[] for _ in range(level)]
    tmp_tree = []
    node = [[] for _ in range(level+1)]
    node[0].append(1)
    
    for i in range(level):
        if i == 0:
            wn_topic = gibbs(corpus_s, topic, alpha, beta, phi, eta, ite)
            topic = set([x for sub in wn_topic[0] for x in sub])
            hLDA_tree[0].append(topic)
            tmp_tree.append(wn_topic[1:])
            tmp_tree = tmp_tree[0]
            node[1].append(len(wn_topic[1:]))
        else:
            for j in range(sum(node[i])):
                if tmp_tree == []:
                    break
                wn_topic = gibbs(tmp_tree[0], topic, alpha, beta, phi, eta, ite)
                topic = set([x for sub in wn_topic[0] for x in sub])
                hLDA_tree[i].append(topic)
                tmp_tree.remove(tmp_tree[0])
                if wn_topic[1:] != []:
                    tmp_tree.extend(wn_topic[1:])
                node[i+1].append(len(wn_topic[1:]))
        
    return hLDA_tree, node[:level]

In [116]:
ans = hLDA(texts, 1, 0.01, 0.1, 1, 1000, 4)


/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:4: RuntimeWarning: invalid value encountered in true_divide
  after removing the cwd from sys.path.

In [117]:
ans


Out[117]:
([[{'1940',
    '2001',
    '72',
    'abil',
    'across',
    'american',
    'architectur',
    'art',
    'batman',
    'becam',
    'book',
    'bruce',
    'busi',
    'citi',
    'comic',
    'complet',
    'construct',
    'detect',
    'everyday',
    'gain',
    'geniu',
    'gotham',
    'ident',
    'indomit',
    'intellect',
    'intimid',
    'introduct',
    'live',
    'magnat',
    'manhattan',
    'martial',
    'move',
    'nation',
    'physic',
    'popular',
    'possess',
    'prowess',
    'public',
    'rather',
    'reli',
    'residenti',
    'scienc',
    'skill',
    'soon',
    'stori',
    'superhero',
    'superpow',
    'technolog',
    'titl',
    'tower',
    'trump',
    'unit',
    'unlik',
    'vast',
    'wayn',
    'wealth',
    'wealthi',
    'will'}],
  [{'headquart', 'world'}],
  [{'attract', 'design', 'larger', 'use', 'win'},
   {'1971', 'involv', 'project', 'recognit'}],
  []],
 [[1], [1], [2], [0, 0]])

Plot Function


In [178]:
def HLDA_plot(hLDA_object, Len = 8, save = False):

    from IPython.display import Image, display
    def viewPydot(pdot):
        img = Image(pdot.create_png())   # avoid shadowing matplotlib's plt
        display(img)

    words = hLDA_object[0]
    struc = hLDA_object[1]
      
    graph = pydot.Dot(graph_type='graph')
    end_index = [np.insert(np.cumsum(i),0,0) for i in struc]
    for level in range(len(struc)-1):

        leaf_level = level + 1
        leaf_word = words[leaf_level]
        leaf_struc = struc[leaf_level]
        word = words[level]
        end_leaf_index = end_index[leaf_level]

        for len_root in range(len(word)):
            #print(list(word[len_root]))
            root_word = '\n'.join(str(v) for v in list(word[len_root])[:Len])
            leaf_index = leaf_struc[len_root]  
            start = end_leaf_index[len_root]
            end = end_leaf_index[len_root+1]
            lf = leaf_word[start:end]  
            for l in lf:
                #print(list(l))
                leaf_w = '\n'.join(str(v) for v in list(l)[:Len]) 
                edge = pydot.Edge(root_word, leaf_w)
                graph.add_edge(edge)
    if save:
        graph.write_png('graph.png')
    viewPydot(graph)

In [162]:
HLDA_plot(ans)



In [ ]: