Sample Data


In [1]:
import numpy as np
from scipy.special import gamma
import random
from collections import Counter

import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-7c55d655fdae> in <module>()
      5 
      6 import matplotlib.pyplot as plt
----> 7 from nltk.tokenize import RegexpTokenizer
      8 from stop_words import get_stop_words
      9 from nltk.stem.porter import PorterStemmer

ImportError: No module named nltk.tokenize

In [2]:
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# create sample documents
doc_a = "Batman became popular soon after his introduction and gained his own comic book title, Batman, in 1940."

doc_b = "In 1971, Trump moved to Manhattan, where he became involved in larger construction projects, and used attractive architectural design to win public recognition."

doc_c = "Batman is, in his everyday identity, Bruce Wayne, a wealthy American business magnate living in Gotham City."

doc_d = "In 2001, Trump completed Trump World Tower, a 72-story residential tower across from the United Nations Headquarters."

doc_e = " Unlike most superheroes, Batman does not possess any superpowers; rather, he relies on his genius intellect, physical prowess, martial arts abilities, detective skills, science and technology, vast wealth, intimidation, and indomitable will. "

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:

    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-a7be45175137> in <module>()
----> 1 tokenizer = RegexpTokenizer(r'\w+')
      2 
      3 # create English stop words list
      4 en_stop = get_stop_words('en')
      5 

NameError: name 'RegexpTokenizer' is not defined

CRP


In [3]:
def CRP(topic, gamma):
    '''CRP gives the probability of topic assignment for specific vocabulary'''
    '''Return a j * 1 vector, where j is the number of topic'''
    cm = []
    m = sum([len(x) for x in topic])
    p = gamma / (gamma + m) # prob for new topic
    cm.append(p)
    for j in range(len(topic)):
        p = len(topic[j]) / (gamma + m) # prob for existing topics
        cm.append(p)
    return np.array(cm)

node sampling


In [4]:
def node_sampling(corpus_s, gamma):
    '''Node sampling samples the number of topics for next level'''
    topic = []    
    for corpus in corpus_s:
        for doc in corpus:
            cm = CRP(topic, gamma)
            theta = np.random.multinomial(1, (cm/sum(cm))).argmax()
            if theta == 0:
                # create new topic
                topic.append([doc])
            else:
                # existing topic
                topic[theta-1].append(doc)
    return topic

Z

$P(z_{i}=j\hspace{1ex}|\hspace{1ex}{\bf z}_{-i},{\bf w})\propto\frac{n_{-i,j}^{(w_{i})}+\beta}{n_{-i,j}^{(\cdot)}+W\beta}\frac{n_{-i,j}^{(d_{i})}+\alpha}{n_{-i,\cdot}^{(d_{i})}+T\alpha}$


In [5]:
def Z(corpus_s, topic, alpha, beta):
    '''Z distributes each vocabulary to topics'''
    '''Return a n * 1 vector, where n is the number of vocabularies'''
    n_vocab = sum([len(x) for x in corpus_s])
    # zm: n * 1
    # return the assignment of each vocabulary
    t_zm = np.zeros(n_vocab).astype('int')
    # z_assigned: j * 1
    # return a list of list topic where stores assigned vocabularies in each sublist
    z_assigned = [[] for _ in topic]
    z_doc = [[] for _ in topic]
    z_tmp = np.zeros((n_vocab, len(topic)))
    assigned = np.zeros((len(corpus_s), len(topic)))
    n = 0
    for i in range(len(corpus_s)):
        for d in range(len(corpus_s[i])): 
            wi = corpus_s[i][d]   
            for j in range(len(topic)):
                lik = (z_assigned[j].count(wi) + beta) / (assigned[i, j] + n_vocab * beta)
                pri = (len(z_assigned[j]) + alpha) / ((len(corpus_s[i]) - 1) + len(topic) * alpha)
                z_tmp[n, j] = lik * pri
                t_zm[n] = np.random.multinomial(1, (z_tmp[n,:]/sum(z_tmp[n,:]))).argmax()
            z_assigned[t_zm[n]].append(wi)
            z_doc[t_zm[n]].append(i)
            assigned[i, t_zm[n]] += 1
            n += 1
    z_assigned = [x for x in z_assigned if x != []]
    z_doc = [x for x in z_doc if x != []]
    return np.array(z_assigned)

C


In [6]:
def C(corpus_s, topic, gamma):
    cm = []
    for corpus in corpus_s:
        for word in corpus:
            for t in topic:
                if type(t) == list:
                    y = t.count(word)
                else:
                    y = t.tolist().count(word)
        H = np.random.poisson(lam=(2), size=(len(topic)))
        alpha = gamma*H
        temp = np.random.dirichlet(y + alpha).transpose()    
        cm.append((temp/sum(temp)).tolist())
    return np.array(cm)

wn


In [7]:
most_common = lambda x: Counter(x).most_common(1)[0][0]
def wn(c_m, corpus_s, topic):
    wn_topic = []
    for i, corpus in enumerate(corpus_s):
        for word in corpus:
            theta = np.random.multinomial(1, c_m[i]).argmax()
            wn_topic.append(theta)
    return np.array(wn_topic)

            
def gibbs_wn(c_m, corpus_s, topic, ite):
    n_vocab = sum([len(x) for x in corpus_s])
    wn_gibbs = np.empty((n_vocab, ite)).astype('int')
    for i in range(ite):
        wn_gibbs[:, i] = wn(c_m, corpus_s, topic)
    # drop first 1/10 data
    wn_gibbs = wn_gibbs[:, int(ite/10):]
    theta = [most_common(wn_gibbs[x]) for x in range(n_vocab)]
    
    wn_topic = [[] for _ in topic]
    wn_doc_topic = [[] for _ in topic]
    doc = 0
    n = 0
    for i, corpus_s in enumerate(corpus_s):
        if doc == i:
            for word in corpus_s:
                wn_doc_topic[theta[n]].append(word)
                n += 1
            for j in range(len(topic)):
                if wn_doc_topic[j] != []:
                    wn_topic[j].append(wn_doc_topic[j])
        wn_doc_topic = [[] for _ in topic]        
        doc += 1
    wn_topic = [x for x in wn_topic if x != []]
    return wn_topic

hLDA


In [8]:
def hLDA(corpus_s, gamma, alpha, beta, ite, level):
    
    # 1. Node sampling, samples max level L
    topic = node_sampling(corpus_s, gamma)
    
    def dis(corpus_s, gamma, alpha, beta, ite):
        
        # 2. z_m, samples topic from L
        z_topic = Z(corpus_s, topic, alpha, beta)
        
        # 3. c_m, samples path
        c_m = C(corpus_s, z_topic, gamma)
        
        # 4. w_n, distributes words into topics
        wn_topic = gibbs_wn(c_m, corpus_s, z_topic, ite)

        return wn_topic
    
    hLDA_tree = [[] for _ in range(level)]
    tmp_tree = []
    node = [[] for _ in range(level+1)]
    node[0].append(1)
    
    for i in range(level):
        if i == 0:
            wn_topic = dis(texts, gamma, alpha, beta, ite)
            topic = set([x for list in wn_topic[0] for x in list])
            hLDA_tree[0].append(topic)
            tmp_tree.append(wn_topic[1:])
            tmp_tree = tmp_tree[0]
            node[1].append(len(wn_topic[1:]))
        else:
            for j in range(sum(node[i])):
                if tmp_tree == []:
                    break
                wn_topic = dis(tmp_tree[0], gamma, alpha, beta, ite)
                topic = set([x for list in wn_topic[0] for x in list])
                hLDA_tree[i].append(topic)
                tmp_tree.remove(tmp_tree[0])
                if wn_topic[1:] != []:
                    tmp_tree.extend(wn_topic[1:])
                node[i+1].append(len(wn_topic[1:]))
        
    return hLDA_tree, node[:level]

In [10]:
texts = [['batman',
  'becam',
  'popular',
  'soon',
  'introduct',
  'gain',
  'comic',
  'book',
  'titl',
  'batman',
  '1940'],
 ['1971',
  'trump',
  'move',
  'manhattan',
  'becam',
  'involv',
  'larger',
  'construct',
  'project',
  'use',
  'attract',
  'architectur',
  'design',
  'win',
  'public',
  'recognit'],
 ['batman',
  'everyday',
  'ident',
  'bruce',
  'wayn',
  'wealthi',
  'american',
  'busi',
  'magnat',
  'live',
  'gotham',
  'citi'],
 ['2001',
  'trump',
  'complet',
  'trump',
  'world',
  'tower',
  '72',
  'stori',
  'residenti',
  'tower',
  'across',
  'unit',
  'nation',
  'headquart'],
 ['unlik',
  'superhero',
  'batman',
  'possess',
  'superpow',
  'rather',
  'reli',
  'geniu',
  'intellect',
  'physic',
  'prowess',
  'martial',
  'art',
  'abil',
  'detect',
  'skill',
  'scienc',
  'technolog',
  'vast',
  'wealth',
  'intimid',
  'indomit',
  'will']]

In [12]:
hLDA(texts, 10, 0.1, 0.01, 10000, 4)


/Applications/Pineapple.app/Contents/Resources/python2.7/lib/python2.7/site-packages/ipykernel/__main__.py:7: RuntimeWarning: divide by zero encountered in divide
Out[12]:
([[{'1940',
    '1971',
    '2001',
    '72',
    'abil',
    'across',
    'american',
    'architectur',
    'art',
    'attract',
    'batman',
    'becam',
    'book',
    'bruce',
    'busi',
    'citi',
    'comic',
    'complet',
    'construct',
    'design',
    'detect',
    'everyday',
    'gain',
    'geniu',
    'gotham',
    'headquart',
    'ident',
    'indomit',
    'intellect',
    'intimid',
    'introduct',
    'involv',
    'larger',
    'live',
    'magnat',
    'manhattan',
    'martial',
    'move',
    'nation',
    'physic',
    'popular',
    'possess',
    'project',
    'prowess',
    'public',
    'rather',
    'recognit',
    'reli',
    'residenti',
    'scienc',
    'skill',
    'soon',
    'stori',
    'superhero',
    'superpow',
    'technolog',
    'titl',
    'tower',
    'trump',
    'unit',
    'unlik',
    'use',
    'vast',
    'wayn',
    'wealth',
    'wealthi',
    'will',
    'win',
    'world'}],
  [],
  [],
  []],
 [[1], [0], [], []])

In [36]:



  File "<ipython-input-36-a9ce39a5d074>", line 1
    sudo -H pip3 install ete2
               ^
SyntaxError: invalid syntax

In [ ]: