SE Economics - Latent Dirichlet Allocation Analysis (Trigram)


In [2]:
# Text file of posts that LineSentence reads, both to build the dictionary and to build the bag-of-words corpus.
input_fname = '../data/processed/final_processed_trigram_posts.txt'

In [9]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import LineSentence

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

In [10]:
# Path the filtered gensim Dictionary is saved to (see trigram_dict.save).
dictionary_fname = '../data/processed/trigram.dict'

In [13]:
# Stream the posts from input_fname and build the token <-> id vocabulary from them.
trigram_posts = LineSentence(input_fname)
trigram_dict = Dictionary(trigram_posts)
# Prune the vocabulary. Per gensim's Dictionary.filter_extremes documentation,
# no_below=10 removes tokens that occur in fewer than 10 documents and
# no_above=0.35 removes tokens that occur in more than 35% of documents.
trigram_dict.filter_extremes(no_below=10, no_above=0.35)
# Reassign token ids so they are contiguous after pruning.
# NOTE(review): filter_extremes may already compact the ids internally, which
# would make this call redundant (but harmless) -- confirm for the installed gensim.
trigram_dict.compactify()
# Persist the pruned dictionary to dictionary_fname.
trigram_dict.save(dictionary_fname)

In [14]:
# File that MmCorpus.serialize writes the bag-of-words corpus to and that MmCorpus reads it back from.
trigram_corpus_fname = '../data/processed/trigram.mm'

In [15]:
def bow_generator(fname):
    """Lazily yield one bag-of-words vector per post stored in ``fname``.

    The file is iterated with ``LineSentence`` and every item it produces is
    converted with the notebook-level ``trigram_dict.doc2bow``. Nothing is
    read until the generator is consumed, so ``trigram_dict`` must already be
    defined by then.

    Parameters
    ----------
    fname : str
        Path of the text file handed to ``LineSentence``.

    Yields
    ------
    The value returned by ``trigram_dict.doc2bow`` for each post, in file order.
    """
    sentences = LineSentence(fname)
    for tokens in sentences:
        yield trigram_dict.doc2bow(tokens)

In [16]:
# Write the bag-of-words stream produced by bow_generator to trigram_corpus_fname,
# then reopen that file as trigram_bows, the corpus object passed to the LDA trainer.
MmCorpus.serialize(trigram_corpus_fname, bow_generator(input_fname))
trigram_bows = MmCorpus(trigram_corpus_fname)

In [17]:
# Path the trained LDA model is written to by lda.save().
lda_model_fname = '../data/processed/lda-trigram.model'

In [18]:
# Seed for the LDA trainer. Without a fixed random_state every run starts from
# a different random initialisation, so topic numbers and their top terms
# change on each Restart & Run All and printed topics cannot be reproduced.
# NOTE(review): with several worker processes gensim may still show small
# run-to-run differences even when seeded -- confirm if exact reproducibility
# matters.
LDA_RANDOM_STATE = 42

with warnings.catch_warnings():
    # Suppress every warning raised while training and saving; the filter is
    # restored when this block exits, so later cells are unaffected.
    warnings.simplefilter('ignore')

    # Train a 50-topic model on the serialized bag-of-words corpus.
    # workers=7 is the number of extra worker processes requested; tune it to
    # the machine's core count.
    lda = LdaMulticore(
        trigram_bows,
        num_topics=50,
        id2word=trigram_dict,
        workers=7,
        random_state=LDA_RANDOM_STATE,
    )

    # Persist the trained model to lda_model_fname.
    lda.save(lda_model_fname)

In [19]:
def explore_topic(topic, topn=20):
    """Print the leading terms of one topic of the notebook-level ``lda`` model.

    Each (term, probability) pair returned by ``lda.show_topic`` is printed on
    its own line: the term in a 20-character-wide column, a space, then the
    probability with three decimal places.

    Parameters
    ----------
    topic : int
        Topic identifier forwarded to ``lda.show_topic``.
    topn : int, optional
        Number of terms requested from ``lda.show_topic`` (default 20).

    Returns
    -------
    None
    """
    row_template = '{:20} {:.3f}'
    top_terms = lda.show_topic(topic, topn=topn)
    for word, weight in top_terms:
        print(row_template.format(word, weight))

In [29]:
# Print the terms of topic 15 with their probabilities (topn defaults to 20).
explore_topic(15)


$                    0.015
dollar               0.012
good                 0.010
market               0.008
economic             0.008
change               0.007
mean                 0.007
price                0.006
do_not               0.006
use                  0.006
's                   0.006
time                 0.005
economy              0.005
money                0.004
currency             0.004
1                    0.004
value                0.004
think                0.004
country              0.004
like                 0.004

In [ ]: