In [2]:
# Text file that LineSentence reads, both to build the dictionary and to
# generate the bag-of-words corpus.
input_fname = '../data/processed/final_processed_trigram_posts.txt'
In [9]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle
In [10]:
# Destination the filtered Dictionary is saved to.
dictionary_fname = '../data/processed/trigram.dict'
In [13]:
# Build the vocabulary from the posts that LineSentence iterates over.
trigram_posts = LineSentence(input_fname)
trigram_dict = Dictionary(documents=trigram_posts)

# Prune the vocabulary. Per gensim's filter_extremes, no_below is a minimum
# document count and no_above a maximum fraction of documents.
trigram_dict.filter_extremes(no_above=0.35, no_below=10)

# Reassign ids after pruning, then persist the dictionary for later reuse.
trigram_dict.compactify()
trigram_dict.save(dictionary_fname)
In [14]:
# File that MmCorpus.serialize writes the bag-of-words corpus to and that
# MmCorpus reopens afterwards.
trigram_corpus_fname = '../data/processed/trigram.mm'
In [15]:
def bow_generator(fname):
    """Lazily yield one bag-of-words vector per item LineSentence reads from `fname`.

    Each item is converted with the notebook-level `trigram_dict.doc2bow`, so
    `trigram_dict` must already exist when the generator is consumed.
    """
    posts = LineSentence(fname)
    for tokens in posts:
        bow = trigram_dict.doc2bow(tokens)
        yield bow
In [16]:
# Stream the bag-of-words vectors to disk, then reopen the written file as the
# corpus object used for training.
MmCorpus.serialize(fname=trigram_corpus_fname,
                   corpus=bow_generator(input_fname))
trigram_bows = MmCorpus(fname=trigram_corpus_fname)
In [17]:
# Destination the trained LDA model is saved to.
lda_model_fname = '../data/processed/lda-trigram.model'
In [18]:
# Train a 50-topic LDA model on the serialized bag-of-words corpus.
# Warnings are silenced only for the duration of the training call.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # random_state pins the model's random initialisation so that topic ids
    # (such as the one inspected with explore_topic(15)) are not reshuffled on
    # every Restart & Run All.
    # NOTE(review): with several workers, chunk scheduling may still cause
    # small run-to-run differences; confirm against the gensim docs if exact
    # reproducibility is required.
    lda = LdaMulticore(trigram_bows, num_topics=50, id2word=trigram_dict,
                       workers=7, random_state=42)

# Persist the trained model so it can be reloaded without retraining.
lda.save(lda_model_fname)
In [19]:
def explore_topic(topic, topn=20, model=None):
    """Print the top terms of one topic, one ``term weight`` row per line.

    Args:
        topic: Topic id forwarded to ``show_topic``.
        topn: Number of terms to print (default 20).
        model: Any object exposing ``show_topic(topic, topn=...)`` that returns
            ``(term, weight)`` pairs. When omitted, the notebook-level ``lda``
            model is used, so existing ``explore_topic(15)`` calls are unchanged.

    Returns:
        None; the rows are written to standard output.
    """
    # Resolve the fallback at call time so a retrained or reloaded `lda` is used.
    topic_model = lda if model is None else model
    for term, prob in topic_model.show_topic(topic, topn=topn):
        # Term is padded to a minimum width of 20; weight shows three decimals.
        print('{:20} {:.3f}'.format(term, prob))
In [29]:
# Show the top terms of topic 15 using the default number of terms.
explore_topic(topic=15)
In [ ]: