In [1]:
import codecs
import json

from gensim import corpora, models
from gensim.models import ldamodel
from nltk.corpus import stopwords
from textblob import TextBlob
import pyLDAvis
import pyLDAvis.gensim as gensimvis
In [ ]:
# English + French stop words, plus corpus-specific noise terms.
stoplist = stopwords.words('english')
stoplist.extend(stopwords.words('french'))
stoplist.extend(["also", "said", "work", "one", "two", "three", "les", "like"])
In [ ]:
def tokenize(line):
    # Extract TextBlob noun phrases, keeping lowercased tokens longer than two characters.
    noun_phrases = TextBlob(line.encode('ascii', 'ignore').decode('ascii')).noun_phrases
    return [str(w).lower() for w in noun_phrases if len(w) > 2]
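A quick sanity check of the tokenizer (this assumes the TextBlob/NLTK corpora are already downloaded; the sample sentence is made up):
In [ ]:
# Hypothetical input; noun-phrase extraction typically yields multi-word terms,
# e.g. something like ['museum of modern art', 'bronze sculptures'].
# The exact output depends on TextBlob's noun-phrase extractor.
tokenize(u'The Museum of Modern Art acquired several bronze sculptures.')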
In [2]:
def make_documents(data):
    # Each record's 'generated_tags' list becomes one space-joined document string.
    return [' '.join(x['generated_tags']) for x in data]
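The expected record shape can be illustrated with a tiny made-up example:
In [ ]:
# Hypothetical record; only the 'generated_tags' field is used.
sample = [{'generated_tags': ['sculpture', 'bronze', 'figure']}]
make_documents(sample)  # -> ['sculpture bronze figure']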
In [3]:
with codecs.open('kadist.json', 'r', encoding='utf-8') as f:
    data = json.loads(f.read())
documents = make_documents(data)

def preprocess(text):
    # ASCII-fold, lowercase, and split on whitespace; the same normalization
    # is applied when building both the dictionary and the bag-of-words corpus.
    return text.encode('ascii', 'ignore').decode('ascii').lower().split()

dictionary = corpora.Dictionary(preprocess(text) for text in documents)
# Drop tokens in fewer than 5 documents or in more than half of them.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
# Remove any stop words that survived the frequency filter.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
dictionary.compactify()
corpus = [dictionary.doc2bow(preprocess(doc)) for doc in documents]

# tf-idf view of the corpus; the LDA model below trains on raw counts,
# but this is useful for LSI or similarity queries.
tfidf = models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]

print('LDA', '*' * 50)
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=200, passes=10)
# With formatted=False, show_topics returns (topic_id, [(word, weight), ...]) pairs.
for topic_id, words in lda.show_topics(num_topics=200, num_words=10, log=False, formatted=False):
    print([word for word, weight in words])
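To avoid retraining on every run, the dictionary, corpus, and model can be persisted with gensim's standard save APIs (the file names here are arbitrary):
In [ ]:
from gensim.corpora import MmCorpus

# Serialize the bag-of-words corpus in Matrix Market format and save the rest.
MmCorpus.serialize('kadist_bow.mm', corpus)
dictionary.save('kadist.dict')
lda.save('kadist.lda')
# Reload later with MmCorpus('kadist_bow.mm'), corpora.Dictionary.load('kadist.dict'),
# and ldamodel.LdaModel.load('kadist.lda').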
In [4]:
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)
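The interactive view can also be written out as a standalone HTML page (the file name is arbitrary):
In [ ]:
# save_html writes a self-contained page that works outside the notebook.
pyLDAvis.save_html(vis_data, 'kadist_lda_vis.html')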