In [1]:
from glob import glob
import re
import string
import funcy as fp
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
import nltk
import pandas as pd
from gensim import corpora, models, similarities
from gensim.models import hdpmodel, ldamodel, lsimodel
import sys
import codecs
import json
from nltk.corpus import stopwords
from textblob import TextBlob
import pyLDAvis.gensim as gensimvis
import pyLDAvis


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-f15353c2f249> in <module>()
      6 from gensim.corpora import Dictionary, MmCorpus
      7 import nltk
----> 8 import pandas as pd
      9 from gensim import corpora, models, similarities
     10 from gensim.models import hdpmodel, ldamodel, lsimodel

/Library/Python/2.7/site-packages/pandas/__init__.py in <module>()
     36     raise ImportError('pandas {0} is incompatible with numpy < 1.7.0, '
     37                       'your numpy version is {1}. Please upgrade numpy to'
---> 38                       ' >= 1.7.0 to use pandas version {0}'.format(__version__,
     39                                                                    _np_version))
     40 

NameError: name '__version__' is not defined

In [ ]:
# Stop-word list: NLTK's English list, then its French list, then a handful of
# extra words chosen by hand for this corpus.
stoplist = (
  stopwords.words('english')
  + stopwords.words('french')
  + ["also", "said", "work", "one", "two", "three", "les", "like"]
)

In [ ]:
def tokenize(line):
  """Extract lower-cased noun phrases and all-caps words from a line of text.

  Non-ASCII characters are dropped before the text is parsed, and only items
  longer than two characters are kept.

  Args:
    line: the text to tokenize.

  Returns:
    A list of lower-cased strings: the TextBlob noun phrases first, followed
    by the all-caps words found in the line.
  """
  # Strip non-ASCII characters but keep the value as text (not bytes) so the
  # same code works on both Python 2 and Python 3.
  ascii_line = line.encode('ascii', 'ignore').decode('ascii')
  noun_phrases = list(TextBlob(ascii_line).noun_phrases)
  # NOTE(review): the original used an undefined name `allcaps`, which raised
  # NameError on every call. It is assumed to mean words written entirely in
  # capital letters (e.g. acronyms) -- confirm against the intended behaviour.
  allcaps = re.findall(r'\b[A-Z]{2,}\b', ascii_line)
  return [str(w).lower() for w in noun_phrases + allcaps if len(w) > 2]

In [2]:
def make_documents(data):
  """Turn each record into a single space-separated string of its tags.

  Args:
    data: iterable of mappings, each with a 'generated_tags' entry holding
      an iterable of strings.

  Returns:
    A list with one string per record, in the same order as `data`.
  """
  documents = []
  for record in data:
    tags = record['generated_tags']
    documents.append(' '.join(tags))
  return documents

In [3]:
# Load the raw records. The file is only needed while the JSON is parsed, so
# it is closed before any of the (slow) modelling work starts.
with codecs.open('kadist.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

documents = make_documents(data)

# Tokenise every document exactly once (ASCII-only, lower-cased, split on
# whitespace) and reuse the same tokens for both the dictionary and the
# bag-of-words corpus. Previously the corpus was built from the raw,
# un-normalised text, so any token containing capitals or non-ASCII
# characters never matched a dictionary entry and was silently dropped.
texts = [text.encode('ascii', 'ignore').decode('ascii').lower().split() for text in documents]

dictionary = corpora.Dictionary(texts)
# Keep tokens that occur in at least 5 documents and in no more than half of
# all documents, capped at the 100000 most frequent.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)  # remove stop words
dictionary.compactify()

corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# The TF-IDF view is built here, but the LDA model below is trained on the
# raw bag-of-words counts in `corpus`, not on `corpus_tfidf`.
tfidf = models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]

# Single source of truth for the topic count used for training and display.
num_topics = 200

# print(...) with a single argument behaves the same on Python 2 and 3.
print('LDA ' + '*'*50)
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=num_topics, passes=10)
# NOTE(review): assumes show_topics(formatted=False) yields, per topic, a
# sequence of pairs whose second element is the word -- confirm against the
# installed gensim version.
for topic in lda.show_topics(num_topics=num_topics, num_words=10, log=False, formatted=False):
  print([x[1] for x in topic])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-aa879328781f> in <module>()
----> 1 with codecs.open('kadist.json', 'r', encoding='utf-8') as f:
      2   data = json.loads(f.read())
      3   documents = make_documents(data)
      4   dictionary = corpora.Dictionary(text.encode('ascii', 'ignore').lower().split() for text in documents)
      5   dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

NameError: name 'codecs' is not defined

In [4]:
# Prepare the fitted LDA model, its bag-of-words corpus and the dictionary for
# pyLDAvis, then render the result as this cell's output.
vis_data = gensimvis.prepare(
  lda,
  corpus,
  dictionary
)
pyLDAvis.display(vis_data)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-9f5abe097c4a> in <module>()
----> 1 vis_data = gensimvis.prepare(lda, corpus, dictionary)
      2 pyLDAvis.display(vis_data)

NameError: name 'gensimvis' is not defined

In [4]:


In [ ]: