In [13]:
from glob import glob
import re
import string
import funcy as fp
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
import nltk
import pandas as pd
from gensim import corpora, models, similarities
from gensim.models import hdpmodel, ldamodel, lsimodel
import sys
import codecs
import json
from nltk.corpus import stopwords
from textblob import TextBlob
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [8]:
# Stopword list: NLTK's English stopwords plus corpus-specific noise terms
# that would otherwise dominate the topics.
extra_terms = ["also","said","work","one","two","three", "les", "like"]
stoplist = stopwords.words('english')
stoplist.extend(extra_terms)

In [55]:
def tokenize(line):
  """Extract lowercased noun phrases longer than 2 characters from a line.

  The line is first ASCII-folded (non-ASCII characters dropped) before
  being handed to TextBlob's noun-phrase extractor.
  """
  ascii_line = line.encode('ascii', 'ignore')
  phrases = TextBlob(ascii_line).noun_phrases
  return [str(phrase).lower() for phrase in phrases if len(phrase) > 2]

In [58]:
with'kadist.json', 'r', encoding='utf-8') as f:
  data = json.loads(
  documents = [x['description'] for x in data]
  dictionary = corpora.Dictionary(doc) for doc in documents)
  dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
  stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
  once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 3]
  dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once

  corpus = [dictionary.doc2bow(doc.split()) for doc in documents]

  tfidf = models.TfidfModel(corpus, normalize=True)

  corpus_tfidf = tfidf[corpus]

  print 'LDA', '*'*50
  lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=50, passes=10)
  for topic in lda.show_topics(num_topics=50, num_words=10, log=False, formatted=False):
    print [x[1] for x in topic]

TypeError                                 Traceback (most recent call last)
<ipython-input-58-02293a80fac7> in <module>()
      2   data = json.loads(
      3   documents = [x['description'] for x in data]
----> 4   dictionary = corpora.Dictionary(doc for doc in documents)
      5   dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
      6   stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]

/usr/local/lib/python2.7/dist-packages/gensim/corpora/dictionary.pyc in __init__(self, documents, prune_at)
     57         if documents is not None:
---> 58             self.add_documents(documents, prune_at=prune_at)

/usr/local/lib/python2.7/dist-packages/gensim/corpora/dictionary.pyc in add_documents(self, documents, prune_at)
    126             # update Dictionary with the document
--> 127             self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
    129"built %s from %i documents (total %i corpus positions)",

/usr/local/lib/python2.7/dist-packages/gensim/corpora/dictionary.pyc in doc2bow(self, document, allow_update, return_missing)
    147         """
    148         if isinstance(document, string_types):
--> 149             raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
    151         # Construct (word, frequency) mapping.

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [49]:
# Prepare the interactive pyLDAvis topic visualisation from the fitted model.
# Depends on `lda`, `corpus`, and `dictionary` from the cell above.
vis_data = gensimvis.prepare(lda, corpus, dictionary)

/usr/local/lib/python2.7/dist-packages/skbio/stats/ordination/ RuntimeWarning: The result contains negative eigenvalues. Please compare their magnitude with the magnitude of some of the largest positive eigenvalues. If the negative ones are smaller, it's probably safe to ignore them, but if they are large in magnitude, the results won't be useful. See the Notes section for more details. The smallest eigenvalue is -0.0968938410784 and the largest is 0.958312633489.

In [11]:

In [ ]: