In [13]:
from glob import glob
import re
import string
import sys
import codecs
import json

import funcy as fp
import nltk
import pandas as pd
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import hdpmodel, ldamodel, lsimodel
from nltk.corpus import stopwords
from textblob import TextBlob
import pyLDAvis
import pyLDAvis.gensim as gensimvis

In [8]:
stoplist = stopwords.words('english')
stoplist.extend(stopwords.words('french'))
stoplist.extend(["also","said","work","one","two","three", "les", "like"])

In [55]:
def tokenize(line):
  # extract lower-cased noun phrases (longer than 2 characters) from a
  # description; non-ASCII characters are dropped before TextBlob parses it
  return [str(w).lower() for w in TextBlob(line.encode('ascii', 'ignore')).noun_phrases if len(w) > 2]
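
A quick sanity check on the tokenizer (the sentence below is an arbitrary example, not from the dataset):

In [ ]:
print tokenize(u"A large bronze sculpture dominates the gallery's main entrance hall")
# e.g. ['large bronze sculpture', 'main entrance hall'] -- the exact output
# depends on TextBlob's noun-phrase extractor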

In [58]:
with codecs.open('kadist.json', 'r', encoding='utf-8') as f:
  data = json.loads(f.read())
  documents = [x['description'] for x in data]
  dictionary = corpora.Dictionary(tokenize(doc) for doc in documents)
  dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
  stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
  once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
  dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
  dictionary.compactify()

  corpus = [dictionary.doc2bow(tokenize(doc)) for doc in documents]

  tfidf = models.TfidfModel(corpus, normalize=True)

  corpus_tfidf = tfidf[corpus]
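
  # corpus_tfidf is not consumed by the LDA below (LDA works on raw term
  # counts); an illustrative consumer of tf-idf vectors is LSI -- a sketch
  # only, with num_topics=50 chosen to mirror the LDA setting
  lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)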

  print 'LDA', '*'*50
  lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=50, passes=10)
  # with formatted=False, this gensim version returns each topic as a list of
  # (weight, word) pairs; keep just the words
  for topic in lda.show_topics(num_topics=50, num_words=10, log=False, formatted=False):
    print [x[1] for x in topic]
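
The Dictionary and MmCorpus imports above suggest persisting the intermediate artifacts for reuse; a minimal sketch (the file names are placeholders):

In [ ]:
dictionary.save('kadist.dict')           # id <-> token mapping
MmCorpus.serialize('kadist.mm', corpus)  # bag-of-words vectors, Matrix Market format
# topic distribution for a single description, as (topic_id, probability) pairs
print lda[dictionary.doc2bow(tokenize(documents[0]))]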



In [49]:
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)


(pyLDAvis emits a skbio RuntimeWarning here about a small negative eigenvalue
from its PCoA step, -0.097 against a largest positive value of 0.958; by the
warning's own guidance, a magnitude that small is safe to ignore.)

Out[49]:
(the interactive pyLDAvis topic-map visualization renders here)
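
To keep a standalone copy of the visualization outside the notebook, pyLDAvis can write it to an HTML file; the file name here is just an example:

In [ ]:
pyLDAvis.save_html(vis_data, 'kadist_lda_vis.html')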
