In [2]:
import gensim


/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/numpy/lib/utils.py:95: DeprecationWarning: `scipy.sparse.sparsetools` is deprecated!
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  warnings.warn(depdoc, DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)
/home/harshit/anaconda/lib/python2.7/site-packages/scipy/lib/_util.py:67: DeprecationWarning: Module scipy.linalg.blas.fblas is deprecated, use scipy.linalg.blas instead
  DeprecationWarning)

In [3]:
import pandas as pd
import nltk

In [4]:
# Quick look at the top-level structure of one hotel file; the bare
# expression is the cell's last statement, so the notebook displays it.
pd.read_json(
    "/home/harshit/trip-advisor/trip-advisor-dataset/json/72572.json",
    typ="series",
)


Out[4]:
HotelInfo    {u'Name': u'BEST WESTERN PLUS Pioneer Square H...
Reviews      [{u'Ratings': {u'Service': u'4', u'Cleanliness...
dtype: object

In [5]:
# Location of the single hotel's review file used for the rest of the notebook.
data_file = (
    "/home/harshit/trip-advisor/trip-advisor-dataset/json/"
    "72572.json"
)

In [6]:
# Pull the whole file into memory as a list of lines (line endings kept).
with open(data_file, "rb") as f:
    data = list(f)

In [7]:
# Wrap the file's lines in a JSON array and let pandas parse the result.
# Assumes every non-blank line is one complete JSON document — TODO confirm
# against the dataset. Blank lines (for example a trailing newline at the end
# of the file) are skipped: joining them would leave a dangling comma and
# make the array invalid JSON.
data_json_str = "[" + ",".join(line for line in data if line.strip()) + "]"
data_df = pd.read_json(data_json_str)

In [8]:
# Number of review records stored in the first row's "Reviews" cell.
num_reviews_tpadv = len(data_df.loc[0, "Reviews"])

In [9]:
# Accumulator for the raw review texts; filled by a later cell.
all_reviews = list()

In [10]:
# Collect the raw text of every review of the hotel in the first row.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot duplicate reviews.
all_reviews = [review["Content"] for review in data_df["Reviews"][0]]

In [12]:
import re

In [14]:
from nltk.tokenize import RegexpTokenizer

In [15]:
# Tokenizer that yields each maximal run of word characters as one token.
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [17]:
# Cache for the stop-word set so it is loaded from NLTK only once.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(
            nltk.corpus.stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def clean_review(text):
    """Turn one raw review string into a list of lowercase content words.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split with the module-level ``tokenizer``, and
    English stop words are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order (duplicates kept).
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = tokenizer.tokenize(letters_only.lower())
    # The stop-word set is the same for every review, so it is fetched from
    # the cache instead of being re-read and re-hashed on each call.
    stops = _english_stopwords()
    return [w for w in words if w not in stops]

In [18]:
# Accumulator for the tokenised reviews; filled by a later cell.
clean_reviews = list()

In [19]:
# Tokenise and clean every review of the hotel in the first row.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot feed duplicated documents into the dictionary and corpus.
clean_reviews = [
    clean_review(review["Content"]) for review in data_df["Reviews"][0]
]

In [20]:
from gensim import corpora, models, similarities

In [21]:
# Build the token <-> integer id mapping from the cleaned reviews.
dictionary = corpora.Dictionary(documents=clean_reviews)

In [22]:
# NOTE(review): no tokens are filtered out of `dictionary` between its
# construction and this call in the visible cells, so this presumably leaves
# the ids unchanged — confirm whether the call is still needed.
dictionary.compactify()

In [23]:
# Convert each cleaned review into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, clean_reviews))

In [24]:
import numpy as np

# LDA training starts from random values, so without a fixed seed the topics
# shown below change on every run. NOTE(review): this assumes the installed
# gensim draws from numpy's global RNG when no explicit random state is
# passed — confirm for the version in use.
np.random.seed(0)

# Train a 10-topic model with 20 passes over the corpus; alpha="auto" asks
# gensim to learn the document-topic prior from the data.
lda = models.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=10,
    passes=20,
    alpha="auto",
)

In [25]:
# Display the trained topics; as the cell's last expression, the returned
# list of (topic id, weighted-word string) pairs is rendered by the notebook.
lda.show_topics()


Out[25]:
[(0,
  u'0.023*hotel + 0.013*seattle + 0.012*room + 0.010*square + 0.009*stay + 0.008*would + 0.008*best + 0.008*pioneer + 0.007*western + 0.007*cruise'),
 (1,
  u'0.030*hotel + 0.015*room + 0.012*staff + 0.011*pioneer + 0.010*square + 0.010*seattle + 0.008*good + 0.008*place + 0.007*breakfast + 0.007*close'),
 (2,
  u'0.027*hotel + 0.016*room + 0.011*seattle + 0.007*us + 0.007*one + 0.007*area + 0.007*would + 0.007*night + 0.007*free + 0.006*pioneer'),
 (3,
  u'0.034*hotel + 0.016*great + 0.015*room + 0.014*breakfast + 0.012*seattle + 0.011*staff + 0.010*location + 0.010*clean + 0.009*stay + 0.009*pioneer'),
 (4,
  u'0.013*hotel + 0.011*room + 0.009*staff + 0.008*stay + 0.006*rooms + 0.006*business + 0.006*us + 0.006*shops + 0.005*breakfast + 0.005*elevator'),
 (5,
  u'0.030*hotel + 0.016*seattle + 0.010*staff + 0.009*location + 0.008*stay + 0.007*great + 0.007*square + 0.007*pioneer + 0.007*breakfast + 0.007*walking'),
 (6,
  u'0.006*minutes + 0.004*interesting + 0.003*offered + 0.003*bottled + 0.003*locale + 0.003*bag + 0.003*amtrack + 0.003*cafes + 0.003*touristy + 0.003*imagine'),
 (7,
  u'0.013*hotel + 0.012*good + 0.011*location + 0.010*breakfast + 0.010*staff + 0.009*friendly + 0.009*free + 0.008*stay + 0.008*area + 0.007*nights'),
 (8,
  u'0.022*hotel + 0.015*room + 0.014*great + 0.011*seattle + 0.010*staff + 0.010*square + 0.010*pioneer + 0.009*us + 0.008*stay + 0.008*breakfast'),
 (9,
  u'0.022*hotel + 0.016*room + 0.012*good + 0.011*great + 0.009*staff + 0.008*breakfast + 0.007*stay + 0.007*square + 0.007*walking + 0.006*us')]

In [26]:
import pyLDAvis.gensim

In [27]:
import pyLDAvis.gensim as gensimvis

In [28]:
# Prepare the pyLDAvis input from the trained model, the bag-of-words corpus
# it was trained on, and the dictionary that maps ids back to tokens.
vis_data = gensimvis.prepare(lda, corpus, dictionary)

In [29]:
# Show the prepared visualisation as this cell's output.
# `pyLDAvis` is bound by the earlier `import pyLDAvis.gensim` cell.
pyLDAvis.display(vis_data)


Out[29]: