In [ ]:
from gensim import models  # not used in the cells below; the topic models are fit with scikit-learn

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns  # For prettier plots: importing seaborn restyles matplotlib's (and hence pandas') plot defaults
import nltk
import pyLDAvis
import pyLDAvis.sklearn

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

pyLDAvis.enable_notebook()
%matplotlib inline

In [3]:
# 2016 subset of the cleaned US-city reviews
review = pd.read_csv('../../../data/interim/clean_US_cities/2016_review.csv')

In [ ]:
# Alternative load: the full cleaned review set
# (the ~3.46M-row .describe() output below appears to come from this file)
review = pd.read_csv('../../../data/interim/clean_US_cities/review_clean.csv')

In [7]:
# How many businesses?

len(review['business_id'].unique())


Out[7]:
105893

In [100]:
# Number of reviews by date.
# The sharp single-day dips land on Christmas Day and New Year's Day;
# the sustained spikes come in summer, when people presumably have more free time.
# (A smoothed weekly view is sketched after this cell.)

review.groupby('date').agg({'review_id': len}).reset_index().plot(x='date', y='review_id', figsize=(10,6))


Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efe829c0c10>
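
The date column is stored as strings; the sketch below (assuming the values parse with pd.to_datetime, as the Yelp dump's dates do; the weekly_counts name is illustrative) resamples review counts to weekly totals, which smooths the one-day holiday dips and makes the summer rise easier to see.

In [ ]:
# Hedged sketch: weekly review counts from the string-typed date column.
weekly_counts = (review.assign(date=pd.to_datetime(review['date']))
                       .set_index('date')['review_id']
                       .resample('W')
                       .count())
weekly_counts.plot(figsize=(10, 6), title='Reviews per week')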

In [18]:
review.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393275 entries, 0 to 393274
Data columns (total 10 columns):
business_id    393275 non-null object
cool           393275 non-null int64
date           393275 non-null object
funny          393275 non-null int64
review_id      393275 non-null object
stars          393275 non-null float64
text           393275 non-null object
type           393275 non-null object
useful         393275 non-null int64
user_id        393275 non-null object
dtypes: float64(1), int64(3), object(6)
memory usage: 30.0+ MB

In [6]:
review.describe()


Out[6]:
               cool         funny         stars        useful
count  3.457015e+06  3.457015e+06  3.457015e+06  3.457015e+06
mean   5.461839e-01  4.455893e-01  3.741661e+00  1.048886e+00
std    2.035434e+00  1.832262e+00  1.428476e+00  2.744492e+00
min    0.000000e+00  0.000000e+00  1.000000e+00  0.000000e+00
25%    0.000000e+00  0.000000e+00  3.000000e+00  0.000000e+00
50%    0.000000e+00  0.000000e+00  4.000000e+00  0.000000e+00
75%    0.000000e+00  0.000000e+00  5.000000e+00  1.000000e+00
max    5.130000e+02  6.320000e+02  5.000000e+00  1.125000e+03

In [63]:
review['text'][0]


Out[63]:
'A Superficial place in Treasure Island, trying too hard to be upscale, however, the food quality does not meet its price tag, which is way overly priced.  Fast food quality, price higher than TAO, go figure. Staff also try too hard and gets annoying. No wonder their business is not good at all. TI is not an upscale place to visit either...'

In [4]:
review = review.fillna('')
review.isnull().sum()


Out[4]:
business_id    0
cool           0
date           0
funny          0
review_id      0
stars          0
text           0
type           0
useful         0
user_id        0
dtype: int64

In [5]:
# Keep terms that appear in at least 10 reviews but in no more than half of them,
# and cap the vocabulary at the 100 highest-frequency terms.
tvec = TfidfVectorizer(stop_words='english', min_df=10, max_df=0.5, max_features=100,
                       norm='l2',
                       strip_accents='unicode')
review_dtm_tfidf = tvec.fit_transform(review['text'])

cvec = CountVectorizer(stop_words='english', min_df=10, max_df=0.5, max_features=100,
                       strip_accents='unicode')
review_dtm_cvec = cvec.fit_transform(review['text'])

print(review_dtm_tfidf.shape, review_dtm_cvec.shape)


(393275, 100) (393275, 100)
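
Both matrices come out 393,275 x 100, so the two vectorizers should have kept the same terms; a quick hedged check of the retained vocabulary (get_feature_names() is the accessor at this scikit-learn version; newer releases renamed it to get_feature_names_out()):

In [ ]:
# Hedged sketch: confirm the two vectorizers kept the same 100 terms, then peek at a few.
print(sorted(cvec.get_feature_names()) == sorted(tvec.get_feature_names()))
print(sorted(cvec.get_feature_names())[:20])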

In [6]:
# Fitting LDA models (15 topics each).
# NB: scikit-learn >= 0.19 renames the n_topics argument to n_components.

# On the CountVectorizer DTM
lda_cvec = LatentDirichletAllocation(n_topics=15, random_state=42)
lda_cvec.fit(review_dtm_cvec)

# On the TF-IDF DTM
lda_tfidf = LatentDirichletAllocation(n_topics=15, random_state=42)
lda_tfidf.fit(review_dtm_tfidf)


Out[6]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=15, perp_tol=0.1, random_state=42,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)
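
pyLDAvis gives an interactive view of the fit below; a plain-text sketch of the same information first, pulling the ten heaviest terms per topic from the fitted model (feature_names and top_terms are illustrative names):

In [ ]:
# Hedged sketch: top-10 terms per topic for the count-vector model.
# components_ holds the unnormalized topic-word weights; argsort picks the largest.
feature_names = cvec.get_feature_names()
for topic_idx, weights in enumerate(lda_cvec.components_):
    top_terms = [feature_names[i] for i in weights.argsort()[:-11:-1]]
    print('Topic %2d: %s' % (topic_idx, ', '.join(top_terms)))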

In [34]:
# NB: this cell is from an earlier run with n_topics=20; refit the models above before re-running.
lda_viz_20_topics_cvec = pyLDAvis.sklearn.prepare(lda_cvec, review_dtm_cvec, cvec)
lda_viz_20_topics_cvec

# Presentation notes:
# - pyLDAvis sorts topics by the number of tokens they account for.
# - The underlying doc-topic table has one row per document and columns
#   "document_number", "t1", "t2", ..., "tn", giving the proportion of each
#   document assigned to each topic (see the sketch after this cell).


Out[34]:
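
A sketch of the doc-topic table described in the notes above: transform() on a fitted LatentDirichletAllocation returns the per-document topic distribution (the t1..tn column names follow the notes and are illustrative):

In [ ]:
# Hedged sketch: per-document topic proportions, one column per topic.
doc_topics = lda_cvec.transform(review_dtm_cvec)   # shape: (n_docs, n_topics)
doc_topic_df = pd.DataFrame(
    doc_topics,
    columns=['t%d' % (i + 1) for i in range(doc_topics.shape[1])])
doc_topic_df.head()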

In [35]:
# NB: from the same n_topics=20 run.
lda_viz_20_topics_tfidf = pyLDAvis.sklearn.prepare(lda_tfidf, review_dtm_tfidf, tvec)
lda_viz_20_topics_tfidf


Out[35]:

In [7]:
lda_viz_15_topics_cvec = pyLDAvis.sklearn.prepare(lda_cvec, review_dtm_cvec, cvec)
lda_viz_15_topics_cvec


Out[7]:

In [8]:
lda_viz_15_topics_tfidf = pyLDAvis.sklearn.prepare(lda_tfidf, review_dtm_tfidf, tvec)
lda_viz_15_topics_tfidf


Out[8]:
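
The inline widgets don't survive a static export; pyLDAvis.save_html writes a prepared visualization to a standalone HTML file (the filenames below are illustrative):

In [ ]:
# Hedged sketch: persist the 15-topic visualizations as standalone HTML.
pyLDAvis.save_html(lda_viz_15_topics_cvec, 'lda_15_topics_cvec.html')
pyLDAvis.save_html(lda_viz_15_topics_tfidf, 'lda_15_topics_tfidf.html')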

In [32]:
# NB: from an earlier run with n_topics=10; refit the models above before re-running.
lda_viz_10_topics_cvec = pyLDAvis.sklearn.prepare(lda_cvec, review_dtm_cvec, cvec)
lda_viz_10_topics_cvec


Out[32]:

In [29]:
# Prepared in the n_topics=10 run; the prepare() call is left commented out
# and the stored result is re-displayed.
# lda_viz_10_topics_tfidf = pyLDAvis.sklearn.prepare(lda_tfidf, review_dtm_tfidf, tvec)
lda_viz_10_topics_tfidf


Out[29]:

In [24]:
# NB: from an earlier run with n_topics=5; refit the models above before re-running.
lda_viz_5_topics_cvec = pyLDAvis.sklearn.prepare(lda_cvec, review_dtm_cvec, cvec)
lda_viz_5_topics_cvec


Out[24]:

In [27]:
# NB: from the same n_topics=5 run.
lda_viz_5_topics_tfidf = pyLDAvis.sklearn.prepare(lda_tfidf, review_dtm_tfidf, tvec)
lda_viz_5_topics_tfidf


Out[27]:
