In [ ]:
from gensim import models  # NOTE(review): never used in the visible cells — candidate for removal
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns # For prettier plots. Seaborn takes over pandas' default plotter
import nltk  # NOTE(review): imported but never used in the visible cells
import pyLDAvis
import pyLDAvis.sklearn  # NOTE(review): this submodule was removed in pyLDAvis >= 3.4 (replaced by pyLDAvis.lda_model) — confirm installed version
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
pyLDAvis.enable_notebook()  # render pyLDAvis topic visualizations inline in the notebook
%matplotlib inline
In [3]:
# Load the 2016 subset of the cleaned US-city reviews.
# NOTE(review): the very next cell overwrites `review` with the full cleaned
# file — only one of these two load cells should be kept/run.
review = pd.read_csv('../../../data/interim/clean_US_cities/2016_review.csv')
In [ ]:
# Load the full cleaned review file (overwrites the 2016 subset loaded in the previous cell).
review = pd.read_csv('../../../data/interim/clean_US_cities/review_clean.csv')
In [7]:
# How many distinct businesses appear in the review set?
# (`nunique(dropna=False)` is equivalent to `len(Series.unique())`:
# NaN, if present, is counted once by both.)
review['business_id'].nunique(dropna=False)
Out[7]:
In [100]:
# Daily review volume over time.
# The sharp seasonal dips are Christmas Day and New Year's Day; the sharp
# summer spikes presumably reflect people having more free time.
reviews_per_day = review.groupby('date').agg({'review_id': len}).reset_index()
reviews_per_day.plot(x='date', y='review_id', figsize=(10, 6))
Out[100]:
In [18]:
# Column dtypes, non-null counts, and memory footprint of the loaded frame.
review.info()
In [6]:
# Summary statistics for the numeric columns.
review.describe()
Out[6]:
In [63]:
# Inspect one raw review text.
# NOTE(review): `review['text'][0]` is label-based lookup on the index, not
# positional — it works here only if read_csv produced a default RangeIndex
# containing label 0; `review['text'].iloc[0]` would be position-safe.
review['text'][0]
Out[63]:
In [4]:
# Replace missing values with empty strings so the vectorizers below don't
# fail on NaN in the text column.
# NOTE(review): this fills NaN in *every* column with '' (mixing types into
# any numeric column that had NaN) — assumes only `text` matters downstream;
# confirm, or restrict to review['text'].
review = review.fillna('')
# Sanity check: no nulls should remain.
review.isnull().sum()
Out[4]:
In [5]:
# Build tf-idf and raw-count document-term matrices over the review text.
# Shared settings: english stop words, keep terms appearing in >= 10 documents
# but in at most 50% of documents, cap vocabulary at the 100 most frequent
# terms, and strip unicode accents.
shared_vec_params = dict(stop_words='english', min_df=10, max_df=0.5,
                         max_features=100, strip_accents='unicode')
tvec = TfidfVectorizer(norm='l2', **shared_vec_params)
review_dtm_tfidf = tvec.fit_transform(review['text'])
cvec = CountVectorizer(**shared_vec_params)
review_dtm_cvec = cvec.fit_transform(review['text'])
# Fix: the Python-2-only `print` statement is a SyntaxError on a Python 3
# kernel; use the print function so the cell runs on modern Jupyter.
print(review_dtm_tfidf.shape, review_dtm_cvec.shape)
In [6]:
# Fit two 15-topic LDA models, one per document-term matrix, with a fixed
# seed for reproducibility.
# Fix: scikit-learn renamed `n_topics` to `n_components` in 0.19 and removed
# the old name in 0.21, so `n_topics=` raises a TypeError on modern installs.
# On the count-vectorized DTM
lda_cvec = LatentDirichletAllocation(n_components=15, random_state=42)
lda_cvec.fit(review_dtm_cvec)
# On the tf-idf DTM (LDA assumes raw counts; the tf-idf fit is exploratory)
lda_tfidf = LatentDirichletAllocation(n_components=15, random_state=42)
lda_tfidf.fit(review_dtm_tfidf)
Out[6]:
In [34]:
# NOTE(review): the variable name says "20 topics", but the lda_cvec model
# fitted above uses 15 topics — the name is stale from an earlier run.
lda_viz_20_topics_cvec = pyLDAvis.sklearn.prepare(lda_cvec, review_dtm_cvec, cvec)
lda_viz_20_topics_cvec
# Presentation notes:
# - Topics sorted by number of tokens or words
# Stuff:
# Distribution of topics to documents. Columns are "document_number", "t1", "t2", "t3", ..."tn" giving the % of
# each document attributed to each topic (comment left unfinished in the original).
Out[34]:
In [35]:
# NOTE(review): "tdidf" is a typo for "tfidf", and the name says 20 topics
# while lda_tfidf above was fit with 15 — both names are stale/mistyped.
lda_viz_20_topics_tdidf = pyLDAvis.sklearn.prepare(lda_tfidf, review_dtm_tfidf, tvec)
lda_viz_20_topics_tdidf
Out[35]:
In [7]:
# Interactive topic visualization for the count-vectorizer LDA model
# (this is the one name that matches the actual 15-topic fit above).
lda_viz_15_topics_cvec = pyLDAvis.sklearn.prepare(lda_cvec, review_dtm_cvec, cvec)
lda_viz_15_topics_cvec
Out[7]:
In [8]:
# Interactive topic visualization for the tf-idf LDA model
# (NOTE(review): "tdidf" is a typo for "tfidf").
lda_viz_15_topics_tdidf = pyLDAvis.sklearn.prepare(lda_tfidf, review_dtm_tfidf, tvec)
lda_viz_15_topics_tdidf
Out[8]:
In [32]:
# NOTE(review): the name says 10 topics, but lda_cvec was fit with 15 — this
# renders the same model as the earlier cells; name is stale from a prior run.
lda_viz_10_topics_cvec = pyLDAvis.sklearn.prepare(lda_cvec, review_dtm_cvec, cvec)
lda_viz_10_topics_cvec
Out[32]:
In [29]:
# Fix: the defining line was commented out while the next line still displayed
# the variable, so Restart-and-Run-All raised NameError. Restore the definition.
# NOTE(review): despite the "10_topics" name, lda_tfidf above was fit with 15
# topics ("tdidf" is also a typo for "tfidf") — the name is stale.
lda_viz_10_topics_tdidf = pyLDAvis.sklearn.prepare(lda_tfidf, review_dtm_tfidf, tvec)
lda_viz_10_topics_tdidf
Out[29]:
In [24]:
# NOTE(review): the name says 5 topics, but lda_cvec was fit with 15 — this
# renders the same model yet again; the duplicated viz cells differ only in
# (stale) variable names and should be consolidated into one call per model.
lda_viz_5_topics_cvec = pyLDAvis.sklearn.prepare(lda_cvec, review_dtm_cvec, cvec)
lda_viz_5_topics_cvec
Out[24]:
In [27]:
# NOTE(review): the name says 5 topics, but lda_tfidf was fit with 15 — the
# name is stale; this renders the same tf-idf model as the earlier cells.
lda_viz_5_topics_tfidf = pyLDAvis.sklearn.prepare(lda_tfidf, review_dtm_tfidf, tvec)
lda_viz_5_topics_tfidf
Out[27]:
In [ ]: