In [1]:
import pandas as pd
import pickle
import numpy as np
# Load the bar review dataset
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized.pickle')
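A quick sanity check (not part of the original run) that the frame has the two columns used below, business_id and cleaned_tokenized:
In [ ]:
# Peek at the loaded frame; the merging step below assumes the
# business_id and cleaned_tokenized columns exist.
print(review.shape)
print(review.columns.tolist())
review.head()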
In [2]:
from itertools import chain
from collections import OrderedDict
reviews_merged = OrderedDict()
# Flatten the reviews, so each review is just a single list of words.
n_reviews = None  # take every review; -1 would silently drop the last one
for bus_id in set(review.business_id.values[:n_reviews]):
    # This dense expression first collapses all reviews of a business into a
    # list of sentences, then collapses those sentences into one long list of words.
    reviews_merged[bus_id] = " ".join(chain.from_iterable(
        chain.from_iterable(review.cleaned_tokenized[review.business_id == bus_id])
    ))
docs = list(reviews_merged.values())  # materialize so the list can be sliced later
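As a quick illustration of the double chain.from_iterable, assuming (as the comment above says) that each entry of cleaned_tokenized is a list of sentences, each itself a list of word tokens:
In [ ]:
# Toy input: two reviews, each a list of tokenized sentences.
toy_reviews = [[['great', 'beer'], ['loud', 'music']],
               [['friendly', 'staff']]]
# The inner chain flattens the list of reviews into a stream of sentences;
# the outer chain flattens those sentences into a stream of words.
words = chain.from_iterable(chain.from_iterable(toy_reviews))
print(" ".join(words))  # great beer loud music friendly staff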
In [3]:
from __future__ import print_function
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
n_samples = len(docs)  # use every document; -1 would silently drop the last one
n_features = 5000
n_top_words = 10
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.75, min_df=2, max_features=n_features)
t0 = time()
tf = tf_vectorizer.fit_transform(docs[:n_samples])
print("done in %0.3fs." % (time() - t0))
In [ ]:
perplexity = []
for n_topics in range(1, 40):
    print("N topics: %i" % n_topics)
    print("Fitting LDA model with tf features, n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    # Note: the prior needs float division; under Python 2, 7/n_topics would
    # truncate to 0 for n_topics > 7. n_topics was renamed n_components in
    # scikit-learn >= 0.19.
    lda = LatentDirichletAllocation(doc_topic_prior=7. / n_topics, n_topics=n_topics,
                                    max_iter=5, learning_method='online',
                                    learning_offset=10., random_state=0, n_jobs=6)
    t0 = time()
    lda.fit(tf)
    print("done in %0.3fs." % (time() - t0))
    perplexity.append(lda.perplexity(tf))
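n_top_words is defined above but never used; the usual way to put it to work is to inspect the fitted topics through lda.components_ (here, the last model from the loop) and the vectorizer vocabulary:
In [ ]:
# Print the n_top_words highest-weight terms for each topic of the
# most recently fitted model.
feature_names = tf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    top = topic.argsort()[:-n_top_words - 1:-1]
    print("Topic %d: %s" % (topic_idx, " ".join(feature_names[i] for i in top)))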
In [13]:
# Save the perplexity curve
with open('../output/LDA_perplexity.pickle', 'wb') as f:
    pickle.dump(perplexity, f)
In [10]:
import pickle
import matplotlib.pyplot as plt
perplexity = pickle.load(open('../output/LDA_perplexity.pickle', 'rb'))
# The loop above fits models for 1 through 39 topics, so the x axis must match.
plt.plot(range(1, len(perplexity) + 1), np.log2(perplexity),
         marker='o', markersize=2, label='Business LDA')
plt.xlabel('Number of Topics')
plt.ylabel(r'$\log_2$(Perplexity)')
plt.legend(frameon=False)
plt.savefig('../images/LDA_num_topics_elbow.png')
In [28]:
# This gives a vector for each business in topic space
doc_topic_distr = lda.transform(tf)
In [50]:
print(doc_topic_distr.shape)
doc_topics = {'topics': reviews_merged.keys(), 'doc_topic_dist':doc_topic_distr,}
import cPickle as pickle
with open('../output/LDA_doc_topic_list.pickle', 'wb') as f:
pickle.dump(doc_topics, f, )
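Since each business now has a vector in topic space, the vectors can be compared directly; a minimal sketch using scikit-learn's cosine_similarity to find the businesses whose reviews look most like those of the first one:
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise similarity between businesses in topic space.
sim = cosine_similarity(doc_topic_distr)
business_ids = list(reviews_merged.keys())
# Five nearest neighbours of business 0 (index 0 is itself, so skip it).
for idx in sim[0].argsort()[::-1][1:6]:
    print(business_ids[idx], sim[0, idx])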
In [10]:
# Plot the average weight of each topic across all businesses.
plt.bar(np.arange(doc_topic_distr.shape[1]), np.average(doc_topic_distr, axis=0),
        color='steelblue')
In [11]:
from scipy.stats import beta
In [40]:
# The beta distribution is the two-topic special case of the Dirichlet prior
# LDA places on document-topic mixtures; a shape parameter below 1 pushes
# mass toward sparse mixtures.
for i, a in enumerate(np.logspace(-.2, 0, 4)):
    b = 1
    x = np.linspace(beta.ppf(0.01, a, b), beta.ppf(0.99, a, b), 100)
    plt.plot(x, beta.pdf(x, a, b), color='r', lw=2, alpha=(i + 2) / 6.,
             label='beta(%1.2f, 1)' % a)
plt.legend(frameon=False)
plt.yscale('log')
plt.ylim(.5, 10)
# plt.xscale('log')
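The same intuition carries to the full Dirichlet used by LDA; a sketch (an illustration, not part of the original analysis) drawing document-topic mixtures with the symmetric concentration 7/n_topics used in the fits above:
In [ ]:
# Draw five document-topic mixtures from a symmetric Dirichlet with
# concentration 7/n_topics, as in the LDA fits above; most of the
# mass lands on a handful of topics.
n_topics = 20
theta = np.random.dirichlet([7. / n_topics] * n_topics, size=5)
print(np.round(theta, 2))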