In [1]:
import pandas as pd
import pickle
import numpy as np
# Load the bar review dataset
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized.pickle')
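A quick sanity check (not part of the original run) that the frame has the two columns used below, business_id and cleaned_tokenized:
In [ ]:
# Peek at the loaded frame; the merging step below assumes the
# business_id and cleaned_tokenized columns exist.
print(review.shape)
print(review.columns.tolist())
review.head()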
In [2]:
from itertools import chain
from collections import OrderedDict
reviews_merged = OrderedDict()
# Flatten the reviews, so each review is just a single list of words.
n_reviews = None  # take every review; -1 would silently drop the last one
for bus_id in set(review.business_id.values[:n_reviews]):
    # This dense expression first collapses all reviews of a business into a
    # list of sentences, then collapses those sentences into one long list of words.
    reviews_merged[bus_id] = " ".join(chain.from_iterable(
        chain.from_iterable(review.cleaned_tokenized[review.business_id == bus_id])
    ))
docs = list(reviews_merged.values())  # materialize so the list can be sliced later
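As a quick illustration of the double chain.from_iterable, assuming (as the comment above says) that each entry of cleaned_tokenized is a list of sentences, each itself a list of word tokens:
In [ ]:
# Toy input: two reviews, each a list of tokenized sentences.
toy_reviews = [[['great', 'beer'], ['loud', 'music']],
               [['friendly', 'staff']]]
# The inner chain flattens the list of reviews into a stream of sentences;
# the outer chain flattens those sentences into a stream of words.
words = chain.from_iterable(chain.from_iterable(toy_reviews))
print(" ".join(words))  # great beer loud music friendly staff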
In [3]:
from __future__ import print_function
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
n_samples = len(docs)  # use every document; -1 would silently drop the last one
n_features = 5000
n_top_words = 10
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.75, min_df=2, max_features=n_features)
t0 = time()
tf = tf_vectorizer.fit_transform(docs[:n_samples])
print("done in %0.3fs." % (time() - t0))
In [ ]:
perplexity = []
for n_topics in range(1, 40):
    print("N topics: %i" % n_topics)
    print("Fitting LDA model with tf features, n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    # Note: the prior needs float division; under Python 2, 7/n_topics would
    # truncate to 0 for n_topics > 7. n_topics was renamed n_components in
    # scikit-learn >= 0.19.
    lda = LatentDirichletAllocation(doc_topic_prior=7. / n_topics, n_topics=n_topics,
                                    max_iter=5, learning_method='online',
                                    learning_offset=10., random_state=0, n_jobs=6)
    t0 = time()
    lda.fit(tf)
    print("done in %0.3fs." % (time() - t0))
    perplexity.append(lda.perplexity(tf))
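n_top_words is defined above but never used; the usual way to put it to work is to inspect the fitted topics through lda.components_ (here, the last model from the loop) and the vectorizer vocabulary:
In [ ]:
# Print the n_top_words highest-weight terms for each topic of the
# most recently fitted model.
feature_names = tf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    top = topic.argsort()[:-n_top_words - 1:-1]
    print("Topic %d: %s" % (topic_idx, " ".join(feature_names[i] for i in top)))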
In [13]:
# Save the perplexity curve
with open('../output/LDA_perplexity.pickle', 'wb') as f:
    pickle.dump(perplexity, f)
In [10]:
import pickle
import matplotlib.pyplot as plt
perplexity = pickle.load(open('../output/LDA_perplexity.pickle', 'rb'))
# The loop above fits models for 1 through 39 topics, so the x axis must match.
plt.plot(range(1, len(perplexity) + 1), np.log2(perplexity),
         marker='o', markersize=2, label='Business LDA')
plt.xlabel('Number of Topics')
plt.ylabel(r'$\log_2$(Perplexity)')
plt.legend(frameon=False)
plt.savefig('../images/LDA_num_topics_elbow.png')
In [28]:
# This gives a vector for each business in topic space
doc_topic_distr = lda.transform(tf)
In [50]:
print(doc_topic_distr.shape)
doc_topics = {'topics': reviews_merged.keys(), 'doc_topic_dist':doc_topic_distr,}
import cPickle as pickle
with open('../output/LDA_doc_topic_list.pickle', 'wb') as f:
pickle.dump(doc_topics, f, )
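Since each business now has a vector in topic space, the vectors can be compared directly; a minimal sketch using scikit-learn's cosine_similarity to find the businesses whose reviews look most like those of the first one:
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise similarity between businesses in topic space.
sim = cosine_similarity(doc_topic_distr)
business_ids = list(reviews_merged.keys())
# Five nearest neighbours of business 0 (index 0 is itself, so skip it).
for idx in sim[0].argsort()[::-1][1:6]:
    print(business_ids[idx], sim[0, idx])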
In [10]:
# Plot the average weight of each topic across all businesses.
plt.bar(np.arange(doc_topic_distr.shape[1]), np.average(doc_topic_distr, axis=0),
        color='steelblue')
In [11]:
from scipy.stats import beta
In [40]:
# The beta distribution is the two-topic special case of the Dirichlet prior
# LDA places on document-topic mixtures; a shape parameter below 1 pushes
# mass toward sparse mixtures.
for i, a in enumerate(np.logspace(-.2, 0, 4)):
    b = 1
    x = np.linspace(beta.ppf(0.01, a, b), beta.ppf(0.99, a, b), 100)
    plt.plot(x, beta.pdf(x, a, b), color='r', lw=2, alpha=(i + 2) / 6.,
             label='beta(%1.2f, 1)' % a)
plt.legend(frameon=False)
plt.yscale('log')
plt.ylim(.5, 10)
# plt.xscale('log')
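The same intuition carries to the full Dirichlet used by LDA; a sketch (an illustration, not part of the original analysis) drawing document-topic mixtures with the symmetric concentration 7/n_topics used in the fits above:
In [ ]:
# Draw five document-topic mixtures from a symmetric Dirichlet with
# concentration 7/n_topics, as in the LDA fits above; most of the
# mass lands on a handful of topics.
n_topics = 20
theta = np.random.dirichlet([7. / n_topics] * n_topics, size=5)
print(np.round(theta, 2))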