In [18]:
from __future__ import print_function
import pandas as pd
import pickle
import numpy as np
from itertools import chain
from collections import OrderedDict
%load_ext autoreload
import random


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Load the dataset!


In [22]:
# Load the bar review dataset 
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized_SF.pickle')
review.head(5)
df_businesses = pd.read_pickle('../input/yelp_academic_dataset_business_SF.pickle')
city_state_list = list(set([df_businesses.city.iloc[i_city]+', '+df_businesses.state.iloc[i_city] for i_city, city in enumerate(df_businesses.city)]))[1:]
import pickle
pickle.dump(city_state_list, open('../output/city_state_list.pickle','wb'))

In [23]:
review.tail(5)


Out[23]:
business_id date review_id stars text type user_id votes_cool votes_funny votes_useful cleaned_tokenized
84301 02ef18a93c6b829f0c78790ce5709a3887fcd139 NaN 9cee722c-5489-46a7-b0cf-4b2c61e6a527 4.0 So my friends and I came one night, bringing m... NaN 5291adc8-90e2-49ae-ad7b-57794f6c9a2c NaN NaN NaN [[friend, came, one, night, bringing, good, na...
84302 02ef18a93c6b829f0c78790ce5709a3887fcd139 NaN 773badd2-fd39-430a-ab32-22597ce1d76b 5.0 Everyone has a little drag queen inside of the... NaN 5a921de7-ca35-4ca7-9125-4813cb88b1c2 NaN NaN NaN [[everyone, a-little, drag, queen, inside, pla...
84303 02ef18a93c6b829f0c78790ce5709a3887fcd139 NaN 1ca8bee7-dd87-487e-bad1-2310f874bf5e 3.0 the venue itself is amazing.  it's huge! Sever... NaN 10d6c7d4-c641-4db0-9f98-ca384ee69338 NaN NaN NaN [[venue, amazing], [huge], [several, floor, eq...
84304 02ef18a93c6b829f0c78790ce5709a3887fcd139 NaN 83512718-a8a5-4a87-9d07-7badab3e32ae 3.0 Ooooo Ennis loved this place! S/he felt the co... NaN deb75413-f53d-4f35-a403-d7d0048e2c97 NaN NaN NaN [[ooooo, loved, place], [felt, competition, ve...
84305 02ef18a93c6b829f0c78790ce5709a3887fcd139 NaN 83f4d507-9524-47d0-a337-7e67ba8093d5 3.0 Choosing a star rating for this is difficult a... NaN 67290bdd-9cfb-4200-9586-3e04d62b6e02 NaN NaN NaN [[choosing, star, rating, difficult, at-best],...

In [21]:
"deb75413-f53d-4f35-a403-d7d0048e2c97"


Out[21]:
'deb75413-f53d-4f35-a403-d7d0048e2c97'

In [24]:
# Drop 20% of the users from the dataset for testing
user_set = list(set(review.user_id.values[:]))

random.seed(0)
shuffle(user_set) # Randomize 
n_users = float(len(user_set))

user_set_training = user_set[:int(n_users*float(0.8))]
with open('../output/training_users.pickle', 'wb') as f: 
    pickle.dump(user_set_training, f)
    
# Save a test set
test_users = user_set[int(n_users*float(0.8)):]
with open('../output/test_users.pickle', 'wb') as f: 
    pickle.dump(test_users, f)
    
# Make the active review set training only 
review = review[review.user_id.isin(user_set_training)]

In [11]:
"deb75413-f53d-4f35-a403-d7d0048e2c97" in user_set_training


Out[11]:
True

Merging the documents by (i) business, (ii) users


In [25]:
# This is for review level not business level 
# docs = [" ".join(list(chain.from_iterable(l))) for l in review.cleaned_tokenized.iloc[:]]

n_reviews = -1 # all of them... 
# Flatten the reviews, so each review is just a single list of words.
reviews_merged_bus = OrderedDict()
business_set = set(review.business_id.values[:n_reviews])
for i_bus, bus_id in enumerate(business_set):
    if ((i_bus%2)==0):
        print ('\r Fraction Processed',float(i_bus+1)/len(business_set), end="") 
    # This horrible line first collapses each review of a corresponding business into a list
    # of lists, and then collapses the list of sentences to a long list of words
    reviews_merged_bus[bus_id] = " ".join(list(chain.from_iterable( 
                                    chain.from_iterable( review.cleaned_tokenized[review.business_id==bus_id] ))))    
docs_bus = reviews_merged_bus.values()

with open('../output/docs_bars_bus.pickle', 'wb') as f: 
    pickle.dump(docs_bus, f)

with open('../output/bus_ids_bars_LDA.pickle', 'wb') as f: 
    pickle.dump(reviews_merged_bus.keys(), f)


 Fraction Processed 0.999822064057

In [71]:

Note that this section merges all reviews by the same person.


In [4]:
# Flatten the reviews, so each review is just a single list of words.
# reviews_merged_user = OrderedDict()

# user_set = list(set(review.user_id.values[:n_reviews]))
# n_users = float(len(user_set))
# for i_user, user_id in enumerate(user_set[:]):
#     if ((i_user%50)==0):
#         print ('\r Fraction Processed',float(i_user+1)/n_users, end="") 
#     # This horrible line first collapses each review of a corresponding user reviews into a list
#     # of lists, and then collapses the list of sentences to a long list of words
#     reviews_merged_user[user_id] = " ".join(list(chain.from_iterable( 
#                                     chain.from_iterable( review.cleaned_tokenized[review.user_id==user_id] ))))    
# docs_users = reviews_merged_user.values()
# print()
# print("Merging Done...")

# with open('../output/docs_bars_users.pickle', 'wb') as f: 
#     pickle.dump(docs_users, f)


 Fraction Processed 0.999817553761
Merging Done...

In [26]:
# Flatten the reviews, so each review is just a single list of words.
docs_reviews = [" ".join(list(chain.from_iterable(rev))) for rev in review.cleaned_tokenized.values[:n_reviews]]

with open('../output/docs_reviews.pickle', 'wb') as f: 
    pickle.dump(docs_reviews, f)

In [ ]:

LDA Across Bars and Businesses


In [ ]:


In [27]:
%autoreload 2 
import sys
sys.path.append('../vectorsearch/')
import LDA
reload(LDA)
n_topics=30
n_features=10000
max_df=.75
min_df=3
max_iter=10
alpha=6./n_topics

In [ ]:
# Train the bar set over businesses
#doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))

lda_bus = LDA.LDA(alpha=alpha, n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
lda_bus.vectorizecounts(docs_bus)
lda_bus.fitLDA()
LDA.SaveLDAModel('../output/LDA_model_bus.pickle', lda_bus)
# Now can


Extracting tf features for LDA...
done in 14.924s.
Fitting LDA models with tf features, n_samples=5620 and n_features=10000...

In [7]:
# Train the bar set over users

# doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))
# lda_user = LDA.LDA(n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
# lda_user.vectorizecounts(docs_users)
# lda_user.fitLDA()
# LDA.SaveLDAModel('../output/LDA_model_user.pickle', lda_user)


Extracting tf features for LDA...
done in 10.863s.
Fitting LDA models with tf features, n_samples=82216 and n_features=10000...
done in 653.609s.

In [12]:
# Train the bar set over users
lda_reviews = pickle.load(open('../output/docs_reviews.pickle', 'rb'))
lda_reviews = LDA.LDA(alpha=alpha, n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
lda_reviews.vectorizecounts(docs_reviews)
lda_reviews.fitLDA()
LDA.SaveLDAModel('../output/LDA_model_reviews.pickle', lda_reviews)


Extracting tf features for LDA...
done in 11.443s.
Fitting LDA models with tf features, n_samples=186751 and n_features=10000...
done in 496.987s.

In [51]:
# doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))
# lda_user = LDA.LDA(n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
# lda_user.vectorizecounts(docs_users)
# lda_user.fitLDA()
# LDA.SaveLDAModel('../output/LDA_model_user.pickle', lda_user)


#lda_bus.print_top_words(10)

#.get_doc_topics(doc_users[10:12])

Generate the training and test sets


In [ ]:
import sys
sys.path.append('../vectorsearch/')
import LDA

bus_lda = LDA.LoadLDAModel('../output/LDA_model_bus.pickle')

In [ ]:


In [ ]:
# The topic vector for a given business is given by this dataframe. 
bus_lda_ids = pickle.load(open('../output/bus_ids_bars_LDA.pickle', 'rb'))
bus_vectors = pd.DataFrame()
bus_vectors['business_id'] = bus_lda_ids
transformed = bus_lda.lda.transform(bus_lda.tf)

In [ ]:
print(transformed.shape)
print(len(bus_vectors))

bus_vectors['topic_vector'] = [bus_topic_vec for bus_topic_vec in transformed]
normed_topic_vecs = map(lambda topic_vec: topic_vec/sqrt(np.dot(topic_vec, topic_vec)),
                        bus_vectors.topic_vector) 


bus_vectors.topic_vector = normed_topic_vecs

bus_vectors.to_pickle('../output/business_LDA_vectors.pickle')

In [ ]:
# Visualizationlda_reviews.get_doc_topics(doc_reviews[10:20])

In [1]:
# import pyLDAvis
# import pandas as pd
# import funcy as fp
# from pyLDAvis import prepare as vis_prepare

# def _extract_data(docs, vect, lda):
#     #LDA scikit-learn implementation seems to have buggy code.
#     #Topic_term_dists and doc_topic_dists isn't accummulated to 1.
#     #Hence norm function implemented to normalize the distributions.
#     norm = lambda data: pd.DataFrame(data).div(data.sum(1),axis=0).values
#     vected = vect.fit_transform(docs)
#     doc_topic_dists = norm(lda.fit_transform(vected))
    
#     return lda,vect, dict(
#                       doc_lengths = docs.str.len(),
#                       vocab = vect.get_feature_names(),
#                       term_frequency = vected.sum(axis=0).tolist()[0],
#                       topic_term_dists = norm(lda.components_),
#                       doc_topic_dists = doc_topic_dists)

# def prepare(docs, vect, lda, **kwargs):
#     """Create Prepared Data from sklearn's vectorizer and Latent Dirichlet
#     Application.

#     Parameters
#     ----------
#     docs : Pandas Series.
#         Documents to be passed as an input.
#     vect : Scikit-Learn Vectorizer (CountVectorizer,TfIdfVectorizer).
#         vectorizer to convert documents into matrix sparser
#     lda  : sklearn.decomposition.LatentDirichletAllocation.
#         Latent Dirichlet Allocation

#     **kwargs: Keyword argument to be passed to pyLDAvis.prepare()


#     Returns
#     -------
#     prepared_data : PreparedData
#           the data structures used in the visualization


#     Example
#     --------
#     For example usage please see this notebook:
#     http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

#     See
#     ------
#     See `pyLDAvis.prepare` for **kwargs.
#     """
    
#     opts = fp.merge(_extract_data(docs, vect, lda)[2], kwargs)

#     return vis_prepare(**opts)

# vis_data = prepare(docs, tf_vectorizer, lda)

# #

In [ ]:


In [13]:
import sys
sys.path.append('../vectorsearch/')
import LDA

bus_lda = LDA.LoadLDAModel('../output/LDA_model_bus.pickle')

In [15]:
bus_lda.lda.n_jobs = 1

In [ ]: