Import Modules


In [9]:
import json
import pandas as pd
import re
import random
import matplotlib.pyplot as plt
%matplotlib inline  
from ast import literal_eval as make_tuple
from scipy import sparse
import numpy as np
from pymongo import MongoClient
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
sys.path.append('../machine_learning')
import yelp_ml as yml
reload(yml)
from gensim import corpora, models, similarities, matutils
import tqdm

Import Scrapped Reviews


In [10]:
dc_reviews = json.load(open("../Yelp_web_scrapper/dc_reviews.json"))
newyork_reviews = json.load(open("../Yelp_web_scrapper/newyork_reviews.json"))
austin_reviews = json.load(open("../Yelp_web_scrapper/austin_reviews.json"))
chicago_reviews = json.load(open("../Yelp_web_scrapper/chicago_reviews.json"))
la_reviews = json.load(open("../Yelp_web_scrapper/la_reviews.json"))

scrapped_reviews = {'dc': dc_reviews, 'ny': newyork_reviews, 
                    'austin': austin_reviews, 'chicago': chicago_reviews, 
                    'la': la_reviews}

Import Hu & Liu (2004) Word Dictionary and Wrangled Large Users


In [11]:
lh_neg = open('../input/negative-words.txt', 'r').read()
lh_neg = lh_neg.split('\n')
lh_pos = open('../input/positive-words.txt', 'r').read()
lh_pos = lh_pos.split('\n')
users = json.load(open("cleaned_large_user_dictionary.json"))
word_list = list(set(lh_pos + lh_neg))

Connect to the AWS Instance and get the restaurant reviews from the cleaned data database


In [12]:
ip = '54.175.170.119'
conn = MongoClient(ip, 27017)
conn.database_names()
db = conn.get_database('cleaned_data')
reviews = db.get_collection('restaurant_reviews')

Testing

  1. Supply User ID
  2. Get all restaurant IDs that the user has reviewed
  3. Do a random 75 (training)/25(testing) split of the restaurant IDs
  4. For the training sample, get all of the user's reviews for those restaurants
  5. For each restaurant in the testing sample, get all of that restaurant's reviews
  6. Train each of the (feature, model) combinations on the reviews in the training sample
  7. For each review in the testing sample, classify that review using the model
    1. If the total proportion of positive reviews is greater than 70% for each restaurant, classify that restaurant's rating as positive.
    2. Else, classify that restaurant's rating as negative
  8. For each restaurant's predicted rating, check against what the user actually thought 9. Use these to determine log-loss, accuracy, and precision

The features and models we use are:

Features:

  1. (Sentiment %, TF-IDF w/ (2,2) N-Gram, LSA)
  2. (Sentiment %, LSA)
  3. (TF-IDF w/ (2,2) N-Gram, LDA, LSA)
  4. (Sentiment %, TF-IDF w/ (2,2) N-Gram, LDA, LSA)

Models:

  1. Linear Support Vector Machine
  2. Random Forest
  3. Naive Bayes

In [48]:
string_keys_dict = {}
for j in tqdm.tqdm(range(55, 56)):
    
    #Generate a dataframe that has the user's review text, review rating, and restaurant ID
    test_results = {}
    user_df = yml.make_user_df(users[users.keys()[j]])
    
    #Only predict for the user if they have at least 20 bad ratings
    if len([x for x in user_df['rating'] if x < 4]) < 20:
        string_keys_dict[str(users.keys()[j])] = test_results
        continue
    else:
        business_ids = list(set(user_df['biz_id']))
        restreview = {}
        
        #Create a training and test sample from the user reviewed restaurants
        #using a random 25% subset of all the restaurants the user has reviewed
        split_samp = .25
        len_random = int(len(business_ids) * split_samp)
        test_set = random.sample(business_ids, len_random)
        training_set = [x for x in business_ids if x not in test_set]
        sub_train_reviews, train_labels, train_reviews, train_ratings = [], [], [], []

        #Create a list with the tuple (training review, training rating) 
        for rest_id in training_set:
            train_reviews.append((user_df[user_df['biz_id'] == rest_id]['review_text'].iloc[0],
                                     user_df[user_df['biz_id'] == rest_id]['rating'].iloc[0]))

        #Note that the distribution is heavily skewed towards good reviews. 
        #Therefore, we create a training sample with the same amount of
        #positive and negative reviews
        sample_size = min(len([x[1] for x in train_reviews if x[1] < 4]),
                              len([x[1] for x in train_reviews if x[1] >= 4]))

        bad_reviews = [x for x in train_reviews if x[1] < 4]
        good_reviews = [x for x in train_reviews if x[1] >= 4]

        for L in range(0, int(float(sample_size)/float(2))):
            sub_train_reviews.append(bad_reviews[L][0])
            sub_train_reviews.append(good_reviews[L][0])
            train_labels.append(bad_reviews[L][1])
            train_labels.append(good_reviews[L][1])

        #Make the train labels binary
        train_labels = [1 if x >=4 else 0 for x in train_labels]
        
        #Sanity check for non-empty training reviews
        if not sub_train_reviews:
            string_keys_dict[str(users.keys()[j])] = test_results
            continue
        else:
            for i in range(0, len(business_ids)):
                rlist = []
                for obj in reviews.find({'business_id':business_ids[i]}):
                    rlist.append(obj)
                restreview[business_ids[i]] = rlist

            restaurant_df = yml.make_biz_df(users.keys()[j], restreview)

            #Make a FeatureUnion object with the desired features then fit to train reviews
            feature_selection = {"sent_tf":(True, True, False), 
                                 "sent": (True,False,False),
                                 "tf_lda": (False,True,True), 
                                 "all": (True, True, True)}

            for feature in feature_selection.keys():
                #Make a FeatureUnion object with the desired features then fit to train reviews
                comb_features = yml.make_featureunion(sent_percent=feature_selection[feature][0], 
                                                      tf = feature_selection[feature][1], 
                                                      lda = feature_selection[feature][2])

                delta_vect = None
                comb_features.fit(sub_train_reviews)
                train_features = comb_features.transform(sub_train_reviews)

                #Fit LSI model and return number of LSI topics
                lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)
                train_lsi = yml.get_lsi_features(sub_train_reviews, lsi, topics, dictionary)

                #Stack the LSI and combined features together
                train_features = sparse.hstack((train_features, train_lsi))
                train_features = train_features.todense()

                #fit each model in turn 
                model_runs = {"svm": (True, False, False),
                              "rf": (False, True, False), 
                              "naive_bayes": (False, False, True)}

                for model_run in model_runs.keys():
                    clf = yml.fit_model(train_features, train_labels, svm_clf = model_runs[model_run][0], 
                                    RandomForest = model_runs[model_run][1], 
                                        nb = model_runs[model_run][2])
                    threshold = 0.7
                    error = yml.test_user_set(test_set, clf, restaurant_df, user_df, comb_features, 
                                              threshold, lsi, topics, dictionary, delta_vect)
                    test_results[str((feature, model_run))] = (yml.get_log_loss(error), 
                                                    yml.get_accuracy_score(error), 
                                                    yml.get_precision_score(error))
                
    string_keys_dict[str(users.keys()[j])] = test_results
            
with open('test_results.json', 'wb') as fp:
    json.dump(string_keys_dict, fp)


  0%|          | 0/1 [00:00<?, ?it/s]/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/sklearn/decomposition/online_lda.py:508: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/sklearn/decomposition/online_lda.py:508: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/sklearn/decomposition/online_lda.py:508: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
100%|██████████| 1/1 [09:06<00:00, 546.76s/it]

The best performing (feature, model) combination for this user was a combination of all of the features in a linear support vector machine.


In [49]:
test_results_df = pd.DataFrame.from_dict(test_results, orient = 'index')
test_results_df['model'] = test_results_df.index
test_results_df.columns = ['log_loss', 'accuracy', 'precision', 'model']
test_results_df.plot(x = 'model', y = ['log_loss', 'accuracy', 'precision'], kind = 'bar')
ax = plt.subplot(111)
ax.legend(bbox_to_anchor=(1.4, 1))
plt.show()


Make a Recommendation


In [50]:
top_results = []
#Get feature and model combination that yields the highest precision
for key in test_results.keys():
    feat_model = make_tuple(key)
    if not top_results:
        top_results = [(feat_model,test_results[key][2])]
    else:
        if test_results[key][2] > top_results[0][1]:
            top_results.pop()
            top_results = [(feat_model, test_results[key][2])]
feat_result = top_results[0][0][0]
model_result = top_results[0][0][1]

for j in tqdm.tqdm(range(55, 56)):
    user_df = yml.make_user_df(users[users.keys()[j]])
    business_ids = list(set(user_df['biz_id']))

    #Create a list of training reviews and training ratings
    for rest_id in business_ids:
        train_reviews.append((user_df[user_df['biz_id'] == rest_id]['review_text'].iloc[0],
                                 user_df[user_df['biz_id'] == rest_id]['rating'].iloc[0]))

    #Create an even sample s.t. len(positive_reviews) = len(negative_reviews)
    sample_size = min(len([x[1] for x in train_reviews if x[1] < 4]),
                          len([x[1] for x in train_reviews if x[1] >= 4]))
    
    bad_reviews = [x for x in train_reviews if x[1] < 4]
    good_reviews = [x for x in train_reviews if x[1] >= 4]
    
    train_labels = []
    sub_train_reviews = []
    for L in range(0, int(float(sample_size)/float(2))):
        sub_train_reviews.append(bad_reviews[L][0])
        sub_train_reviews.append(good_reviews[L][0])
        train_labels.append(bad_reviews[L][1])
        train_labels.append(good_reviews[L][1])
        
    #Make the train labels binary
    train_labels = [1 if x >=4 else 0 for x in train_labels]
    
    #Fit LSI model and return number of LSI topics
    lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)

    #Make a FeatureUnion object with the desired features then fit to train reviews
    feature_selection = {"sent_tf":(True, True, False), 
                         "sent": (True,False,False),
                         "tf_lda": (False,True,True), 
                         "all": (True, True, True)}
    top_feature = feature_selection['all']
    
    comb_features = yml.make_featureunion(sent_percent=top_feature[0], 
                                          tf = top_feature[1], 
                                          lda = top_feature[2])
        
    comb_features.fit(sub_train_reviews)
    train_features = comb_features.transform(sub_train_reviews)
    train_lsi = yml.get_lsi_features(sub_train_reviews, lsi, topics, dictionary)
    train_features = sparse.hstack((train_features, train_lsi))
    train_features = train_features.todense()

    #Fit LSI model and return number of LSI topics
    lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)
        
    #Get the top performing model and fit using that model
    model_runs = {"svm": (True, False, False),
                  "rf": (False, True, False), 
                  "naive_bayes": (False, False, True)}
    
    top_model = model_runs['svm']
    clf = yml.fit_model(train_features, train_labels, svm_clf = top_model[0], 
                RandomForest = top_model[1], 
                    nb = top_model[2])

    threshold = 0.7
    user_results = {}
    for key in scrapped_reviews.keys():
        user_results[key] = yml.make_rec(scrapped_reviews[key], clf, threshold, comb_features, 
                                lsi, topics, dictionary)


  0%|          | 0/1 [00:00<?, ?it/s]/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/sklearn/decomposition/online_lda.py:508: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
100%|██████████| 1/1 [04:05<00:00, 245.68s/it]

In [51]:
################################################################
#Collect the results into a list of tuples, then select the top
#5 most confident good recs and top 5 most confident bad recs 
#for each location
################################################################
tuple_results = {}
for key in user_results.keys():
    tuple_results[key] = []
    for result in user_results[key]:
        tuple_results[key].append((result[1], result[2], result[3]))
    tuple_results[key] = sorted(tuple_results[key], key=lambda tup: tup[1])

Let's look at the top 5 restaurants for each location


In [52]:
for key in tuple_results.keys():
    print "The top 5 recommendations for " + key + " are: "
    print tuple_results[key][-5:]


The top 5 recommendations for ny are: 
[(u'/biz/beijing-dumpling-house-jamaica', 0.94999999999999996, 1), (u'/biz/jongro-bbq-new-york-3', 0.94999999999999996, 1), (u'/biz/grindhaus-new-york-2', 1.0, 1), (u'/biz/nana-ramen-brooklyn', 1.0, 1), (u'/biz/santa-panza-brooklyn', 1.0, 1)]
The top 5 recommendations for la are: 
[(u'/biz/sharkys-woodfired-mexican-grill-marina-del-rey-3', 1.0, 1), (u'/biz/sharksteeth-los-angeles', 1.0, 1), (u'/biz/la-barca-restaurant-los-angeles', 1.0, 1), (u'/biz/krimseys-cajun-kitchen-los-angeles', 1.0, 1), (u'/biz/spoke-bicycle-cafe-los-angeles-3', 1.0, 1)]
The top 5 recommendations for austin are: 
[(u'/biz/saigon-le-vendeur-austin', 0.94999999999999996, 1), (u'/biz/cafe-josie-austin-3', 0.94999999999999996, 1), (u'/biz/rudys-country-store-and-bar-b-q-austin-5', 1.0, 1), (u'/biz/blaze-fast-fired-pizza-austin', 1.0, 1), (u'/biz/mission-dogs-austin', 1.0, 1)]
The top 5 recommendations for dc are: 
[(u'/biz/flippin-pizza-washington-4', 0.94999999999999996, 1), (u'/biz/ben-tre-washington', 0.94999999999999996, 1), (u'/biz/timber-pizza-company-washington', 0.94999999999999996, 1), (u'/biz/flavio-restaurant-washington', 1.0, 1), (u'/biz/daikaya-ramen-shop-washington', 1.0, 1)]
The top 5 recommendations for chicago are: 
[(u'/biz/pizzaiolo-chicago', 1.0, 1), (u'/biz/buddy-guys-legends-chicago', 1.0, 1), (u'/biz/roti-modern-mediterranean-chicago-9', 1.0, 1), (u'/biz/coalfire-pizza-chicago-2', 1.0, 1), (u'/biz/kim-and-carlos-hot-dog-stand-chicago', 1.0, 1)]

Let's look at the bottom 5 restaurants for each location


In [53]:
for key in tuple_results.keys():
    print "The bottom 5 recommendations for " + key + " are: "
    print tuple_results[key][0:5]


The bottom 5 recommendations for ny are: 
[(u'/biz/salt-and-bone-smokehouse-astoria', 0.10000000000000001, 0), (u'/biz/grandmas-house-brooklyn', 0.20000000000000001, 0), (u'/biz/pies-n-thighs-brooklyn', 0.20000000000000001, 0), (u'/biz/amy-ruths-new-york', 0.25, 0), (u'/biz/roll-n-roaster-brooklyn', 0.25, 0)]
The bottom 5 recommendations for la are: 
[(u'/biz/poppy-rose-los-angeles', 0.14999999999999999, 0), (u'/biz/leos-tacos-truck-los-angeles', 0.20000000000000001, 0), (u'/biz/eggslut-los-angeles-5', 0.20000000000000001, 0), (u'/biz/joey-woodland-hills-los-angeles', 0.25, 0), (u'/biz/jacks-n-joe-los-angeles-4', 0.25, 0)]
The bottom 5 recommendations for austin are: 
[(u'/biz/j-t-youngbloods-austin-2', 0.14999999999999999, 0), (u'/biz/art-of-tacos-austin', 0.20000000000000001, 0), (u'/biz/rositas-al-pastor-austin', 0.29999999999999999, 0), (u'/biz/biscuits-groovy-south-austin', 0.29999999999999999, 0), (u'/biz/toaster-austin', 0.29999999999999999, 0)]
The bottom 5 recommendations for dc are: 
[(u'/biz/gcdc-grilled-cheese-bar-washington', 0.20000000000000001, 0), (u'/biz/central-michel-richard-washington', 0.25, 0), (u'/biz/gw-delicatessen-washington', 0.29999999999999999, 0), (u'/biz/blt-steak-washington-2', 0.29999999999999999, 0), (u'/biz/cornerstone-cafe-washington', 0.29999999999999999, 0)]
The bottom 5 recommendations for chicago are: 
[(u'/biz/primo-chukis-chicago', 0.25, 0), (u'/biz/wildberry-pancakes-and-cafe-chicago-2', 0.29999999999999999, 0), (u'/biz/palette-chicago-chicago', 0.34999999999999998, 0), (u'/biz/t-and-b-grill-chicago', 0.34999999999999998, 0), (u'/biz/antique-taco-chicago', 0.34999999999999998, 0)]

Let's take a step back and look at the user's word choice and tone. First let's look at the user_df dataframe, which contains all of the user's reviews and ratings.


In [54]:
user_df = yml.make_user_df(users[users.keys()[j]])
user_df.head()


Out[54]:
biz_id rating review_text
0 -aaVZUYi4Ys_DP0rcBsFTg 4 Been here a couple of times and enjoyed it ver...
1 1NbNo3v8XO9Sze_nqG2fyg 3 Family and I thought this place wasnt too bad ...
2 6CpAWkTcqqNtGAfOsPeFUw 3 I like Arbys because they are different Not y...
3 A6HfWbmTpJRlwl_-SxwFUg 4 Only been here once Had a small order so ther...
4 0qoUYvAGnkcc3vzY-P-0_A 2 I am actually a Filibertos fan however this lo...

Let's look at the most important TF-IDF features for our linear SVM model


In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
    
tfv = TfidfVectorizer(ngram_range = (2,2), stop_words = 'english')
tfv.fit(sub_train_reviews)

X_train = tfv.transform(sub_train_reviews)
ex_clf = svm.LinearSVC()
ex_clf.fit(X_train, train_labels)
yml.plot_coefficients(ex_clf, tfv.get_feature_names())


Let's take a look at the topics generated by LDA. Specifically, we focus only on the good reviews and plot the topics that appeared most often in the good reviews.


In [56]:
#Let's take a look at sample weightings for the user's GOOD reviews
good_reviews = [a for (a,b) in zip(sub_train_reviews, train_labels) if b == 1]
vectorizer = TfidfVectorizer(ngram_range = (2,2), stop_words = 'english')
tf = vectorizer.fit(sub_train_reviews)
lda_fit = LatentDirichletAllocation(n_topics=50).fit(tf.transform(sub_train_reviews))

tf_good = tf.transform(good_reviews)
lda_good = lda_fit.transform(tf_good)

#Take the average of each topic weighting amongst good reviews and graph each topic
topic_strings = ["Topic " + str(x) for x in range(0,50)]

topic_dict = {}
for review in lda_good:
    for x in range(0,50):
        try:
            topic_dict[topic_strings[x]].append(review[x])
        except:
            topic_dict[topic_strings[x]] = [review[x]]
            
average_top_weight = {}
for x in range(0,50):
    average_top_weight[topic_strings[x]] = reduce(lambda x, y: x + y, topic_dict[topic_strings[x]]) 
    / len(topic_dict[topic_strings[x]])


/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/sklearn/decomposition/online_lda.py:508: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)

In [57]:
##############    
#Plot the average weights for each topic in the good reviews
##############     
#Find the average topic weights for each topic
average_topics = pd.DataFrame.from_dict(average_top_weight, orient = 'index')
average_topics.columns = ['topic_weight']
average_topics['topic'] = average_topics.index
average_topics['topic'] = [int(x[5:8]) for x in average_topics['topic']]
average_topics = average_topics.sort_values(['topic'])
x_max = average_topics.sort_values('topic_weight')['topic_weight'][-2] + 1

#Make the plot
good_plt = average_topics.plot(x='topic', y='topic_weight', kind='scatter', legend=False)
yml.label_point(average_topics.topic, average_topics.topic_weight, good_plt)
good_plt.set_ylim(0, x_max)
good_plt.set_xlim(0, 50)


Out[57]:
(0, 50)

Let's look at the top 5 words in each of these topics


In [58]:
#View the top words in the outlier topics from the LDA representation
yml.label_point(average_topics.topic, average_topics.topic_weight, good_plt)

a = pd.concat({'x': average_topics.topic, 
                   'y': average_topics.topic_weight}, axis=1)
top_topics = [a[a['y'] == max(a['y'])]['x'][0]]
a = a[a['y'] != max(a['y'])]
for i, point in a.iterrows():
    if (point['y'] > (a['y'].mean() + 1.5 * a['y'].std()) ):
        top_topics.append(int(point['x']))
            

#Display top words in each topic                     
no_top_words = 10
tf_feature_names = vectorizer.get_feature_names()

yml.display_topics(lda_fit, tf_feature_names, no_top_words, top_topics)


Topic 3:
peak hours wife daughter burgers good good fries family really place luckily luckily came environment burgers rings great times parking
Topic 5:
uniform policies original set times took run place tv series know original went inside shame run yr old inside shame
Topic 9:
place clean order problems clean kept problems place kept crew crew nice small order room like like little wanted watch
Topic 18:
grilled cheese change service service better better lot patio going going change make place inside hopefully hopefully going location outside
Topic 41:
bbq sauce 99 cents prime rib fries wchili great come visits blt impressed liked music great twice service meatball sandwich

Next, let's take a look at what kind of restaurants the user likes


In [59]:
good_restaurants = user_df[user_df['rating'] >= 4]['biz_id']
bis_data = db.get_collection('restaurants')
#Pull each restaurant attribute from the MongoDB
restreview_good = {}
for i in tqdm.tqdm(range(0, len(good_restaurants))):
    rlist = []
    for obj in bis_data.find({'business_id':business_ids[i]}):
        rlist.append(obj)
    restreview_good[business_ids[i]] = rlist


100%|██████████| 145/145 [00:02<00:00, 54.93it/s]

In [60]:
#Get all the categories for the good restaurants 
good_list = []
for key in restreview_good.keys():
    good_list.extend(restreview_good[key][0]['categories'])
good_list = [word for word in good_list if (word != u'Restaurants')]
good_list = [word for word in good_list if (word != u'Food')]
unique_categories = list(set(good_list))
category_count = [good_list.count(cat) for cat in unique_categories]
category_list = [(a,b) for (a,b) in zip(unique_categories, category_count) if b >= 10]
unique_categories = [a for (a,b) in category_list] 
category_count = [b for (a,b) in category_list]
biz_category = pd.DataFrame({'category': unique_categories, 'count': category_count})

#Plot only categories that show up at least 10 times
good_plt = biz_category.plot(x='category', y='count', kind='bar', legend=False)
good_plt.set_ylim(0,40)


Out[60]:
(0, 40)

The restaurants that we've recommended include many traditional and new American restaurants, as well as Burger places. This suggests that our recommendation system does a good job picking up on the latent preferences of the user.

With LSA, we arbitrarily chose the number of topics to be the mean of the singular values. This is a meaningless choice and completely arbitrary.

The words in the below representation represent word vectors in the initial term-document matrix, the coefficients on each word vector is a result of the singular value decomposition.


In [61]:
print "The number of LSA topics is: " + str(lsi.num_topics)
for x in lsi.show_topics():
    print str(x) + "\n"


The number of LSA topics is: 8
(0, u'0.122*"daughter" + 0.120*"pizza" + 0.119*"chicken" + 0.116*"sandwich" + 0.113*"place" + 0.111*"wife" + 0.110*"burger" + 0.105*"fries" + 0.103*"like" + 0.102*"great"')

(1, u'-0.375*"pizza" + 0.169*"times" + 0.168*"franchise" + 0.148*"burnt" + 0.148*"ends" + -0.139*"wings" + -0.128*"crust" + 0.126*"always" + -0.124*"pepperoni" + -0.114*"daughter"')

(2, u'-0.320*"pizza" + 0.223*"breakfast" + 0.180*"wei" + 0.159*"orange" + 0.148*"bacon" + -0.144*"donuts" + 0.130*"daughter" + 0.127*"sandwich" + -0.120*"crust" + 0.118*"golden"')

(3, u'0.383*"ribs" + 0.250*"salad" + -0.231*"pizza" + 0.133*"potato" + 0.132*"meat" + -0.129*"franchise" + 0.126*"awe" + 0.122*"half" + 0.116*"tank" + 0.115*"dillons"')

(4, u'-0.213*"breakfast" + -0.206*"donuts" + 0.189*"burger" + 0.178*"decent" + 0.172*"chili" + 0.150*"fries" + 0.136*"cheese" + 0.130*"environment" + 0.115*"double" + -0.113*"eggs"')

(5, u'0.258*"donuts" + -0.186*"burnt" + -0.186*"ends" + -0.168*"breakfast" + -0.133*"came" + 0.123*"sandwich" + -0.121*"eggs" + -0.121*"pizza" + 0.119*"wei" + 0.116*"donut"')

(6, u'0.296*"wei" + 0.195*"golden" + 0.157*"donuts" + -0.148*"pies" + 0.144*"noodles" + -0.137*"pie" + -0.132*"slice" + 0.116*"orange" + 0.114*"peak" + 0.112*"beef"')

(7, u'-0.217*"wei" + 0.210*"donuts" + -0.159*"salad" + -0.148*"golden" + -0.142*"ribs" + 0.133*"chili" + -0.120*"noodles" + 0.116*"steak" + 0.113*"outside" + 0.112*"parking"')


In [63]:
#Let's take a look at sample weightings for the user's GOOD reviews
good_reviews = [a for (a,b) in zip(sub_train_reviews, train_labels) if b == 1]
lsi, topics, dictionary = yml.fit_lsi(good_reviews)
train_lsi = yml.get_lsi_features(good_reviews, lsi, topics, dictionary).todense()
train_lsi = [np.array(x[0])[0] for x in train_lsi]

#Take the average of each topic weighting amongst good reviews and graph each topic
lsi_topic_strings = ["Topic " + str(x) for x in range(0,int(lsi.num_topics))]

lsi_topic_dict = {}
for review in train_lsi:
    for x in range(0,int(lsi.num_topics)):
        try:
            lsi_topic_dict[lsi_topic_strings[x]].append(review[x])
        except:
            lsi_topic_dict[lsi_topic_strings[x]] = [review[x]]
            
average_lsi_weight = {}
for x in range(0,int(lsi.num_topics)):
    average_lsi_weight[lsi_topic_strings[x]] = reduce(lambda x, y: x + y, 
                                                      lsi_topic_dict[lsi_topic_strings[x]]) 
    / len(lsi_topic_dict[lsi_topic_strings[x]])

In [64]:
##############    
#Plot the average weights for each topic in the good reviews
##############     
#Find the average topic weights for each topic
lsi_average_topics = pd.DataFrame.from_dict(average_lsi_weight, orient = 'index')
lsi_average_topics.columns = ['topic_weight']
lsi_average_topics['topic'] = lsi_average_topics.index
lsi_average_topics['topic'] = [int(x[5:8]) for x in lsi_average_topics['topic']]
lsi_average_topics = lsi_average_topics.sort_values(['topic'])
x_max = lsi_average_topics.sort_values('topic_weight')['topic_weight'][-2] + 1

#Make the plot
lsi_good_plt = lsi_average_topics.plot(x='topic', y='topic_weight', kind='scatter', legend=False)
yml.label_point(lsi_average_topics.topic, lsi_average_topics.topic_weight, lsi_good_plt)