Import necessary modules


In [56]:
import json
import pandas as pd
import re
import string
from scipy import sparse
import numpy as np
from pymongo import MongoClient
from nltk.corpus import stopwords
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from gensim import corpora, models, similarities, matutils
import tqdm

Import data files and word dictionaries


In [14]:
lh_neg = open('../input/negative-words.txt', 'r').read()
lh_neg = lh_neg.split('\n')
lh_pos = open('../input/positive-words.txt', 'r').read()
lh_pos = lh_pos.split('\n')
restaurant_reviews = json.load(open("../input/rest_review_dictionary.json"))
user_reviews_json = json.load(open("../input/user_review_dictionary.json"))
word_list = list(set(lh_pos + lh_neg))

Pick a subset of users that have at least 200 reviews, run an iterative test on these users


In [15]:
bigusers = []
for i in tqdm.tqdm(range(0, len(user_reviews_json.keys()))):
    if len(user_reviews_json[user_reviews_json.keys()[i]]) > 100:
        bigusers.append(user_reviews_json.keys()[i])


100%|██████████| 50/50 [00:00<00:00, 99391.09it/s]

Let's pick a specific user and start building out the recommendation on her data


In [121]:
user_specific_reviews = user_reviews_json[bigusers[3]]

Create a separate list for the review text and the review ratings, then aggregate them into a dataframe.


In [122]:
user_reviews = []
user_ratings = []
business_ids = []

for review in user_specific_reviews:
    user_reviews.append(review['text'])
    user_ratings.append(review['stars'])
    business_ids.append(review['business_id'])

user_reviews = [review.encode('utf-8').translate(None, string.punctuation) for review in user_reviews]
    
user_df = pd.DataFrame({'review_text': user_reviews, 'rating': user_ratings, 'biz_id': business_ids})

Connect to the mongodb and pull in the relevant restaurant reviews


In [123]:
#Start a connection with the AWS instance and pull in the business reviews database
ip = '54.146.170.140'
conn = MongoClient(ip, 27017)
conn.database_names()
db = conn.get_database('cleaned_data')
reviews = db.get_collection('restaurant_reviews')

In [124]:
restreview = {}
for i in tqdm.tqdm(range(0, len(business_ids))):
    rlist = []
    for obj in reviews.find({'business_id':business_ids[i]}):
        rlist.append(obj)
    restreview[business_ids[i]] = rlist


100%|██████████| 226/226 [03:54<00:00,  1.02s/it]

Create a separate list for each review for the businesses that show up in the business_id list. Remove all reviews that relate to the current user.


In [125]:
user_id = user_reviews_json.keys()[29]
rest_reviews = []
rest_ratings = []
biz_ids = []
for i in tqdm.tqdm(range(0, len(restreview.keys()))):
    for restaurant in restreview[restreview.keys()[i]]:
        if restaurant['user_id'] != user_id:
            rest_reviews.append(restaurant['text'])
            rest_ratings.append(restaurant['stars'])
            biz_ids.append(restreview.keys()[i])
        else:
            pass
restaurant_df = pd.DataFrame({'review_text': rest_reviews, 'rating': rest_ratings, 'biz_id': biz_ids})


100%|██████████| 226/226 [00:00<00:00, 2542.76it/s]

In [209]:
#Feature objects and functions
stop_words = set(stopwords.words('english'))

def sent_percent(review):
    regex_words = re.compile('[a-z]+')
    words = [x.lower() for x in review.split(' ')]
    words = [x for x in words if regex_words.match(x)]
    pos_count, neg_count = 0, 0
    for word in words:
        if word in lh_pos:
            pos_count += 1
        elif word in lh_neg:
            neg_count += 1
    return [float(pos_count)/float(len(words)), float(neg_count)/float(len(words))]

pos_vectorizer = CountVectorizer(vocabulary = lh_pos)
neg_vectorizer = CountVectorizer(vocabulary = lh_neg)
class SentimentPercentage(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, reviews):
        ##Take in a list of textual reviews and return a list with two elements:
        ##[Positive Percentage, Negative Percentage]
        pos_vect = pos_vectorizer.transform(reviews)
        neg_vect = neg_vectorizer.transform(reviews)
        features = []
        
        for i in range(0, len(reviews)):
            sent_percentage = []
            sent_percentage.append(float(pos_vect[i].sum())/float(len(reviews[i])))
            sent_percentage.append(float(neg_vect[i].sum())/float(len(reviews[i])))
            features.append(sent_percentage)
            
        return np.array(features)

    def fit(self, reviews, y=None, n_grams = None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class TfIdfGramTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, reviews):
        tf_vector = vectorizer.transform(reviews)
        return tf_vector

    def fit(self, reviews, y=None, n_grams = (0,1)):
        vectorizer = TfidfVectorizer(ngram_range = n_grams, stop_words = 'english')
        vectorizer.fit(reviews)
        """Returns `self` unless something different happens in train and test"""
        return vectorizer

Testing

Whether or not a user likes a recommendation is hard to capture because we don't know if they like it or not until after we recommend it. In the above design, we'd be recommending a new restaurant to the user. To see if they actually like it, we'd have to follow up after we make the recommendation and ask them how they felt. But this isn't feasible because we don't have a set of people we can just ask how they felt.

That is, we know $y_{pred}$ but we don't know $y_{actual}$.

But we can get around this because sometimes a user review rating is also that user's restaurant rating. For a given user, if she only has one review for a restaurant then the rating for that review is also her rating for the restaurant.

We can use this to test our recommendation system. We propose the following test design:

Build:

  • As before, let $B$ be the total set of user reviews.
  • Let $R$ be the set of restaurants that the user has reviewed only once.
  • Take some percentage, $p$, of the set $R$ and take the subset of reviews from $B$ that correspond to these restaurants. Let this be the set of test restaurants $B_{test}$.
  • Set the remaining $1-p$ percentage of the set $R$ and call this the training set of restaurants $R_{train}$.
  • Note every restaurant in $R_{train}$ and $B_{test}$ has User's Restaurant Rating = User's Review Rating

Run:

  1. For each review in $B_{test}$, create the tuple (User Review, Restaurant Rating, Restaurant ID) and replace the instance in $B_{test}$ with the tuple.

  2. For each restaurant in $R_{train}$, find the total set of reviews from the Reviews database. Let this set be $Y$, where each element in $Y$ is a tuple (User Review, Restaurant Rating, Restaurant ID)

  3. We run each of the algorithms above, using $Y$ as the training set and $B_{test}$ as the test set.

  4. Step 3 results in a set $B_{result}$ where each element is characterized by (User Review, Actual Restaurant Rating, Predicted Restaurant Rating, Predicted Restaurant). Note the cardinality of $B_{result}$ is the same as that of $B_{test}$
  5. Set $y_{pred} =$ I(Predicted Restaurant) and $y_{actual} =$ I(Restaurant) where the indicator function I() is 1 if the user rated the restaurant at least a 4 and is 0 if the user rated the restaurant less than a 4
  6. The RMSE for the recommended restaurants is given by the following loss function:
$$RMSE = \sum_{i=1}^{N} \sqrt{\frac{(y_{i, pred} - y_{i, actual})^{2}}{N}}$$

Where N is the number of recommended restaurants in $B_{result}$. $y_{i, pred}$ is the predicted restaurant rating, $y_{i, actual}$ is the actual rating that the user gave to the restaurant. A RMSE score of 0 is a perfect score and means that the recommendation system did really well. In this case, success means that the recommendation system was able to accurately predict how the user would feel about the restaurant on a binary scale (good or bad).

Note, this is equivalent to using the mean absolute error because of our label construction. This function is analagous to the mean squared error loss function used in the 2013 Yelp RecSys challenge with the difference being that $y_{i, pred}$ and $y_{i, actual}$ are discrete categorical variables $\in \{1, 2, 3, 4, 5\}$:

$$RMSE = \sum_{i=1}^{N} \sqrt{\frac{(y_{i, pred} - y_{i, actual})^{2}}{N}}$$

First, start by splitting the restaurants that the user has reviewed into training and test sets


In [ ]:
#Benchmark is simply 50/50 for each prediciton, so let's take a look at the log loss for that case
benchmark_results = [0.5] * len(test_labels)
print "The number to beat is: " + str(log_loss(test_labels, benchmark_results))

In [126]:
#Now, let's do this process iteratively for a larger sub sample of test reviews
#First, start by splitting the restaurants that the user has reviewed into training and test sets
split_samp = .20
test_set = business_ids[0:int(len(business_ids) * split_samp)]
training_set = business_ids[int(len(business_ids) * split_samp): len(business_ids)]
train_reviews, train_ratings = [], []

for rest_id in training_set:
    train_reviews.extend(list(user_df[user_df['biz_id'] == rest_id]['review_text']))
    train_ratings.extend(list(user_df[user_df['biz_id'] == rest_id]['rating']))

#Transform the star labels into a binary class problem, 0 if rating is < 4 else 1
train_labels = [1 if x >=4 else 0 for x in train_ratings ]

Visualization ideas:

  1. The results across all users. Which ML algorithms performed best? Was there a clear winner or did people differ?
  2. Show the performance of the algorithms across accuracy types (log-loss, mean squared error, accuracy score). Note that in our case accuracy score = 1 - mean squared error and that mean squared error = abs error
    • Log-loss will be more informative in our case. The probabilities will be different depending on the algorithm, we can get a measure of "how confident" our algorithms are
  3. Graph the mean squared error against the threshold probability we use to classify as 0 or 1

Choosing the top 50 by probability performs better than focusing on the entire set, show this graphically


In [236]:
###########################
####LSI Features
###########################
texts = [[word for word in review.lower().split() if (word not in stop_words)]
          for review in train_reviews]
dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

numpy_matrix = matutils.corpus2dense(corpus, num_terms=50000)
singular_values = np.linalg.svd(numpy_matrix, full_matrices=False, compute_uv=False)
mean_sv = sum(list(singular_values))/len(singular_values)
topics = int(mean_sv)
stop_words = set(stopwords.words('english'))

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
index = similarities.MatrixSimilarity(lsi[corpus_tfidf], num_features = 10000) 

train_lsi = lsi[corpus_tfidf]
train_lsi = [[train[1] for train in train_review] for train_review in train_lsi]
train_lsi = [[0.0000000001] * topics if len(x) != topics else x for x in train_lsi]
train_lsi = sparse.coo_matrix(train_lsi)

In [194]:
train_features = train_lsi
#XGBoost training
#gbm = xgb.XGBClassifier(max_depth=10, n_estimators=400, learning_rate=0.02).fit(train_features, train_labels)
#RandomForest training
#rf = RandomForestClassifier()
rf.fit(train_features, train_labels)
#SVM training
svm_classifier = svm.SVC(kernel='linear')
svm_classifier.fit(train_features, train_labels)
error = []
for i in tqdm.tqdm(range(0,len(test_set))):
    predicted_rating = 0
    #Get reviews for that restaurant
    test_reviews =[]
    test_reviews.extend(list(restaurant_df[restaurant_df['biz_id'] == test_set[i]]['review_text']))
    
    #Transform features
    test_features = comb_features.transform(test_reviews)
    
    #LSI Features
    test_texts = [[word for word in test_set[i].lower().split() if (word not in stop_words)]
          for review in test_reviews]
    test_corpus = [dictionary.doc2bow(test) for test in test_texts]
    test_tfidf = tfidf[test_corpus]
    test_lsi = lsi[test_tfidf]
    test_lsi = [[test[1] for test in test_review] for test_review in test_lsi]
    test_lsi = [[0.0000000001] * topics if len(x) != topics else x for x in test_lsi]
    
    test_lsi = sparse.coo_matrix(test_lsi)
    #stacked_test_features = sparse.hstack((test_features, test_lsi))
    stacked_test_features = test_lsi
    #Get XGBoost prediction
    #test_prediction = gbm.predict(stacked_test_features)
    #Get SVM prediction
    #test_prediction = svm_classifier.predict(stacked_test_features)
    #Get Random Forest prediction
    test_prediction = rf.predict(stacked_test_features)   

    if test_prediction.mean() > 0.7:
        predicted_rating = 1

    actual_rating = list(user_df[user_df['biz_id'] == test_set[i]]['rating'])[0]
    if actual_rating >= 4:
        actual_rating = 1
    else:
        actual_rating = 0

    error.append(abs(predicted_rating - actual_rating))


100%|██████████| 45/45 [00:05<00:00,  7.98it/s]

In [195]:
print "The LSA mean absolute error is: " + str(sum(error) / float(len(error)))
#print "The svm (1,1) average mean absolute error is: " + str(sum(svm_error) / float(len(svm_error)))


The LSA mean absolute error is: 0.533333333333

In [373]:
comb_features = FeatureUnion([('sent_percent',SentimentPercentage()),('tf', TfIdfGramTransformer()), 
                              ('lda', Pipeline([('bow', TfidfVectorizer(stop_words='english',ngram_range=(1,1))), 
                                        ('lda_transform', LatentDirichletAllocation(n_topics=int(mean_sv)))]))
                             ])

comb_features.fit(train_reviews)
train_features = comb_features.transform(train_reviews)

#XGBoost training
#gbm = xgb.XGBClassifier(max_depth=10, n_estimators=400, learning_rate=0.02).fit(train_features, train_labels)
#RandomForest training
rf = RandomForestClassifier(max_depth = 100, max_leaf_nodes=50)
rf.fit(train_features, train_labels)
#SVM training
# svm_classifier = svm.LinearSVC()
# svm_classifier.fit(train_features, train_labels)
test_error = []
for i in tqdm.tqdm(range(0,len(test_set))):
    predicted_rating = 0
    #Get reviews for that restaurant
    test_reviews =[]
    test_reviews.extend(list(restaurant_df[restaurant_df['biz_id'] == test_set[i]]['review_text']))
    
    #Transform features
    stacked_test_features = comb_features.transform(test_reviews)
    
    #Get XGBoost prediction
    #test_prediction = gbm.predict(stacked_test_features)
    #Get SVM prediction
    #test_prediction = svm_classifier.predict(stacked_test_features)
    #Get Random Forest prediction
    test_prediction = rf.predict(stacked_test_features)   
    if test_prediction.mean() > 0.7:
        predicted_rating = 1

    actual_rating = list(user_df[user_df['biz_id'] == test_set[i]]['rating'])[0]
    if actual_rating >= 4:
        actual_rating = 1
    else:
        actual_rating = 0

    test_error.append(abs(predicted_rating - actual_rating))


/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/sklearn/decomposition/online_lda.py:508: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
100%|██████████| 45/45 [00:05<00:00,  8.99it/s]

In [374]:
print "The LDA mean absolute error is: " + str(sum(test_error) / float(len(test_error)))


The LDA mean absolute error is: 0.555555555556

In [228]:
from sklearn.naive_bayes import MultinomialNB

In [376]:
comb_features = FeatureUnion([('sent_percent',SentimentPercentage()),('tf', TfIdfGramTransformer()), 
                              ('lda', Pipeline([('bow', TfidfVectorizer(stop_words='english', ngram_range = (1,1))), 
                                        ('lda_transform', LatentDirichletAllocation(n_topics=500))]))
                             ])

comb_features.fit(train_reviews)
train_features = comb_features.transform(train_reviews)
train_features = sparse.hstack((train_features, train_lsi))
train_features = train_features.todense()

#XGBoost training
#gbm = xgb.XGBClassifier(max_depth=10, n_estimators=500, learning_rate=0.02, ).fit(train_features, train_labels)
#RandomForest training
#rf = RandomForestClassifier()
#rf.fit(train_features, train_labels)
#SVM training
svm_classifier = svm.LinearSVC()
svm_classifier.fit(train_features, train_labels)
#Naive Bayes Training, note Naive Bayes requires a dense matrix for training and testing
#nb_clf = GaussianNB()
#nb_clf.fit(train_features, train_labels)
comb_error = []
test_predictions = []
for i in tqdm.tqdm(range(0,len(test_set))):
    predicted_rating = 0
    #Get reviews for that restaurant
    test_reviews =[]
    test_reviews.extend(list(restaurant_df[restaurant_df['biz_id'] == test_set[i]]['review_text']))
    
    #Transform features
    test_features = comb_features.transform(test_reviews)
    
    #LSI Features
    test_texts = [[word for word in review.lower().split() if (word not in stop_words)]
          for review in test_reviews]
    test_corpus = [dictionary.doc2bow(test) for test in test_texts]
    test_tfidf = tfidf[test_corpus]
    test_lsi = lsi[test_tfidf]
    test_lsi = [[test[1] for test in test_review] for test_review in test_lsi]
    test_lsi = [[0.000000000001] * topics if len(x) != topics else x for x in test_lsi]
    
    test_lsi = sparse.coo_matrix(test_lsi)
    stacked_test_features = sparse.hstack((test_features, test_lsi))
    stacked_test_features = stacked_test_features.todense()
    #Get XGBoost prediction
    #test_prediction = gbm.predict(stacked_test_features)
    #Get SVM prediction
    test_prediction = svm_classifier.predict(stacked_test_features)
    #Get Random Forest prediction
    #test_prediction = rf.predict(stacked_test_features)   
    #Get NB prediction
    #test_prediction = nb_clf.predict(stacked_test_features)
    
    if test_prediction.mean() > 0.7:
        predicted_rating = 1

    actual_rating = list(user_df[user_df['biz_id'] == test_set[i]]['rating'])[0]
    if actual_rating >= 4:
        actual_rating = 1
    else:
        actual_rating = 0
        
    test_predictions.append((test_prediction, actual_rating))
    
    comb_error.append(abs(predicted_rating - actual_rating))


/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/sklearn/decomposition/online_lda.py:508: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
100%|██████████| 45/45 [00:06<00:00,  7.37it/s]

In [377]:
print "The LDA + LSA mean absolute error is: " + str(sum(comb_error) / float(len(comb_error)))


The LDA + LSA mean absolute error is: 0.4

In [378]:
confidence_tuple = [(float(sum(list(x[0])))/float(len(x[0])),x[1]) for x in test_predictions]
confidence_tuple.sort()
top5 = confidence_tuple[-10:]
print "Got a rec accuracy of: " + str(float(sum([x[1] for x in confidence_tuple[-10:]]))/float(10))


Got a rec accuracy of: 0.8