Import necessary modules
In [56]:
import json
import pandas as pd
import re
import string
from scipy import sparse
import numpy as np
from pymongo import MongoClient
from nltk.corpus import stopwords
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from gensim import corpora, models, similarities, matutils
import tqdm
Import data files and word dictionaries
In [14]:
# Load the negative/positive word lists (split on newlines) and the two review
# dictionaries. Context managers close every file handle as soon as it has
# been read instead of leaving it open until garbage collection.
with open('../input/negative-words.txt', 'r') as neg_file:
    lh_neg = neg_file.read().split('\n')
with open('../input/positive-words.txt', 'r') as pos_file:
    lh_pos = pos_file.read().split('\n')
with open("../input/rest_review_dictionary.json") as rest_file:
    restaurant_reviews = json.load(rest_file)
with open("../input/user_review_dictionary.json") as user_file:
    user_reviews_json = json.load(user_file)
# De-duplicated union of both word lists.
word_list = list(set(lh_pos + lh_neg))
Pick a subset of users that have more than 100 reviews, then run an iterative test on these users
In [15]:
# Collect the keys of all users that have more than 100 reviews.
# Iterating over the dictionary items visits users in the same order as
# ``keys()`` but avoids rebuilding the key list on every step
# (``dict.keys()[i]`` is O(n) per lookup on Python 2 and is not indexable at
# all on Python 3).
bigusers = []
for uid, uid_reviews in tqdm.tqdm(user_reviews_json.items()):
    if len(uid_reviews) > 100:
        bigusers.append(uid)
Let's pick a specific user and start building out the recommendation on her data
In [121]:
# Reviews of the single user being modelled: the entry at index 3 of `bigusers`.
user_specific_reviews = user_reviews_json[bigusers[3]]
Create a separate list for the review text and the review ratings, then aggregate them into a dataframe.
In [122]:
# Split the selected user's reviews into three parallel columns.
user_ratings = [entry['stars'] for entry in user_specific_reviews]
business_ids = [entry['business_id'] for entry in user_specific_reviews]
# Encode each text as UTF-8 and delete every punctuation character
# (two-argument byte-string ``translate``).
user_reviews = [
    entry['text'].encode('utf-8').translate(None, string.punctuation)
    for entry in user_specific_reviews
]
user_df = pd.DataFrame({
    'review_text': user_reviews,
    'rating': user_ratings,
    'biz_id': business_ids,
})
Connect to the mongodb and pull in the relevant restaurant reviews
In [123]:
#Start a connection with the AWS instance and pull in the business reviews database
# NOTE(review): the host address is hard-coded here; consider moving it to configuration.
ip = '54.146.170.140'
conn = MongoClient(ip, 27017)
# The returned list of database names is discarded; presumably left over from
# interactive inspection -- confirm before removing.
conn.database_names()
# Handle on the 'restaurant_reviews' collection inside the 'cleaned_data' database.
db = conn.get_database('cleaned_data')
reviews = db.get_collection('restaurant_reviews')
In [124]:
# Map every business the user reviewed to the list of all review documents
# found for it in the `reviews` collection.
restreview = {}
for biz in tqdm.tqdm(business_ids):
    restreview[biz] = list(reviews.find({'business_id': biz}))
Create a separate list for each review for the businesses that show up in the business_id list. Remove all reviews that relate to the current user.
In [125]:
# Flatten the per-business review documents into parallel lists, leaving out
# every review written by the user we are building recommendations for.
# The user id is `bigusers[3]` -- the same key that was used to select
# `user_specific_reviews` -- instead of an unrelated hard-coded position in
# the key list, so the filter really removes the current user's own reviews.
user_id = bigusers[3]
rest_reviews = []
rest_ratings = []
biz_ids = []
for biz_id, biz_reviews in tqdm.tqdm(restreview.items()):
    for restaurant in biz_reviews:
        if restaurant['user_id'] == user_id:
            # Skip the current user's own review of this business.
            continue
        rest_reviews.append(restaurant['text'])
        rest_ratings.append(restaurant['stars'])
        biz_ids.append(biz_id)
restaurant_df = pd.DataFrame({'review_text': rest_reviews, 'rating': rest_ratings, 'biz_id': biz_ids})
In [209]:
#Feature objects and functions
# English stop words from the NLTK corpus, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
def sent_percent(review):
    """Return the share of positive and negative lexicon words in a review.

    The review is split on single spaces and lower-cased; tokens that do not
    start with a lowercase ASCII letter are discarded. Each remaining token
    is counted as positive if it is in ``lh_pos``, otherwise as negative if
    it is in ``lh_neg``.

    Args:
        review: Review text.

    Returns:
        ``[positive_fraction, negative_fraction]`` relative to the number of
        retained tokens, or ``[0.0, 0.0]`` when no token is retained.
    """
    regex_words = re.compile('[a-z]+')
    lowered = (token.lower() for token in review.split(' '))
    words = [token for token in lowered if regex_words.match(token)]
    if not words:
        # Nothing to score (empty or purely numeric/symbolic text); avoids a
        # ZeroDivisionError in the ratios below.
        return [0.0, 0.0]
    # Sets give O(1) membership tests instead of scanning the lexicon lists
    # once per word.
    positive_words, negative_words = set(lh_pos), set(lh_neg)
    pos_count, neg_count = 0, 0
    for word in words:
        if word in positive_words:
            pos_count += 1
        elif word in negative_words:
            neg_count += 1
    total = float(len(words))
    return [pos_count / total, neg_count / total]
# Count vectorizers whose vocabulary is fixed to the positive / negative word
# lists; SentimentPercentage sums their rows to count lexicon hits per review.
pos_vectorizer = CountVectorizer(vocabulary = lh_pos)
neg_vectorizer = CountVectorizer(vocabulary = lh_neg)
class SentimentPercentage(BaseEstimator, TransformerMixin):
    """Turn each review into a pair of lexicon-based sentiment ratios.

    Relies on the module-level ``pos_vectorizer`` and ``neg_vectorizer`` to
    count positive and negative lexicon terms in every review.
    """

    def __init__(self):
        pass

    def transform(self, reviews):
        """Compute ``[positive_ratio, negative_ratio]`` for every review.

        Args:
            reviews: Sequence of review strings.

        Returns:
            Array with one ``[positive, negative]`` row per review. Each
            value is the number of lexicon hits divided by ``len(review)``,
            i.e. the review's length in characters. An empty review yields
            ``[0.0, 0.0]`` instead of raising ZeroDivisionError.
        """
        pos_vect = pos_vectorizer.transform(reviews)
        neg_vect = neg_vectorizer.transform(reviews)
        # Row sums = number of lexicon hits per review.
        pos_hits = np.asarray(pos_vect.sum(axis=1)).ravel()
        neg_hits = np.asarray(neg_vect.sum(axis=1)).ravel()
        features = []
        for review, pos, neg in zip(reviews, pos_hits, neg_hits):
            length = float(len(review))
            if length == 0:
                features.append([0.0, 0.0])
            else:
                features.append([float(pos) / length, float(neg) / length])
        return np.array(features)

    def fit(self, reviews, y=None, n_grams = None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self
class TfIdfGramTransformer(BaseEstimator, TransformerMixin):
    """TF-IDF features over n-grams with English stop words removed.

    The fitted ``TfidfVectorizer`` is kept on the instance as
    ``vectorizer_`` so that ``transform`` no longer depends on a global name.
    """

    def __init__(self):
        pass

    def transform(self, reviews):
        """Return the TF-IDF matrix of ``reviews`` using the fitted vectorizer."""
        return self.vectorizer_.transform(reviews)

    def fit(self, reviews, y=None, n_grams = (0,1)):
        """Fit a ``TfidfVectorizer`` on ``reviews`` and return ``self``.

        Args:
            reviews: Sequence of review strings.
            y: Ignored; present for scikit-learn API compatibility.
            n_grams: ``(min_n, max_n)`` n-gram range. The lower bound is
                raised to 1 because n-gram sizes start at 1.
        """
        ngram_range = (max(1, n_grams[0]), n_grams[1])
        self.vectorizer_ = TfidfVectorizer(ngram_range = ngram_range, stop_words = 'english')
        self.vectorizer_.fit(reviews)
        # Return the transformer itself (not the inner vectorizer) so it
        # composes correctly inside Pipeline / FeatureUnion.
        return self
Whether or not a user likes a recommendation is hard to capture because we don't know if they like it or not until after we recommend it. In the above design, we'd be recommending a new restaurant to the user. To see if they actually like it, we'd have to follow up after we make the recommendation and ask them how they felt. But this isn't feasible because we don't have a set of people we can just ask how they felt.
That is, we know $y_{pred}$ but we don't know $y_{actual}$.
But we can get around this because sometimes a user review rating is also that user's restaurant rating. For a given user, if she only has one review for a restaurant then the rating for that review is also her rating for the restaurant.
We can use this to test our recommendation system. We propose the following test design:
Build:
Run:
For each review in $B_{test}$, create the tuple (User Review, Restaurant Rating, Restaurant ID) and replace the instance in $B_{test}$ with the tuple.
For each restaurant in $R_{train}$, find the total set of reviews from the Reviews database. Let this set be $Y$, where each element in $Y$ is a tuple (User Review, Restaurant Rating, Restaurant ID)
We run each of the algorithms above, using $Y$ as the training set and $B_{test}$ as the test set.
Here, $N$ is the number of recommended restaurants in $B_{result}$, $y_{i, pred}$ is the predicted restaurant rating, and $y_{i, actual}$ is the actual rating that the user gave to the restaurant. An RMSE score of 0 is a perfect score and means that the recommendation system did really well. In this case, success means that the recommendation system was able to accurately predict how the user would feel about the restaurant on a binary scale (good or bad).
Note that, because of our label construction, this is equivalent to using the mean absolute error: with binary labels every squared error equals the corresponding absolute error, so both scores rank systems identically. This function is analogous to the mean squared error loss function used in the 2013 Yelp RecSys challenge, with the difference being that $y_{i, pred}$ and $y_{i, actual}$ are discrete categorical variables $\in \{1, 2, 3, 4, 5\}$:
$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^{N} (y_{i, pred} - y_{i, actual})^{2}}$$

First, start by splitting the restaurants that the user has reviewed into training and test sets.
In [ ]:
#Benchmark is simply 50/50 for each prediction, so let's take a look at the log loss for that case
# NOTE(review): `test_labels` is not assigned anywhere in the visible part of this notebook.
benchmark_results = [0.5] * len(test_labels)
benchmark_score = log_loss(test_labels, benchmark_results)
print("The number to beat is: " + str(benchmark_score))
In [126]:
#Now, let's do this process iteratively for a larger sub sample of test reviews
#First, start by splitting the restaurants that the user has reviewed into training and test sets
split_samp = .20
# The leading 20% of the user's businesses are held out; the remainder trains the model.
cutoff = int(len(business_ids) * split_samp)
test_set = business_ids[:cutoff]
training_set = business_ids[cutoff:]
train_reviews, train_ratings = [], []
for rest_id in training_set:
    matching_rows = user_df[user_df['biz_id'] == rest_id]
    train_reviews.extend(list(matching_rows['review_text']))
    train_ratings.extend(list(matching_rows['rating']))
#Transform the star labels into a binary class problem, 0 if rating is < 4 else 1
train_labels = [0 if stars < 4 else 1 for stars in train_ratings]
Choosing the top 50 by probability performs better than focusing on the entire set, show this graphically
In [236]:
###########################
####LSI Features
###########################
# Tokenise every training review: lower-case, split on whitespace, drop stop words.
texts = [[word for word in review.lower().split() if (word not in stop_words)]
for review in train_reviews]
# gensim vocabulary and bag-of-words corpus built from the tokenised reviews.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# NOTE(review): num_terms is fixed at 50000 rather than derived from the
# dictionary -- confirm the vocabulary can never exceed that size.
numpy_matrix = matutils.corpus2dense(corpus, num_terms=50000)
# Heuristic: the number of topics is the integer part of the mean singular
# value of the dense term-document matrix.
singular_values = np.linalg.svd(numpy_matrix, full_matrices=False, compute_uv=False)
mean_sv = sum(list(singular_values))/len(singular_values)
topics = int(mean_sv)
# NOTE(review): `stop_words` is re-assigned here, after it was already used
# above, with the same expression as in the feature-definition cell.
stop_words = set(stopwords.words('english'))
# TF-IDF weighting of the corpus, then an LSI model with `topics` dimensions.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
# NOTE(review): `index` is not used in the visible part of this notebook.
index = similarities.MatrixSimilarity(lsi[corpus_tfidf], num_features = 10000)
train_lsi = lsi[corpus_tfidf]
# Keep only the second element of every pair in each document's LSI vector.
train_lsi = [[train[1] for train in train_review] for train_review in train_lsi]
# Rows that do not have exactly `topics` entries are replaced by a constant
# filler vector so that every row has the same width.
train_lsi = [[0.0000000001] * topics if len(x) != topics else x for x in train_lsi]
train_lsi = sparse.coo_matrix(train_lsi)
In [194]:
# LSA-only experiment: train on the user's LSI vectors, then for every
# held-out restaurant classify the other users' reviews and compare the
# aggregated prediction with the user's own (binarised) rating.
train_features = train_lsi
#XGBoost training
#gbm = xgb.XGBClassifier(max_depth=10, n_estimators=400, learning_rate=0.02).fit(train_features, train_labels)
#RandomForest training
# The classifier is created here so this cell no longer depends on an `rf`
# object left behind by another cell.
rf = RandomForestClassifier()
rf.fit(train_features, train_labels)
#SVM training
svm_classifier = svm.SVC(kernel='linear')
svm_classifier.fit(train_features, train_labels)
error = []
for biz_id in tqdm.tqdm(test_set):
    predicted_rating = 0
    #Get reviews for that restaurant
    test_reviews = list(restaurant_df[restaurant_df['biz_id'] == biz_id]['review_text'])
    #LSI Features
    # Tokenise the review text itself. The previous version tokenised the
    # business id, so every review of a restaurant got the same, meaningless
    # vector. The unused `comb_features.transform` call was dropped: its
    # result was never read and `comb_features` is only created in a later cell.
    test_texts = [[word for word in review.lower().split() if (word not in stop_words)]
                  for review in test_reviews]
    test_corpus = [dictionary.doc2bow(test) for test in test_texts]
    test_tfidf = tfidf[test_corpus]
    test_lsi = lsi[test_tfidf]
    test_lsi = [[test[1] for test in test_review] for test_review in test_lsi]
    # Same fixed-width filler as used for the training rows.
    test_lsi = [[0.0000000001] * topics if len(x) != topics else x for x in test_lsi]
    test_lsi = sparse.coo_matrix(test_lsi)
    #stacked_test_features = sparse.hstack((test_features, test_lsi))
    stacked_test_features = test_lsi
    #Get XGBoost prediction
    #test_prediction = gbm.predict(stacked_test_features)
    #Get SVM prediction
    #test_prediction = svm_classifier.predict(stacked_test_features)
    #Get Random Forest prediction
    test_prediction = rf.predict(stacked_test_features)
    # Recommend only when more than 70% of the reviews are classified positive.
    if test_prediction.mean() > 0.7:
        predicted_rating = 1
    actual_rating = list(user_df[user_df['biz_id'] == biz_id]['rating'])[0]
    # Binarise the user's star rating the same way as the training labels.
    actual_rating = 1 if actual_rating >= 4 else 0
    error.append(abs(predicted_rating - actual_rating))
In [195]:
# Mean absolute error over all held-out restaurants for the LSA-only model.
lsa_mae = sum(error) / float(len(error))
print("The LSA mean absolute error is: " + str(lsa_mae))
#print "The svm (1,1) average mean absolute error is: " + str(sum(svm_error) / float(len(svm_error)))
In [373]:
# LDA experiment: sentiment ratios + TF-IDF + LDA topic features feed a random
# forest; each held-out restaurant is scored from the other users' reviews.
lda_pipeline = Pipeline([
    ('bow', TfidfVectorizer(stop_words='english', ngram_range=(1, 1))),
    ('lda_transform', LatentDirichletAllocation(n_topics=int(mean_sv))),
])
comb_features = FeatureUnion([
    ('sent_percent', SentimentPercentage()),
    ('tf', TfIdfGramTransformer()),
    ('lda', lda_pipeline),
])
comb_features.fit(train_reviews)
train_features = comb_features.transform(train_reviews)
#XGBoost training
#gbm = xgb.XGBClassifier(max_depth=10, n_estimators=400, learning_rate=0.02).fit(train_features, train_labels)
#RandomForest training
rf = RandomForestClassifier(max_depth = 100, max_leaf_nodes=50)
rf.fit(train_features, train_labels)
#SVM training
# svm_classifier = svm.LinearSVC()
# svm_classifier.fit(train_features, train_labels)
test_error = []
for biz_id in tqdm.tqdm(test_set):
    #Get reviews for that restaurant
    biz_rows = restaurant_df[restaurant_df['biz_id'] == biz_id]
    test_reviews = list(biz_rows['review_text'])
    #Transform features
    stacked_test_features = comb_features.transform(test_reviews)
    #Get XGBoost prediction
    #test_prediction = gbm.predict(stacked_test_features)
    #Get SVM prediction
    #test_prediction = svm_classifier.predict(stacked_test_features)
    #Get Random Forest prediction
    test_prediction = rf.predict(stacked_test_features)
    # Predict "good" only when the mean predicted label exceeds 0.7.
    predicted_rating = 1 if test_prediction.mean() > 0.7 else 0
    # The user's own star rating, binarised at 4 stars.
    user_stars = list(user_df[user_df['biz_id'] == biz_id]['rating'])[0]
    actual_rating = 1 if user_stars >= 4 else 0
    test_error.append(abs(predicted_rating - actual_rating))
In [374]:
# Mean absolute error over all held-out restaurants for the LDA model.
lda_mae = sum(test_error) / float(len(test_error))
print("The LDA mean absolute error is: " + str(lda_mae))
In [228]:
from sklearn.naive_bayes import MultinomialNB
In [376]:
# Combined experiment: sentiment ratios + TF-IDF + LDA features stacked with
# the LSI vectors, classified with a linear SVM.
comb_features = FeatureUnion([('sent_percent',SentimentPercentage()),('tf', TfIdfGramTransformer()),
                              ('lda', Pipeline([('bow', TfidfVectorizer(stop_words='english', ngram_range = (1,1))),
                                                ('lda_transform', LatentDirichletAllocation(n_topics=500))]))
                             ])
comb_features.fit(train_reviews)
train_features = comb_features.transform(train_reviews)
train_features = sparse.hstack((train_features, train_lsi))
# `toarray()` yields a plain ndarray; `todense()` would return the legacy
# `numpy.matrix` type.
train_features = train_features.toarray()
#XGBoost training
#gbm = xgb.XGBClassifier(max_depth=10, n_estimators=500, learning_rate=0.02, ).fit(train_features, train_labels)
#RandomForest training
#rf = RandomForestClassifier()
#rf.fit(train_features, train_labels)
#SVM training
svm_classifier = svm.LinearSVC()
svm_classifier.fit(train_features, train_labels)
#Naive Bayes Training, note Naive Bayes requires a dense matrix for training and testing
#nb_clf = GaussianNB()
#nb_clf.fit(train_features, train_labels)
comb_error = []
test_predictions = []
# Filler for LSI rows of the wrong width; the same value that pads the
# training rows, so train and test features are built identically.
lsi_filler = 0.0000000001
for biz_id in tqdm.tqdm(test_set):
    predicted_rating = 0
    #Get reviews for that restaurant
    test_reviews = list(restaurant_df[restaurant_df['biz_id'] == biz_id]['review_text'])
    #Transform features
    test_features = comb_features.transform(test_reviews)
    #LSI Features
    test_texts = [[word for word in review.lower().split() if (word not in stop_words)]
                  for review in test_reviews]
    test_corpus = [dictionary.doc2bow(test) for test in test_texts]
    test_tfidf = tfidf[test_corpus]
    test_lsi = lsi[test_tfidf]
    test_lsi = [[test[1] for test in test_review] for test_review in test_lsi]
    test_lsi = [[lsi_filler] * topics if len(x) != topics else x for x in test_lsi]
    test_lsi = sparse.coo_matrix(test_lsi)
    stacked_test_features = sparse.hstack((test_features, test_lsi))
    stacked_test_features = stacked_test_features.toarray()
    #Get XGBoost prediction
    #test_prediction = gbm.predict(stacked_test_features)
    #Get SVM prediction
    test_prediction = svm_classifier.predict(stacked_test_features)
    #Get Random Forest prediction
    #test_prediction = rf.predict(stacked_test_features)
    #Get NB prediction
    #test_prediction = nb_clf.predict(stacked_test_features)
    # Recommend only when more than 70% of the reviews are classified positive.
    if test_prediction.mean() > 0.7:
        predicted_rating = 1
    actual_rating = list(user_df[user_df['biz_id'] == biz_id]['rating'])[0]
    # Binarise the user's star rating the same way as the training labels.
    actual_rating = 1 if actual_rating >= 4 else 0
    # Keep the raw per-review predictions for the confidence ranking below.
    test_predictions.append((test_prediction, actual_rating))
    comb_error.append(abs(predicted_rating - actual_rating))
In [377]:
# Mean absolute error over all held-out restaurants for the combined model.
comb_mae = sum(comb_error) / float(len(comb_error))
print("The LDA + LSA mean absolute error is: " + str(comb_mae))
In [378]:
# Pair the mean predicted label of every held-out restaurant (the model's
# confidence) with the user's actual binary rating, then sort ascending.
confidence_tuple = [(float(sum(list(x[0])))/float(len(x[0])),x[1]) for x in test_predictions]
confidence_tuple.sort()
# The ten most confident recommendations (fewer when fewer test restaurants
# exist). The name `top5` is kept for any later cell that refers to it.
top5 = confidence_tuple[-10:]
# Divide by the number of recommendations actually taken; a fixed divisor of
# 10 understated the accuracy whenever fewer than ten were available.
if top5:
    print("Got a rec accuracy of: " + str(float(sum([x[1] for x in top5]))/float(len(top5))))
else:
    print("Got a rec accuracy of: n/a (no test predictions)")