Code for visualizations and playing around
In [1]:
# libraries and stuff to make plots look shiny
%matplotlib inline
from collections import defaultdict
import json
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    #now re-enable ticks on the visible spines
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
In [2]:
# loads the pickle files
trainingreviews = pd.io.pickle.read_pickle('./trainingreviews.pkl')
trainingbusiness = pd.io.pickle.read_pickle('./trainingbusiness.pkl')
trainingusers = pd.io.pickle.read_pickle('./trainingusers.pkl')
testreviews = pd.io.pickle.read_pickle('./testreviews.pkl')
testbusiness = pd.io.pickle.read_pickle('./testbusiness.pkl')
testusers = pd.io.pickle.read_pickle('./testusers.pkl')
In [3]:
print 'Total number of reviews in training: ' + str(len(trainingreviews.index))
print 'Total number of reviews in test: ' + str(len(testreviews.index))
trainingusers.head(2)
Out[3]:
In [4]:
business = trainingbusiness.append(testbusiness)
business.tail(2)
Out[4]:
In [5]:
# performs the merge
business = trainingbusiness.append(testbusiness)
users = trainingusers.append(testusers)
TrainMatrix = trainingreviews.merge(business,on="business_id")
TrainMatrix = TrainMatrix.merge(users,on="user_id", how='left')
TestMatrix = testreviews.merge(business,on="business_id", how='left')
TestMatrix = TestMatrix.merge(users,on="user_id", how='left')
print 'Training DF length: ' + str(len(TrainMatrix.index))
print 'Test DF length: ' + str(len(TestMatrix.index))
TrainMatrix.tail(2)
Out[5]:
In [6]:
urc=TrainMatrix.groupby('user_id').review_id.count()
ax=urc.hist(bins=50, log=True)
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Review Count per User");
In [7]:
urc=TrainMatrix.groupby('business_id').review_id.count()
ax=urc.hist(bins=50, log=True)
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Review Count per User");
In [8]:
print "Mean stars over all reviews:",TrainMatrix.rev_stars.mean()
stars=TrainMatrix.rev_stars
ax=stars.hist(bins=5)
remove_border(ax)
plt.xlabel("Star rating")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Star ratings over all reviews");
In [9]:
# testing out the code bits from earlier: index the test rows by which features are missing
df_index = TestMatrix.index.values.tolist()
business_index = pd.isnull(TestMatrix['bus_stars']).tolist()
business_index = [i for i, elem in enumerate(business_index) if elem]
user_index = pd.isnull(TestMatrix['user_average_stars']).tolist()
user_index = [i for i, elem in enumerate(user_index) if elem]
# finds the indices depending on what is missing
missing_both_index = list(set(business_index) & set(user_index))
missing_user_index = list(set(user_index) - set(business_index))
missing_business_index = list(set(business_index) - set(user_index))
missing_none = list(set(df_index) - set(business_index) - set(user_index))
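A quick sanity check that the four index sets partition the test rows:
In [ ]:
# the four buckets should account for every test row exactly once
print 'missing both:', len(missing_both_index)
print 'missing user only:', len(missing_user_index)
print 'missing business only:', len(missing_business_index)
print 'missing none:', len(missing_none)
print 'total test rows:', len(df_index)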
In [10]:
missing_user = TestMatrix.iloc[missing_user_index, :]
missing_user.head(2)
Out[10]:
In [58]:
# visualize the percentage of test rows in each missing-data category
# This is the greatest bar chart you've ever seen in your life
percentages = np.array([len(missing_none), len(missing_business_index),
                        len(missing_user_index), len(missing_both_index)]) / float(len(df_index)) * 100
width = 0.4
ind = np.array(range(1,5))
plt.rc('font', family='sans-serif')
labels = ['Has all','Missing businesses','Missing users', 'Missing both']
barlist = plt.bar(ind, percentages)
plt.grid(axis='y', color='white', linestyle='-', lw=1)
plt.yticks([10,20,30,40])
plt.ylabel('Percentage')
plt.title('Missing Data Breakdown')
fmt = plt.ScalarFormatter(useOffset=False)
plt.gca().xaxis.set_major_formatter(fmt)
plt.xlim(0.8, 5.2)
plt.xticks(ind+width, labels )
remove_border()
for x, y in zip(ind, percentages):
    plt.annotate(("%.2f" + '%%') % y, (x+width, y + 0.8), ha='center')
barlist[0].set_color((0.20588235294117647, 0.7196078431372549, 0.5666666666666667))
barlist[3].set_color((0.20588235294117647, 0.5196078431372549, 0.4666666666666667))
plt.savefig('Missing_data.eps')
In [12]:
"""
takes a dataframe ldf, makes a copy of it, and returns the copy
with all averages and review counts recomputed
this is used when a frame is subsetted.
"""
def recompute_frame(ldf):
ldfu=ldf.groupby('user_id')
ldfb=ldf.groupby('business_id')
user_avg=ldfu.rev_stars.mean()
user_review_count=ldfu.review_id.count()
business_avg=ldfb.rev_stars.mean()
business_review_count=ldfb.review_id.count()
nldf=ldf.copy()
nldf.set_index(['business_id'], inplace=True)
nldf['business_avg']=business_avg
nldf['business_review_count']=business_review_count
nldf.reset_index(inplace=True)
nldf.set_index(['user_id'], inplace=True)
nldf['user_avg']=user_avg
nldf['user_review_count']=user_review_count
nldf.reset_index(inplace=True)
return nldf
In [13]:
# creates a smaller dataframe for users with few reviews and businesses with few reviews
# these values are currently the minimum for the smaller df to work
user_threshold = 2
business_threshold = 4
# smallidf=TrainMatrix[(TrainMatrix.user_review_count < user_threshold) & (TrainMatrix.bus_review_count < business_threshold)]
smallidf=TrainMatrix[(TrainMatrix.bus_review_count < business_threshold)]
smalldf=recompute_frame(smallidf)
In [14]:
# Derpy plot
print "Total Number of Reviews", smalldf.shape[0]
print "Users in this set", smalldf.user_id.unique().shape[0], "Restaurants",smalldf.business_id.unique().shape[0]
plt.figure()
ax=smalldf.groupby('user_id').review_id.count().hist()
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.figure()
ax=smalldf.groupby('business_id').review_id.count().hist()
remove_border(ax)
plt.xlabel("Reviews per restaurant")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
In [15]:
plt.figure()
avg_ratings_by_user=smalldf.groupby('user_id').rev_stars.mean()
ax=avg_ratings_by_user.hist()
remove_border(ax)
plt.xlabel("Average review score")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Average User Rating")
plt.figure()
avg_ratings_by_biz=smalldf.groupby('business_id').rev_stars.mean()
ax=avg_ratings_by_biz.hist()
remove_border(ax)
plt.xlabel("Average review score")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Average Restaurant Rating")
print smalldf.rev_stars.mean()
In [59]:
# make plot for final results
N = 6
fullRMSE = [1.404,1.394,1.310,1.341,1.435, 1.446]
ind = np.arange(N) # the x locations for the groups
width = 0.275 # the width of the bars
plt.rc('font', family='sans-serif')
plt.rcParams.update({'font.size': 10})
fig, ax = plt.subplots()
rects1 = ax.bar(ind, fullRMSE, width, color=dark2_colors[1])
splitRMSE = [1.332,1.338,1.306,1.300,1.431, 1.422]
rects2 = ax.bar(ind+width, splitRMSE, width, color=dark2_colors[0])
randRMSE = [1.434,1.412,1.379,1.377,1.453, 1.474]
rects3 = ax.bar(ind+2*width, randRMSE, width, color=dark2_colors[2])
# add some text for labels, title and axes ticks
ax.set_ylabel('RMSE')
ax.set_title('RMSE by Missing Data Approach and Models')
ax.set_xticks(ind+width+0.125)
ax.set_xticklabels(('Linear Regression', 'Ridge Regression', 'Lasso', 'Elastic Net', 'Random Forest', 'Fact. Machine') )
ax.legend((rects1[0], rects2[0], rects3[0]), ('Mean', 'Predicted values', 'Random values'), fontsize='small', frameon=False)
def autolabel(rects):
    # attach a text label above each bar showing its height
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., height, '%.3f'%height,
                ha='center', va='bottom')
autolabel(rects1)
autolabel(rects2)
autolabel(rects3)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.ylim(1.2,1.55)
remove_border()
plt.savefig('Results.eps')
In [60]:
# compare the best model on its own against the 3 additional techniques
rmse_values = np.array([1.300, 1.312, 1.345, 1.258])
width = 0.4
ind = np.array(range(1,5))
plt.rc('font', family='sans-serif')
plt.rcParams.update({'font.size': 14})
labels = ['No change','Collaborative Filtering','PCA','Segmentation Ensemble']
barlist = plt.bar(ind, rmse_values)
plt.grid(axis='y', color='white', linestyle='-', lw=1)
plt.ylabel('RMSE')
fmt = plt.ScalarFormatter(useOffset=False)
plt.gca().xaxis.set_major_formatter(fmt)
plt.xlim(0.8, 5.2)
plt.xticks(ind+width, labels)
plt.ylim(1.2, 1.4)
plt.title('Best Model + Other Techniques')
remove_border()
for x, y in zip(ind, rmse_values):
    plt.annotate(("%.3f") % y, (x+width, y + 0.0025), ha='center')
plt.savefig('Results2.eps')
In [18]:
def not_so_quick_train(block):
    # replace infinities with NaN, then fill the NaNs, before extracting features
    block = block.replace([np.inf, -np.inf], np.nan)
    block = block.fillna(value=1)
    review_stars_vector = block.rev_stars.values
    user_average_stars = block.user_average_stars.values
    bus_open = block.bus_open.values
    bus_stars = block.bus_stars.values
    bus_review_count = block.bus_review_count.values
    user_review_count = block.user_review_count.values
    features = [user_average_stars, bus_open, bus_stars, bus_review_count, user_review_count]
    X = np.matrix(features).T
    Y = np.matrix(review_stars_vector).T
    return X, Y
In [19]:
from sklearn.cross_validation import train_test_split
X, Y = not_so_quick_train(TrainMatrix)
In [20]:
# spot-check the feature matrix for infinities by printing the first row of the mask
f = np.isinf(X)
for i in f:
    print i
    break
In [21]:
X = np.nan_to_num(X)
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
In [22]:
from sklearn import cross_validation, linear_model
clf = linear_model.LinearRegression().fit(xtrain, ytrain)
print "RMSE: %.2f" % np.sqrt(np.mean((clf.predict(xtest) - ytest) ** 2))
In [23]:
TestMatrix.head(2)
Out[23]:
In [24]:
# keep only the last 5 characters of each address (the zip code)
TestMatrix['bus_full_address'] = [v[-5:] for v in TestMatrix.bus_full_address.values]
TestMatrix.head(2)
Out[24]:
In [25]:
t = TestMatrix.groupby('bus_full_address').aggregate(np.mean)
t = t.fillna(3.67)  # fill missing aggregates with a constant near the overall mean star rating
t.head(2)
Out[25]:
In [26]:
# START CF
TestMatrix2 = testreviews.merge(trainingbusiness,on="business_id")
TestMatrix2 = TestMatrix2.merge(trainingusers,on="user_id")
TestMatrix2.head(2)
Out[26]:
In [27]:
smallidf=TrainMatrix[(TrainMatrix.user_review_count > 80) & (TrainMatrix.bus_review_count > 200)]
smalldf=recompute_frame(smallidf)
print len(smalldf.index)
smalldf.head(2)
Out[27]:
In [28]:
from scipy.stats.stats import pearsonr
def pearson_sim(rest1_reviews, rest2_reviews, n_common):
    """
    Given a subframe of restaurant 1 reviews and a subframe of restaurant 2 reviews,
    where the reviewers are those who have reviewed both restaurants, return
    the Pearson correlation coefficient between the user-average-subtracted ratings.
    The case of zero common reviewers is handled separately. It's
    OK to return a NaN if any of the individual variances are 0.
    """
    if n_common==0:
        rho=0.
    else:
        diff1=rest1_reviews['rev_stars']-rest1_reviews['user_average_stars']
        diff2=rest2_reviews['rev_stars']-rest2_reviews['user_average_stars']
        rho=pearsonr(diff1, diff2)[0]
    return rho
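A quick check of pearson_sim on made-up toy reviews (the numbers below are purely illustrative):
In [ ]:
# hypothetical toy data: two restaurants rated by the same three users
toy1 = pd.DataFrame({'rev_stars': [5., 3., 4.], 'user_average_stars': [4., 3.5, 4.]})
toy2 = pd.DataFrame({'rev_stars': [4., 2., 4.], 'user_average_stars': [4., 3.5, 4.]})
print pearson_sim(toy1, toy2, 3)  # correlation of the mean-centered ratings
print pearson_sim(toy1, toy2, 0)  # zero common support returns 0. by construction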
In [29]:
def get_restaurant_reviews(restaurant_id, df, set_of_users):
    """
    Given a restaurant id and a set of reviewers, return the sub-dataframe of their
    reviews.
    """
    mask = (df.user_id.isin(set_of_users)) & (df.business_id==restaurant_id)
    reviews = df[mask]
    reviews = reviews[~reviews.user_id.duplicated()]
    return reviews
In [30]:
"""
Function
--------
calculate_similarity
Parameters
----------
rest1 : string
The id of restaurant 1
rest2 : string
The id of restaurant 2
df : DataFrame
A dataframe of reviews, such as the smalldf above
similarity_func : func
A function like pearson_sim above which takes two dataframes of individual
restaurant reviews made by a common set of reviewers, and the number of
common reviews. This function returns the similarity of the two restaurants
based on the common reviews.
Returns
--------
A tuple
The first element of the tuple is the similarity and the second the
common support n_common. If the similarity is a NaN, set it to 0
"""
#your code here
def calculate_similarity(rest1, rest2, df, similarity_func):
    # find common reviewers
    rest1_reviewers = df[df.business_id==rest1].user_id.unique()
    rest2_reviewers = df[df.business_id==rest2].user_id.unique()
    common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers)
    n_common=len(common_reviewers)
    # get their reviews of each restaurant
    rest1_reviews = get_restaurant_reviews(rest1, df, common_reviewers)
    rest2_reviews = get_restaurant_reviews(rest2, df, common_reviewers)
    sim=similarity_func(rest1_reviews, rest2_reviews, n_common)
    if np.isnan(sim):
        return 0, n_common
    return sim, n_common
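A spot check on one pair of businesses from the small frame (any two business ids present in smalldf will do):
In [ ]:
# compare the first two businesses in the high-activity subset
b1, b2 = smalldf.business_id.unique()[:2]
print calculate_similarity(b1, b2, smalldf, pearson_sim)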
In [31]:
class Database:
    "A class representing a database of similarities and common supports"
    def __init__(self, df):
        "the constructor, takes a reviews dataframe like smalldf as its argument"
        self.df=df
        self.uniquebizids={v:k for (k,v) in enumerate(df.business_id.unique())}
        keys=self.uniquebizids.keys()
        l_keys=len(keys)
        self.database_sim=np.zeros([l_keys, l_keys])
        self.database_sup=np.zeros([l_keys, l_keys], dtype=np.int)
    def populate_by_calculating(self, similarity_func):
        """
        a populator for every pair of businesses in df. takes similarity_func like
        pearson_sim as argument
        """
        items=self.uniquebizids.items()
        for b1, i1 in items:
            for b2, i2 in items:
                if i1 < i2:
                    sim, nsup=calculate_similarity(b1, b2, self.df, similarity_func)
                    self.database_sim[i1][i2]=sim
                    self.database_sim[i2][i1]=sim
                    self.database_sup[i1][i2]=nsup
                    self.database_sup[i2][i1]=nsup
                elif i1==i2:
                    nsup=self.df[self.df.business_id==b1].user_id.count()
                    self.database_sim[i1][i1]=1.
                    self.database_sup[i1][i1]=nsup
    def get(self, b1, b2):
        "returns a tuple of (similarity, common_support) given two business ids"
        sim=self.database_sim[self.uniquebizids[b1]][self.uniquebizids[b2]]
        nsup=self.database_sup[self.uniquebizids[b1]][self.uniquebizids[b2]]
        return (sim, nsup)
In [32]:
db=Database(smalldf)
db.populate_by_calculating(pearson_sim)
In [ ]:
def shrunk_sim(sim, n_common, reg=3.):
    "takes a similarity and shrinks it down by using the regularizer"
    ssim=(n_common*sim)/(n_common+reg)
    return ssim
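A quick numeric check of the shrinkage (the similarity values are made up):
In [ ]:
# a perfect similarity backed by only 2 common reviewers is damped hard,
# while one backed by 20 reviewers is barely touched
print shrunk_sim(1.0, 2)    # (2*1.0)/(2+3) = 0.4
print shrunk_sim(1.0, 20)   # (20*1.0)/(20+3) ~ 0.87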
In [ ]:
"""
Function
--------
knearest
Parameters
----------
restaurant_id : string
The id of the restaurant whose nearest neighbors we want
set_of_restaurants : array
The set of restaurants from which we want to find the nearest neighbors
dbase : instance of Database class.
A database of similarities, on which the get method can be used to get the similarity
of two businesses, e.g. dbase.get(rid1,rid2)
k : int
the number of nearest neighbors desired, default 7
reg: float
the regularization.
Returns
--------
A sorted list
of the top k similar restaurants. The list is a list of tuples
(business_id, shrunken similarity, common support).
"""
#your code here
from operator import itemgetter
def knearest(restaurant_id, set_of_restaurants, dbase, k=7, reg=3.):
    """
    Given a restaurant_id, dataframe, and database, get a sorted list of the
    k most similar restaurants from the entire database.
    """
    similars=[]
    for other_rest_id in set_of_restaurants:
        if other_rest_id!=restaurant_id:
            sim, nc=dbase.get(restaurant_id, other_rest_id)
            ssim=shrunk_sim(sim, nc, reg=reg)
            similars.append((other_rest_id, ssim, nc))
    similars=sorted(similars, key=itemgetter(1), reverse=True)
    return similars[0:k]
In [ ]:
testbizid = 'b5cEoKR8iQliq-yT2_O0LQ'
tops=knearest(testbizid, smalldf.business_id.unique(), db, k=7, reg=3.)
In [ ]:
tops
In [ ]:
# scratch: experimenting with list slicing
x = [1,2,3,4,5]
In [ ]:
x[1:2] + x[3::]
1+1
In [ ]:
missing_none_df = TestMatrix.iloc[missing_none, :]
missing_none_df.head(2)
In [ ]:
R = TrainMatrix[TrainMatrix.business_id.isin(missing_none_df.business_id.values)]
R = R[R.user_id.isin(missing_none_df.user_id.values)]
print len(R.index)
R.head(2)
In [ ]:
R = R.pivot(index = 'business_id', columns= 'user_id', values = 'rev_stars')
R_bool = R.notnull().astype(int)  # 1 where a rating exists, 0 where missing
R.fillna(value=0, inplace=True)
R_matrix = np.matrix(R.values)
R_matrix
In [ ]:
import scipy.io
# export the ratings matrix for the collaborative filtering step outside this notebook
scipy.io.savemat('R_matrix.mat', mdict={'Y': R_matrix})
In [ ]:
# read the matrix back in (presumably after the external CF code has overwritten R_matrix.mat with predictions)
results = scipy.io.loadmat('R_matrix.mat')
In [ ]:
# look up each held-out review's predicted rating in the reloaded matrix
Y_pred = results['Y']
biz_pos = {b: i for (i, b) in enumerate(R.index)}
user_pos = {u: j for (j, u) in enumerate(R.columns)}
predictions = {}
for _, row in missing_none_df.iterrows():
    predictions[row['review_id']] = Y_pred[biz_pos[row['business_id']], user_pos[row['user_id']]]
pd.DataFrame(predictions.items(), columns=['review_id', 'rev_stars']).to_csv('cf.csv', index=False)