Code for visualizations and playing around


In [1]:
# libraries and stuff to make plots look shiny
%matplotlib inline
from collections import defaultdict
import json

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
# Seven RGB triples (components in 0..1) used as the default plot colors below.
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

# Global matplotlib defaults for every figure drawn in this notebook.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# NOTE(review): 'axes.color_cycle' is the pre-1.5 matplotlib key; newer releases
# use 'axes.prop_cycle' instead -- confirm the matplotlib version this targets.
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Reduce chartjunk by hiding unwanted plot borders (spines) and tick marks.

    Each of the top/right/left/bottom keywords says whether that side's border
    is drawn; ticks are re-enabled only on the sides that stay visible.
    When `axes` is omitted (or falsy) the current pyplot axes are used.
    """
    ax = axes or plt.gca()

    sides = (('top', top), ('right', right), ('left', left), ('bottom', bottom))
    for side, shown in sides:
        ax.spines[side].set_visible(shown)

    # start from no ticks at all on either axis ...
    for axis in (ax.yaxis, ax.xaxis):
        axis.set_ticks_position('none')

    # ... then switch them back on for every side that is still drawn
    tick_enablers = ((top, ax.xaxis.tick_top),
                     (bottom, ax.xaxis.tick_bottom),
                     (left, ax.yaxis.tick_left),
                     (right, ax.yaxis.tick_right))
    for shown, enable_ticks in tick_enablers:
        if shown:
            enable_ticks()
        
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

In [2]:
# loads the pickle files
# Six pickled objects are read from the working directory: reviews, businesses
# and users, each for the training and the test split.
# NOTE(review): unpickling can execute arbitrary code -- only load .pkl files
# that were produced by a trusted step of this project.
trainingreviews = pd.io.pickle.read_pickle('./trainingreviews.pkl')
trainingbusiness = pd.io.pickle.read_pickle('./trainingbusiness.pkl')
trainingusers = pd.io.pickle.read_pickle('./trainingusers.pkl')

testreviews = pd.io.pickle.read_pickle('./testreviews.pkl')
testbusiness = pd.io.pickle.read_pickle('./testbusiness.pkl')
testusers = pd.io.pickle.read_pickle('./testusers.pkl')

In [3]:
print 'Total number of reviews in training: ' + str(len(trainingreviews.index)) 
print 'Total number of reviews in test: ' + str(len(testreviews.index))
trainingusers.head(2)


Total number of reviews in training: 229907
Total number of reviews in test: 36404
Out[3]:
user_average_stars user_name user_review_count user_type user_id user_votes gender cool useful funny
0 5 Jim 6 user CR2y7yEm4X035ZMzrTtN9Q {u'funny': 0, u'useful': 7, u'cool': 0} -1 0 7 0
1 1 Kelle 2 user _9GXoHhdxc30ujPaQwh6Ew {u'funny': 0, u'useful': 1, u'cool': 0} 1 0 1 0

In [4]:
business = trainingbusiness.append(testbusiness)
business.tail(2)


Out[4]:
bus_categories bus_city bus_full_address bus_latitude bus_longitude bus_name bus_neighborhoods bus_open bus_review_count bus_stars bus_state bus_type business_id
2795 [Banks & Credit Unions, Financial Services] Chandler 1949 W Ray Rd Ste 34\nChandler, AZ 85224 33.319395 -111.874965 Desert Schools Federal Credit Union [] True 3 NaN AZ business IpiCTbW1u04ytps6tFF4QQ
2796 [Mexican, Restaurants] Phoenix 3400 E Sky Harbor Blvd\nPhoenix, AZ 85034 33.436527 -111.998996 Barrio Cafe [] True 73 NaN AZ business LbcDWyqGgQdlwWfGO-whMw

In [5]:
# performs the merge
business = trainingbusiness.append(testbusiness)
users = trainingusers.append(testusers)

TrainMatrix = trainingreviews.merge(business,on="business_id")
TrainMatrix = TrainMatrix.merge(users,on="user_id", how='left')
TestMatrix = testreviews.merge(business,on="business_id", how='left')
TestMatrix = TestMatrix.merge(users,on="user_id", how='left')
print 'Training DF length: ' + str(len(TrainMatrix.index))
print 'Test DF length: ' + str(len(TestMatrix.index))
TrainMatrix.tail(2)


Training DF length: 229907
Test DF length: 36404
Out[5]:
business_id rev_date review_id rev_stars rev_text rev_type user_id rev_votes bus_categories bus_city bus_full_address bus_latitude bus_longitude bus_name bus_neighborhoods bus_open bus_review_count bus_stars bus_state bus_type cool funny gender useful user_average_stars user_name user_review_count user_type user_votes
229905 x59g-quKyKqh7VDAWhV-vQ 2008-05-26 T39VKXR2UWzDMniI2dpl3A 5 Home of Randi Rhodes and Mike Molloy progressi... review l1eXX3p2WL_02FWw5TXhsA {u'funny': 1, u'useful': 3, u'cool': 2} [Mass Media, Radio Stations] Phoenix Phoenix, AZ 85034 33.438279 -112.017829 Nova M Radio Network [] False 3 4.5 AZ business 2780 1687 -1 3393 3.81 David 805 user {u'funny': 1687, u'useful': 3393, u'cool': 2780}
229906 qsZpOYEttt8spp4n7YWZyQ 2012-08-21 Q-W-NxtCTI3pnKcELJXI8w 5 This shop was recommended by Ric's Auto Body, ... review Lexfy0u3R9y9qrJo9qOywg {u'funny': 0, u'useful': 0, u'cool': 0} [Automotive, Auto Parts & Supplies] Scottsdale 8260 E Raintree Dr\nSte 2\nScottsdale, AZ 85260 33.619639 -111.902600 Unique Upholstery [] True 3 4.5 AZ business 1 0 -1 4 4.20 Ian 5 user {u'funny': 0, u'useful': 4, u'cool': 1}

In [6]:
urc=TrainMatrix.groupby('user_id').review_id.count()
ax=urc.hist(bins=50, log=True)
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Review Count per User");



In [7]:
# Histogram (log-scaled counts) of how many reviews each business received.
# The grouping key is business_id, so the axis label and title name businesses.
urc=TrainMatrix.groupby('business_id').review_id.count()
ax=urc.hist(bins=50, log=True)
remove_border(ax)
plt.xlabel("Reviews per business")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Review Count per Business");



In [8]:
print "Mean stars over all reviews:",TrainMatrix.rev_stars.mean()
stars=TrainMatrix.rev_stars
ax=stars.hist(bins=5)
remove_border(ax)
plt.xlabel("Star rating")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Star ratings over all reviews");


Mean stars over all reviews: 3.76672306628

In [9]:
# testing out the code bits I worked on earlier-
# Split the rows of TestMatrix by which side of the merge is missing:
# a null bus_stars marks a missing business, a null user_average_stars a missing user.
# df_index holds index *labels*, while business_index / user_index hold row
# *positions* (from enumerate). The set arithmetic below mixes the two, so it is
# only correct while TestMatrix's labels coincide with its row positions.
df_index = TestMatrix.index.values.tolist()
business_index = pd.isnull(TestMatrix['bus_stars']).tolist()
business_index = [i for i, elem in enumerate(business_index) if elem]
user_index = pd.isnull(TestMatrix['user_average_stars']).tolist()
user_index = [i for i, elem in enumerate(user_index) if elem]

# finds the indices depending on what is missing
# (list(set(...)) ordering is arbitrary; the lists are used for lengths and iloc lookups)
missing_both_index = list(set(business_index) & set(user_index))
missing_user_index = list(set(user_index) - set(business_index))
missing_business_index = list(set(business_index) - set(user_index))
missing_none = list(set(df_index) - set(business_index) - set(user_index))

In [10]:
missing_user = TestMatrix.iloc[missing_user_index, :]
missing_user.head(2)


Out[10]:
business_id review_id rev_type user_id bus_categories bus_city bus_full_address bus_latitude bus_longitude bus_name bus_neighborhoods bus_open bus_review_count bus_stars bus_state bus_type cool funny gender useful user_average_stars user_name user_review_count user_type user_votes
32770 7QSYBp2-AOdyUJXEaLnbgA T8CXO1Ct0FNsMmWVRJGQEQ review ngQAmiYfy9QWSrSIH2gXtw [Steakhouses, American (New), Restaurants] Scottsdale 3821 N Scottsdale Rd\nScottsdale, AZ 85251 33.492381 -111.925947 Bandera [] True 188 4 AZ business NaN NaN NaN NaN NaN Mike 3 user NaN
32771 7QSYBp2-AOdyUJXEaLnbgA FbsL_xluHuqgMps2w7weug review NpIuE6NDGhBET1_Q1j3rXQ [Steakhouses, American (New), Restaurants] Scottsdale 3821 N Scottsdale Rd\nScottsdale, AZ 85251 33.492381 -111.925947 Bandera [] True 188 4 AZ business NaN NaN NaN NaN NaN Jenny 2 user NaN

In [58]:
# visualize the percentage of the missing data
# This is the greatest bar chart you've ever seen in your life
percentages = np.array([len(missing_none), len(missing_business_index), 
                        len(missing_user_index), len(missing_both_index)])/float(len(df_index)) * 100
width = 0.4
ind = np.array(range(1,5))
plt.rc('font', family='sans-serif') 
labels = ['Has all','Missing businesses','Missing users', 'Missing both']

barlist = plt.bar(ind, percentages)
plt.grid(axis='y', color='white', linestyle='-', lw=1)
plt.yticks([10,20,30,40])
plt.ylabel('Percentage')
plt.title('Missing Data Breakdown')

fmt = plt.ScalarFormatter(useOffset=False)
plt.gca().xaxis.set_major_formatter(fmt)
plt.xlim(0.8, 5.2)
plt.xticks(ind+width, labels )
remove_border()

for x, y in zip(ind, percentages):
    plt.annotate(("%.2f" + '%%') % y, (x+width, y + 0.8), ha='center')
barlist[0].set_color((0.20588235294117647, 0.7196078431372549, 0.5666666666666667))
barlist[3].set_color((0.20588235294117647, 0.5196078431372549, 0.4666666666666667))
plt.savefig('Missing_data.eps')



In [12]:
"""
takes a dataframe ldf, makes a copy of it, and returns the copy
with all averages and review counts recomputed
this is used when a frame is subsetted.
"""
def recompute_frame(ldf):
    """
    Return a copy of ldf whose averages and review counts are recomputed
    from the rows actually present in ldf (use after subsetting a frame).

    The copy gets business_avg / business_review_count (per business_id) and
    user_avg / user_review_count (per user_id); a column that already exists
    under one of those names is overwritten. The result has a fresh 0..n-1
    index with user_id and business_id as its first two columns. ldf itself
    is left untouched.
    """
    per_user = ldf.groupby('user_id')
    per_business = ldf.groupby('business_id')

    business_columns = [('business_avg', per_business.rev_stars.mean()),
                        ('business_review_count', per_business.review_id.count())]
    user_columns = [('user_avg', per_user.rev_stars.mean()),
                    ('user_review_count', per_user.review_id.count())]

    nldf = ldf.copy()
    for key, columns in (('business_id', business_columns), ('user_id', user_columns)):
        # index by the grouping key so each per-group statistic is broadcast
        # onto every row that belongs to that group
        nldf = nldf.set_index([key])
        for name, values in columns:
            nldf[name] = values
        nldf = nldf.reset_index()
    return nldf

In [13]:
# creates a smaller dataframe for users with small reviews and businesses with a small amount of reviews
# these values are currently the min for the smaller df to work
# NOTE(review): only the business threshold is applied below; user_theshold is
# referenced solely by the commented-out variant, so users are not filtered here.
user_theshold = 2
business_theshold = 4
# smallidf=TrainMatrix[(TrainMatrix.user_review_count < user_theshold) & (TrainMatrix.bus_review_count < business_theshold)]
# keep reviews of businesses whose bus_review_count is below the threshold ...
smallidf=TrainMatrix[(TrainMatrix.bus_review_count < business_theshold)]
# ... and recompute the per-user / per-business statistics on that subset
smalldf=recompute_frame(smallidf)

In [14]:
# Derpy plot
print "Total Number of Reviews", smalldf.shape[0]
print "Users in this set", smalldf.user_id.unique().shape[0], "Restaurants",smalldf.business_id.unique().shape[0]
plt.figure()
ax=smalldf.groupby('user_id').review_id.count().hist()
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.figure()
ax=smalldf.groupby('business_id').review_id.count().hist()
remove_border(ax)
plt.xlabel("Reviews per restaurant")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')


Total Number of Reviews 7593
Users in this set 5120 Restaurants 2531

In [15]:
plt.figure()
avg_ratings_by_user=smalldf.groupby('user_id').rev_stars.mean()
ax=avg_ratings_by_user.hist()
remove_border(ax)
plt.xlabel("Average review score")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Average User Rating")
plt.figure()

avg_ratings_by_biz=smalldf.groupby('business_id').rev_stars.mean()
ax=avg_ratings_by_biz.hist()
remove_border(ax)
plt.xlabel("Average review score")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Average Restaurant Rating")
plt.figure()

print smalldf.rev_stars.mean()
plt.figure()


3.63400500461
Out[15]:
<matplotlib.figure.Figure at 0x10ab53e90>
<matplotlib.figure.Figure at 0x10a9db650>
<matplotlib.figure.Figure at 0x10ab53e90>

In [59]:
# make plot for final results
# Grouped bar chart: one group per model, one bar per missing-data approach.
N = 6
fullRMSE = [1.404,1.394,1.310,1.341,1.435, 1.446]
splitRMSE = [1.332,1.338,1.306,1.300,1.431, 1.422]
randRMSE = [1.434,1.412,1.379,1.377,1.453, 1.474]

ind = np.arange(N)  # the x locations for the groups
width = 0.275       # the width of the bars

plt.rc('font', family='sans-serif') 
plt.rcParams.update({'font.size': 10})

fig, ax = plt.subplots()
rects1 = ax.bar(ind, fullRMSE, width, color=dark2_colors[1])
rects2 = ax.bar(ind+width, splitRMSE, width, color=dark2_colors[0])
rects3 = ax.bar(ind+2*width, randRMSE, width, color=dark2_colors[2])

# add some text for labels, title and axes ticks
ax.set_ylabel('RMSE')
ax.set_title('RMSE by Missing Data Approach and Models')
ax.set_xticks(ind+width+0.125)
ax.set_xticklabels(('Linear Regression', 'Ridge Regression', 'Lasso', 'Elastic Net', 'Random Forest', 'Fact. Machine') )
# This is the only legend call. A later argument-less plt.legend() finds no
# labelled artists and, on newer matplotlib, replaces this legend with an empty one.
# NOTE(review): rects2 draws splitRMSE but is labelled 'Random values' and rects3
# draws randRMSE but is labelled 'Predicted values' -- confirm the label order.
leg = ax.legend((rects1[0], rects2[0], rects3[0]), ('Mean', 'Random values','Predicted values'), fontsize = 'small', frameon = False)
# ax.legend((rects1[0], rects2[0], rects3[0]), ('Mean', 'Random values','Predicted values'),bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0. )

def autolabel(rects):
    "write each bar's height (3 decimals) centred just above the bar, on the module-level ax"
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., height, '%.3f'%height,
                ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)
plt.grid(axis = 'y', color ='white', linestyle='-')
# zoom the y axis onto the range the RMSE values occupy
plt.ylim(1.2,1.55)
remove_border()
plt.savefig('Results.eps')



In [60]:
# visualize the 3 methods  
percentages = np.array([1.300, 1.312, 1.345, 1.258 ])
width = 0.4
ind = np.array(range(1,5))
plt.rc('font', family='sans-serif') 
plt.rcParams.update({'font.size': 14})
labels = ['No change','Collaborative Filtering','PCA','Segmentation Ensemble']

barlist = plt.bar(ind, percentages)
plt.grid(axis='y', color='white', linestyle='-', lw=1)
plt.ylabel('RMSE')

fmt = plt.ScalarFormatter(useOffset=False)
plt.gca().xaxis.set_major_formatter(fmt)
plt.xlim(0.8, 5.2)
plt.xticks(ind+width, labels)
plt.ylim(1.2, 1.4)
plt.title('Best Model + Other Techniques')
remove_border()

for x, y in zip(ind, percentages):
    plt.annotate(("%.3f") % y, (x+width, y + 0.0025), ha='center')
plt.savefig('Results2.eps')



In [18]:
def not_so_quick_train(block):
    """
    Build the design matrix X and the target column vector Y from a merged
    review frame.

    Features, in column order: user_average_stars, bus_open, bus_stars,
    bus_review_count, user_review_count. The target is rev_stars.
    Infinite values are turned into NaN and every NaN is then replaced by 1.
    The input frame is not modified.

    Returns
    -------
    (X, Y) : tuple of np.matrix
        X has one row per review and one column per feature; Y has one row
        per review and a single column.
    """
    feature_columns = ['user_average_stars', 'bus_open', 'bus_stars',
                       'bus_review_count', 'user_review_count']
    # replace() and fillna() return new frames, so their result has to be kept;
    # calling them without using the result leaves the inf/NaN values in place.
    cleaned = (block[feature_columns + ['rev_stars']]
               .replace([np.inf, -np.inf], np.nan)
               .fillna(value=1))
    # one array per feature, stacked as rows and transposed into columns
    features = [cleaned[name].values for name in feature_columns]
    X = np.matrix(features).T
    Y = np.matrix(cleaned['rev_stars'].values).T
    return X,Y

In [19]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Try the current location first
# and fall back to the old module on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# feature matrix and target vector for the full training frame
X, Y = not_so_quick_train(TrainMatrix)

In [20]:
f = np.isinf(X)
for i in f:
    print i
    break


[[False False False False False]]

In [21]:
X = np.nan_to_num(X)
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

In [22]:
from sklearn import cross_validation, linear_model
clf = linear_model.LinearRegression().fit(xtrain, ytrain)
print "RMSE: %.2f" % np.sqrt(np.mean((clf.predict(xtest) - ytest) ** 2))


RMSE: 1.05

In [23]:
TestMatrix.head(2)


Out[23]:
business_id review_id rev_type user_id bus_categories bus_city bus_full_address bus_latitude bus_longitude bus_name bus_neighborhoods bus_open bus_review_count bus_stars bus_state bus_type cool funny gender useful user_average_stars user_name user_review_count user_type user_votes
0 -sC66z4SO3tR7nFCjfQwuQ Wv-4SQr9UUztIBnjzHu9-g review dqeFcKq2L2wiOg9LFT9-UA [Mexican, Restaurants] Phoenix 401 W Clarendon Ave\nPhoenix, AZ 85013 33.491120 -112.079081 Gallo Blanco Cafe [] True 549 4.0 AZ business 6 8 -1 20 4.05 Ty 63 user {u'funny': 8, u'useful': 20, u'cool': 6}
1 qw93CjlAZ6a4ff11Z-hF3Q ZN-kVEwrIQouWyLp-d6R5A review dqeFcKq2L2wiOg9LFT9-UA [Arts & Entertainment, Nightlife, Music Venues] Phoenix 308 N 2nd Ave\nPhoenix, AZ 85003 33.451716 -112.076437 Crescent Ballroom [] True 145 4.5 AZ business 6 8 -1 20 4.05 Ty 63 user {u'funny': 8, u'useful': 20, u'cool': 6}

In [24]:
TestMatrix.bus_full_address = [v[-5:] for v in TestMatrix.bus_full_address.values]
TestMatrix.head(2)


Out[24]:
business_id review_id rev_type user_id bus_categories bus_city bus_full_address bus_latitude bus_longitude bus_name bus_neighborhoods bus_open bus_review_count bus_stars bus_state bus_type cool funny gender useful user_average_stars user_name user_review_count user_type user_votes
0 -sC66z4SO3tR7nFCjfQwuQ Wv-4SQr9UUztIBnjzHu9-g review dqeFcKq2L2wiOg9LFT9-UA [Mexican, Restaurants] Phoenix 85013 33.491120 -112.079081 Gallo Blanco Cafe [] True 549 4.0 AZ business 6 8 -1 20 4.05 Ty 63 user {u'funny': 8, u'useful': 20, u'cool': 6}
1 qw93CjlAZ6a4ff11Z-hF3Q ZN-kVEwrIQouWyLp-d6R5A review dqeFcKq2L2wiOg9LFT9-UA [Arts & Entertainment, Nightlife, Music Venues] Phoenix 85003 33.451716 -112.076437 Crescent Ballroom [] True 145 4.5 AZ business 6 8 -1 20 4.05 Ty 63 user {u'funny': 8, u'useful': 20, u'cool': 6}

In [25]:
# Mean of every aggregated column per bus_full_address group (at this point
# that column holds only the last five characters of the address).
t = TestMatrix.groupby('bus_full_address').aggregate(np.mean)
# fillna returns a new frame, so the result must be assigned back for the
# fill to take effect.
# NOTE(review): 3.67 is presumably a global average star rating -- confirm.
t = t.fillna(3.67)
t.head(2)


Out[25]:
bus_latitude bus_longitude bus_open bus_review_count bus_stars cool funny gender useful user_average_stars user_review_count
bus_full_address
85003 33.455410 -112.077656 0.996870 195.846635 3.936929 168.607595 135.949367 0.123418 239.664557 3.750158 61.397727
85004 33.455338 -112.071558 0.983278 154.765329 3.832871 208.756571 158.617143 0.093714 301.728000 3.814171 77.067291

In [26]:
# START CF
TestMatrix2 = testreviews.merge(trainingbusiness,on="business_id")
TestMatrix2 = TestMatrix2.merge(trainingusers,on="user_id")
TestMatrix2.head(2)


Out[26]:
business_id review_id rev_type user_id bus_categories bus_city bus_full_address bus_latitude bus_longitude bus_name bus_neighborhoods bus_open bus_review_count bus_stars bus_state bus_type user_average_stars user_name user_review_count user_type user_votes gender cool useful funny
0 -sC66z4SO3tR7nFCjfQwuQ Wv-4SQr9UUztIBnjzHu9-g review dqeFcKq2L2wiOg9LFT9-UA [Mexican, Restaurants] Phoenix 401 W Clarendon Ave\nPhoenix, AZ 85013 33.491120 -112.079081 Gallo Blanco Cafe [] True 549 4.0 AZ business 4.05 Ty 63 user {u'funny': 8, u'useful': 20, u'cool': 6} -1 6 20 8
1 qw93CjlAZ6a4ff11Z-hF3Q ZN-kVEwrIQouWyLp-d6R5A review dqeFcKq2L2wiOg9LFT9-UA [Arts & Entertainment, Nightlife, Music Venues] Phoenix 308 N 2nd Ave\nPhoenix, AZ 85003 33.451716 -112.076437 Crescent Ballroom [] True 145 4.5 AZ business 4.05 Ty 63 user {u'funny': 8, u'useful': 20, u'cool': 6} -1 6 20 8

In [27]:
smallidf=TrainMatrix[(TrainMatrix.user_review_count > 80) & (TrainMatrix.bus_review_count > 200)]
smalldf=recompute_frame(smallidf)
print len(smalldf.index)
smalldf.head(2)


13510
Out[27]:
user_id business_id rev_date review_id rev_stars rev_text rev_type rev_votes bus_categories bus_city bus_full_address bus_latitude bus_longitude bus_name bus_neighborhoods bus_open bus_review_count bus_stars bus_state bus_type cool funny gender useful user_average_stars user_name user_review_count user_type user_votes business_avg business_review_count user_avg
0 rLtl8ZkDX5vH5nAx9C3q5Q b5cEoKR8iQliq-yT2_O0LQ 2010-05-05 j67R8BK-_IIgH18TX0U4Kw 3 I went here last night with a party of 4. It ... review {u'funny': 0, u'useful': 0, u'cool': 0} [Nightlife, Breakfast & Brunch, Mexican, Bars,... Scottsdale 3313 N Hayden Rd\nScottsdale, AZ 85251 33.486918 -111.908738 Carlsbad Tavern [] True 262 4 AZ business 322 331 -1 1034 3.72 Jason 49 user {u'funny': 331, u'useful': 1034, u'cool': 322} 3.572917 96 3.714286
1 rLtl8ZkDX5vH5nAx9C3q5Q qjmCVYkwP-HDa35jwYucbQ 2009-02-13 dWNb4oNditcsD1_IsEHBag 4 This place is very interesting and there is no... review {u'funny': 0, u'useful': 0, u'cool': 0} [American (New), Restaurants] Phoenix 2502 E Camelback Rd\nPhoenix, AZ 85016 33.510207 -112.027204 True Food Kitchen [] True 434 4 AZ business 322 331 -1 1034 3.72 Jason 49 user {u'funny': 331, u'useful': 1034, u'cool': 322} 3.744000 125 3.714286

In [28]:
from scipy.stats import pearsonr

def _centered_ratings(reviews):
    "rev_stars minus user_average_stars as an array, ordered by user_id when that column exists"
    centered = (reviews['rev_stars'] - reviews['user_average_stars']).values
    if 'user_id' in reviews:
        # stable sort so both restaurants' ratings end up in the same user order
        order = np.argsort(reviews['user_id'].values, kind='mergesort')
        centered = centered[order]
    return centered

def pearson_sim(rest1_reviews, rest2_reviews, n_common):
    """
    Given a subframe of restaurant 1 reviews and a subframe of restaurant 2 reviews,
    where the reviewers are those who have reviewed both restaurants, return
    the pearson correlation coefficient between the user average subtracted ratings.

    The two subframes may list the common reviewers in different row orders, so
    the ratings are paired by user_id (when that column is present) instead of
    by row position.

    The case for zero common reviewers is handled separately and yields 0.
    NaN is returned when the correlation is undefined: fewer than two paired
    ratings, or zero variance in either set of ratings.
    """
    if n_common==0:
        return 0.
    diff1=_centered_ratings(rest1_reviews)
    diff2=_centered_ratings(rest2_reviews)
    if len(diff1) < 2 or len(diff2) < 2:
        # a correlation needs at least two points; newer SciPy raises here
        return float('nan')
    return pearsonr(diff1, diff2)[0]

In [29]:
def get_restaurant_reviews(restaurant_id, df, set_of_users):
    """
    Return the rows of df that are reviews of restaurant_id written by a user
    in set_of_users, keeping only the first such row for each user.
    """
    at_restaurant = df.business_id == restaurant_id
    by_wanted_user = df.user_id.isin(set_of_users)
    selected = df[by_wanted_user & at_restaurant]
    # drop any repeat review by the same user, keeping the first occurrence
    first_per_user = ~selected.user_id.duplicated()
    return selected[first_per_user]

In [30]:
"""
Function
--------
calculate_similarity

Parameters
----------
rest1 : string
    The id of restaurant 1
rest2 : string
    The id of restaurant 2
df : DataFrame
  A dataframe of reviews, such as the smalldf above
similarity_func : func
  A function like pearson_sim above which takes two dataframes of individual
  restaurant reviews made by a common set of reviewers, and the number of
  common reviews. This function returns the similarity of the two restaurants
  based on the common reviews.
  
Returns
--------
A tuple
  The first element of the tuple is the similarity and the second the
  common support n_common. If the similarity is a NaN, set it to 0
"""
def calculate_similarity(rest1, rest2, df, similarity_func):
    """
    Similarity of two restaurants based on the users who reviewed both.

    Returns a tuple (similarity, n_common): similarity_func is applied to the
    two restaurants' reviews restricted to their common reviewers, and
    n_common is the number of those reviewers. A NaN similarity is reported as 0.
    """
    # users who reviewed each restaurant, then the overlap
    reviewers = [set(df[df.business_id == rest].user_id.unique())
                 for rest in (rest1, rest2)]
    common_reviewers = reviewers[0] & reviewers[1]
    n_common = len(common_reviewers)

    # each restaurant's reviews restricted to the shared reviewers
    rest1_reviews, rest2_reviews = [
        get_restaurant_reviews(rest, df, common_reviewers)
        for rest in (rest1, rest2)
    ]
    similarity = similarity_func(rest1_reviews, rest2_reviews, n_common)
    return (0 if np.isnan(similarity) else similarity), n_common

In [31]:
class Database:
    "A class representing a database of similarities and common supports"

    def __init__(self, df):
        """
        the constructor, takes a reviews dataframe like smalldf as its argument

        Allocates two square matrices with one row and column per distinct
        business_id in df: database_sim (floats, similarities) and
        database_sup (ints, common supports), both initially zero.
        """
        self.df=df
        # business_id -> row/column position in the two matrices
        self.uniquebizids={v:k for (k,v) in enumerate(df.business_id.unique())}
        l_keys=len(self.uniquebizids)
        self.database_sim=np.zeros([l_keys,l_keys])
        # builtin int: np.int was only an alias for it and no longer exists in NumPy
        self.database_sup=np.zeros([l_keys, l_keys], dtype=int)

    def populate_by_calculating(self, similarity_func):
        """
        a populator for every pair of businesses in df. takes similarity_func like
        pearson_sim as argument

        Each unordered pair is computed once and stored symmetrically. The
        diagonal holds similarity 1 and the business's own review count.
        """
        items=self.uniquebizids.items()
        for b1, i1 in items:
            for b2, i2 in items:
                if i1 < i2:
                    sim, nsup=calculate_similarity(b1, b2, self.df, similarity_func)
                    self.database_sim[i1, i2]=sim
                    self.database_sim[i2, i1]=sim
                    self.database_sup[i1, i2]=nsup
                    self.database_sup[i2, i1]=nsup
                elif i1==i2:
                    nsup=self.df[self.df.business_id==b1].user_id.count()
                    self.database_sim[i1, i1]=1.
                    self.database_sup[i1, i1]=nsup

    def get(self, b1, b2):
        "returns a tuple of similarity,common_support given two business ids"
        i1=self.uniquebizids[b1]
        i2=self.uniquebizids[b2]
        return (self.database_sim[i1, i2], self.database_sup[i1, i2])

In [32]:
db=Database(smalldf)
db.populate_by_calculating(pearson_sim)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-32-6c257d8d42ad> in <module>()
      1 db=Database(smalldf)
----> 2 db.populate_by_calculating(pearson_sim)

<ipython-input-31-7292897fbdd7> in populate_by_calculating(self, similarity_func)
     21             for b2, i2 in items:
     22                 if i1 < i2:
---> 23                     sim, nsup=calculate_similarity(b1, b2, self.df, similarity_func)
     24                     self.database_sim[i1][i2]=sim
     25                     self.database_sim[i2][i1]=sim

<ipython-input-30-b976b1b42a7d> in calculate_similarity(rest1, rest2, df, similarity_func)
     27 def calculate_similarity(rest1, rest2, df, similarity_func):
     28     # find common reviewers
---> 29     rest1_reviewers = df[df.business_id==rest1].user_id.unique()
     30     rest2_reviewers = df[df.business_id==rest2].user_id.unique()
     31     common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers)

//anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in wrapper(self, other)
    562 
    563             # scalars
--> 564             res = na_op(values, other)
    565             if np.isscalar(res):
    566                 raise TypeError('Could not compare %s type with Series'

//anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in na_op(x, y)
    525                     result = lib.vec_compare(x, y, op)
    526             else:
--> 527                 result = lib.scalar_compare(x, y, op)
    528         else:
    529 

KeyboardInterrupt: 

In [ ]:
def shrunk_sim(sim, n_common, reg=3.):
    """
    Shrink a similarity towards zero according to its common support.

    Returns n_common * sim / (n_common + reg): the smaller n_common is
    relative to the regularizer reg, the more the similarity is damped.
    """
    weighted = n_common * sim
    damping = n_common + reg
    return weighted / damping

In [ ]:
"""
Function
--------
knearest

Parameters
----------
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
set_of_restaurants : array
    The set of restaurants from which we want to find the nearest neighbors
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get the similarity
  of two businessed. e.g. dbase.get(rid1,rid2)
k : int
    the number of nearest neighbors desired, default 7
reg: float
    the regularization.
    
  
Returns
--------
A sorted list
    of the top k similar restaurants. The list is a list of tuples
    (business_id, shrunken similarity, common support).
"""
from operator import itemgetter
def knearest(restaurant_id, set_of_restaurants, dbase, k=7, reg=3.):
    """
    Return the k restaurants from set_of_restaurants most similar to
    restaurant_id, as a list of (business_id, shrunken similarity, common
    support) tuples sorted by decreasing shrunken similarity.

    Similarities and supports are read from dbase.get and shrunk with
    shrunk_sim using the regularizer reg. restaurant_id itself is skipped.
    """
    candidates = (other for other in set_of_restaurants if other != restaurant_id)
    scored = []
    for other in candidates:
        similarity, support = dbase.get(restaurant_id, other)
        scored.append((other, shrunk_sim(similarity, support, reg=reg), support))
    # highest shrunken similarity first; ties keep their original relative order
    scored.sort(key=itemgetter(1), reverse=True)
    return scored[:k]

In [ ]:
testbizid = 'b5cEoKR8iQliq-yT2_O0LQ'
tops=knearest(testbizid, smalldf.business_id.unique(), db, k=7, reg=3.)

In [ ]:
tops

In [ ]:
x = [1,2,3,4,5]

In [ ]:
x[1:2] + x[3::]
1+1

In [ ]:
missing_none_df = TestMatrix.iloc[missing_none, :]
missing_none_df.head(2)

In [ ]:
R = TrainMatrix[TrainMatrix.business_id.isin(missing_none_df.business_id.values)]
R = R[TrainMatrix.user_id.isin(missing_none_df.user_id.values)]
print len(R.index)
R.head(2)

In [ ]:
R = R.pivot(index = 'business_id', columns= 'user_id', values = 'rev_stars')
R_bool = R.notnull()
R_bool[R_bool == True] = 1
R_bool[R_bool == False] = 0
R_bool.fillna(value=0, inplace=True)
R.fillna(value=0, inplace=True)
R_matrix = np.matrix(R.values)
R_matrix

In [ ]:
import scipy.io
scipy.io.savemat('R_matrix.mat', mdict={'Y': R_matrix})

In [ ]:
results = scipy.io.loadmat('R_matrix.mat')

In [ ]:
# Look up the collaborative-filtering value for every fully-known test review
# and write them to cf.csv.
# results['Y'] is the matrix stored under key 'Y' in R_matrix.mat. It is read
# here with the layout of the pivoted R frame: one row per business_id
# (R.index) and one column per user_id (R.columns).
# NOTE(review): assumes R is still that pivoted frame and that the .mat file
# holds predictions in the same layout -- confirm against the step that writes it.
ratings = results['Y']
row_of = dict((bid, pos) for pos, bid in enumerate(R.index))
col_of = dict((uid, pos) for pos, uid in enumerate(R.columns))

predictions = {}
# iterrows yields one review per step; iterating the frame itself would only
# yield its column names
for _, review in missing_none_df.iterrows():
    row = row_of.get(review['business_id'])
    col = col_of.get(review['user_id'])
    if row is None or col is None:
        # this (business, user) pair has no cell in the matrix
        continue
    predictions[review['review_id']] = ratings[row, col]

# a dict of scalars cannot be passed to pd.DataFrame directly, so build the
# two columns explicitly
review_ids = list(predictions.keys())
pd.DataFrame({'review_id': review_ids,
              'stars': [predictions[rid] for rid in review_ids]}).to_csv('cf.csv', index = False)

In [ ]:


In [ ]: