Code for visualizations and playing around



In [1]:

    
# libraries and stuff to make plots look shiny
%matplotlib inline
from collections import defaultdict
import json

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)



In [2]:

    
# loads the pickle files
trainingreviews = pd.io.pickle.read_pickle('./trainingreviews.pkl')
trainingbusiness = pd.io.pickle.read_pickle('./trainingbusiness.pkl')
trainingusers = pd.io.pickle.read_pickle('./trainingusers.pkl')

testreviews = pd.io.pickle.read_pickle('./testreviews.pkl')
testbusiness = pd.io.pickle.read_pickle('./testbusiness.pkl')
testusers = pd.io.pickle.read_pickle('./testusers.pkl')



In [3]:

    
print 'Total number of reviews in training: ' + str(len(trainingreviews.index)) 
print 'Total number of reviews in test: ' + str(len(testreviews.index))
trainingusers.head(2)









    



Total number of reviews in training: 229907
Total number of reviews in test: 36404






    Out[3]:






  
    
      
      user_average_stars
      user_name
      user_review_count
      user_type
      user_id
      user_votes
      gender
      cool
      useful
      funny
    
  
  
    
      0
       5
         Jim
       6
       user
       CR2y7yEm4X035ZMzrTtN9Q
       {u'funny': 0, u'useful': 7, u'cool': 0}
      -1
       0
       7
       0
    
    
      1
       1
       Kelle
       2
       user
       _9GXoHhdxc30ujPaQwh6Ew
       {u'funny': 0, u'useful': 1, u'cool': 0}
       1
       0
       1
       0



In [4]:

    
business = trainingbusiness.append(testbusiness)
business.tail(2)









    Out[4]:






  
    
      
      bus_categories
      bus_city
      bus_full_address
      bus_latitude
      bus_longitude
      bus_name
      bus_neighborhoods
      bus_open
      bus_review_count
      bus_stars
      bus_state
      bus_type
      business_id
    
  
  
    
      2795
       [Banks & Credit Unions, Financial Services]
       Chandler
        1949 W Ray Rd Ste 34\nChandler, AZ 85224
       33.319395
      -111.874965
       Desert Schools Federal Credit Union
       []
       True
        3
      NaN
       AZ
       business
       IpiCTbW1u04ytps6tFF4QQ
    
    
      2796
                            [Mexican, Restaurants]
        Phoenix
       3400 E Sky Harbor Blvd\nPhoenix, AZ 85034
       33.436527
      -111.998996
                               Barrio Cafe
       []
       True
       73
      NaN
       AZ
       business
       LbcDWyqGgQdlwWfGO-whMw



In [5]:

    
# performs the merge
business = trainingbusiness.append(testbusiness)
users = trainingusers.append(testusers)

TrainMatrix = trainingreviews.merge(business,on="business_id")
TrainMatrix = TrainMatrix.merge(users,on="user_id", how='left')
TestMatrix = testreviews.merge(business,on="business_id", how='left')
TestMatrix = TestMatrix.merge(users,on="user_id", how='left')
print 'Training DF length: ' + str(len(TrainMatrix.index))
print 'Test DF length: ' + str(len(TestMatrix.index))
TrainMatrix.tail(2)









    



Training DF length: 229907
Test DF length: 36404






    Out[5]:






  
    
      
      business_id
      rev_date
      review_id
      rev_stars
      rev_text
      rev_type
      user_id
      rev_votes
      bus_categories
      bus_city
      bus_full_address
      bus_latitude
      bus_longitude
      bus_name
      bus_neighborhoods
      bus_open
      bus_review_count
      bus_stars
      bus_state
      bus_type
      cool
      funny
      gender
      useful
      user_average_stars
      user_name
      user_review_count
      user_type
      user_votes
    
  
  
    
      229905
       x59g-quKyKqh7VDAWhV-vQ
       2008-05-26
       T39VKXR2UWzDMniI2dpl3A
       5
       Home of Randi Rhodes and Mike Molloy progressi...
       review
       l1eXX3p2WL_02FWw5TXhsA
       {u'funny': 1, u'useful': 3, u'cool': 2}
              [Mass Media, Radio Stations]
          Phoenix
                                     Phoenix, AZ 85034
       33.438279
      -112.017829
       Nova M Radio Network
       []
       False
       3
       4.5
       AZ
       business
       2780
       1687
      -1
       3393
       3.81
       David
       805
       user
       {u'funny': 1687, u'useful': 3393, u'cool': 2780}
    
    
      229906
       qsZpOYEttt8spp4n7YWZyQ
       2012-08-21
       Q-W-NxtCTI3pnKcELJXI8w
       5
       This shop was recommended by Ric's Auto Body, ...
       review
       Lexfy0u3R9y9qrJo9qOywg
       {u'funny': 0, u'useful': 0, u'cool': 0}
       [Automotive, Auto Parts & Supplies]
       Scottsdale
       8260 E Raintree Dr\nSte 2\nScottsdale, AZ 85260
       33.619639
      -111.902600
          Unique Upholstery
       []
        True
       3
       4.5
       AZ
       business
          1
          0
      -1
          4
       4.20
         Ian
         5
       user
                {u'funny': 0, u'useful': 4, u'cool': 1}



In [6]:

    
urc=TrainMatrix.groupby('user_id').review_id.count()
ax=urc.hist(bins=50, log=True)
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Review Count per User");



In [7]:

    
urc=TrainMatrix.groupby('business_id').review_id.count()
ax=urc.hist(bins=50, log=True)
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Review Count per User");



In [8]:

    
print "Mean stars over all reviews:",TrainMatrix.rev_stars.mean()
stars=TrainMatrix.rev_stars
ax=stars.hist(bins=5)
remove_border(ax)
plt.xlabel("Star rating")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Star ratings over all reviews");









    



Mean stars over all reviews: 3.76672306628



In [9]:

    
# testing out the code bits I worked on earlier-
df_index = TestMatrix.index.values.tolist()
business_index = pd.isnull(TestMatrix['bus_stars']).tolist()
business_index = [i for i, elem in enumerate(business_index) if elem]
user_index = pd.isnull(TestMatrix['user_average_stars']).tolist()
user_index = [i for i, elem in enumerate(user_index) if elem]

# finds the indices depending on what is missing
missing_both_index = list(set(business_index) & set(user_index))
missing_user_index = list(set(user_index) - set(business_index))
missing_business_index = list(set(business_index) - set(user_index))
missing_none = list(set(df_index) - set(business_index) - set(user_index))



In [10]:

    
missing_user = TestMatrix.iloc[missing_user_index, :]
missing_user.head(2)









    Out[10]:






  
    
      
      business_id
      review_id
      rev_type
      user_id
      bus_categories
      bus_city
      bus_full_address
      bus_latitude
      bus_longitude
      bus_name
      bus_neighborhoods
      bus_open
      bus_review_count
      bus_stars
      bus_state
      bus_type
      cool
      funny
      gender
      useful
      user_average_stars
      user_name
      user_review_count
      user_type
      user_votes
    
  
  
    
      32770
       7QSYBp2-AOdyUJXEaLnbgA
       T8CXO1Ct0FNsMmWVRJGQEQ
       review
       ngQAmiYfy9QWSrSIH2gXtw
       [Steakhouses, American (New), Restaurants]
       Scottsdale
       3821 N Scottsdale Rd\nScottsdale, AZ 85251
       33.492381
      -111.925947
       Bandera
       []
       True
       188
       4
       AZ
       business
      NaN
      NaN
      NaN
      NaN
      NaN
        Mike
       3
       user
       NaN
    
    
      32771
       7QSYBp2-AOdyUJXEaLnbgA
       FbsL_xluHuqgMps2w7weug
       review
       NpIuE6NDGhBET1_Q1j3rXQ
       [Steakhouses, American (New), Restaurants]
       Scottsdale
       3821 N Scottsdale Rd\nScottsdale, AZ 85251
       33.492381
      -111.925947
       Bandera
       []
       True
       188
       4
       AZ
       business
      NaN
      NaN
      NaN
      NaN
      NaN
       Jenny
       2
       user
       NaN



In [58]:

    
# visualize the percentage of the missing data
# This is the greatest bar chart you've ever seen in your life
percentages = np.array([len(missing_none), len(missing_business_index), 
                        len(missing_user_index), len(missing_both_index)])/float(len(df_index)) * 100
width = 0.4
ind = np.array(range(1,5))
plt.rc('font', family='sans-serif') 
labels = ['Has all','Missing businesses','Missing users', 'Missing both']

barlist = plt.bar(ind, percentages)
plt.grid(axis='y', color='white', linestyle='-', lw=1)
plt.yticks([10,20,30,40])
plt.ylabel('Percentage')
plt.title('Missing Data Breakdown')

fmt = plt.ScalarFormatter(useOffset=False)
plt.gca().xaxis.set_major_formatter(fmt)
plt.xlim(0.8, 5.2)
plt.xticks(ind+width, labels )
remove_border()

for x, y in zip(ind, percentages):
    plt.annotate(("%.2f" + '%%') % y, (x+width, y + 0.8), ha='center')
barlist[0].set_color((0.20588235294117647, 0.7196078431372549, 0.5666666666666667))
barlist[3].set_color((0.20588235294117647, 0.5196078431372549, 0.4666666666666667))
plt.savefig('Missing_data.eps')



In [12]:

    
"""
takes a dataframe ldf, makes a copy of it, and returns the copy
with all averages and review counts recomputed
this is used when a frame is subsetted.
"""
def recompute_frame(ldf):
    ldfu=ldf.groupby('user_id')
    ldfb=ldf.groupby('business_id')
    user_avg=ldfu.rev_stars.mean()
    user_review_count=ldfu.review_id.count()
    business_avg=ldfb.rev_stars.mean()
    business_review_count=ldfb.review_id.count()
    nldf=ldf.copy()
    nldf.set_index(['business_id'], inplace=True)
    nldf['business_avg']=business_avg
    nldf['business_review_count']=business_review_count
    nldf.reset_index(inplace=True)
    nldf.set_index(['user_id'], inplace=True)
    nldf['user_avg']=user_avg
    nldf['user_review_count']=user_review_count
    nldf.reset_index(inplace=True)
    return nldf



In [13]:

    
# creates a smaller dataframe for users with small reviews and businesses with a small amount of reviews
# these values are currently the min for the smaller df to work
user_theshold = 2
business_theshold = 4
# smallidf=TrainMatrix[(TrainMatrix.user_review_count < user_theshold) & (TrainMatrix.bus_review_count < business_theshold)]
smallidf=TrainMatrix[(TrainMatrix.bus_review_count < business_theshold)]
smalldf=recompute_frame(smallidf)



In [14]:

    
# Derpy plot
print "Total Number of Reviews", smalldf.shape[0]
print "Users in this set", smalldf.user_id.unique().shape[0], "Restaurants",smalldf.business_id.unique().shape[0]
plt.figure()
ax=smalldf.groupby('user_id').review_id.count().hist()
remove_border(ax)
plt.xlabel("Reviews per user")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.figure()
ax=smalldf.groupby('business_id').review_id.count().hist()
remove_border(ax)
plt.xlabel("Reviews per restaurant")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')









    



Total Number of Reviews 7593
Users in this set 5120 Restaurants 2531



In [15]:

    
plt.figure()
avg_ratings_by_user=smalldf.groupby('user_id').rev_stars.mean()
ax=avg_ratings_by_user.hist()
remove_border(ax)
plt.xlabel("Average review score")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Average User Rating")
plt.figure()

avg_ratings_by_biz=smalldf.groupby('business_id').rev_stars.mean()
ax=avg_ratings_by_biz.hist()
remove_border(ax)
plt.xlabel("Average review score")
plt.grid(False)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.title("Average Restaurant Rating")
plt.figure()

print smalldf.rev_stars.mean()
plt.figure()









    



3.63400500461






    Out[15]:





<matplotlib.figure.Figure at 0x10ab53e90>






    












    












    





<matplotlib.figure.Figure at 0x10a9db650>






    





<matplotlib.figure.Figure at 0x10ab53e90>



In [59]:

    
# make plot for final results
N = 6
fullRMSE = [1.404,1.394,1.310,1.341,1.435, 1.446]

ind = np.arange(N)  # the x locations for the groups
width = 0.275       # the width of the bars

plt.rc('font', family='sans-serif') 
plt.rcParams.update({'font.size': 10})

fig, ax = plt.subplots()
rects1 = ax.bar(ind, fullRMSE, width, color=dark2_colors[1])

splitRMSE = [1.332,1.338,1.306,1.300,1.431, 1.422]
rects2 = ax.bar(ind+width, splitRMSE, width, color=dark2_colors[0])

randRMSE = [1.434,1.412,1.379,1.377,1.453, 1.474]
rects3 = ax.bar(ind+2*width, randRMSE, width, color=dark2_colors[2])

# add some text for labels, title and axes ticks
ax.set_ylabel('RMSE')
ax.set_title('RMSE by Missing Data Approach and Models')
ax.set_xticks(ind+width+0.125)
ax.set_xticklabels(('Linear Regression', 'Ridge Regression', 'Lasso', 'Elastic Net', 'Random Forest', 'Fact. Machine') )
ax.legend((rects1[0], rects2[0], rects3[0]), ('Mean', 'Random values','Predicted values'), fontsize = 'small', frameon = False)
# ax.legend((rects1[0], rects2[0], rects3[0]), ('Mean', 'Random values','Predicted values'),bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0. )

def autolabel(rects):
    # attach some text labels
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., height, '%.3f'%height,
                ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)
plt.grid(axis = 'y', color ='white', linestyle='-')
plt.ylim(1.2,1.55)
leg = plt.legend()
remove_border()
plt.savefig('Results.eps')



In [60]:

    
# visualize the 3 methods  
percentages = np.array([1.300, 1.312, 1.345, 1.258 ])
width = 0.4
ind = np.array(range(1,5))
plt.rc('font', family='sans-serif') 
plt.rcParams.update({'font.size': 14})
labels = ['No change','Collaborative Filtering','PCA','Segmentation Ensemble']

barlist = plt.bar(ind, percentages)
plt.grid(axis='y', color='white', linestyle='-', lw=1)
plt.ylabel('RMSE')

fmt = plt.ScalarFormatter(useOffset=False)
plt.gca().xaxis.set_major_formatter(fmt)
plt.xlim(0.8, 5.2)
plt.xticks(ind+width, labels)
plt.ylim(1.2, 1.4)
plt.title('Best Model + Other Techniques')
remove_border()

for x, y in zip(ind, percentages):
    plt.annotate(("%.3f") % y, (x+width, y + 0.0025), ha='center')
plt.savefig('Results2.eps')



In [18]:

    
def not_so_quick_train(block):
    block.replace([np.inf, -np.inf], np.nan)
    block.fillna(value=1)
    review_stars_vector = block.rev_stars.values
    user_name = block.user_name.values
    user_average_stars = block.user_average_stars.values
    bus_open = block.bus_open.values
    bus_stars = block.bus_stars.values
    bus_review_count = block.bus_review_count.values
    user_review_count = block.user_review_count.values
    features = [user_average_stars,bus_open,bus_stars,bus_review_count,user_review_count]
    X = np.matrix(features).T
    Y = np.matrix(review_stars_vector).T
    return X,Y



In [19]:

    
from sklearn.cross_validation import train_test_split
X, Y = not_so_quick_train(TrainMatrix)



In [20]:

    
f = np.isinf(X)
for i in f:
    print i
    break









    



[[False False False False False]]



In [21]:

    
X = np.nan_to_num(X)
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)



In [22]:

    
from sklearn import cross_validation, linear_model
clf = linear_model.LinearRegression().fit(xtrain, ytrain)
print "RMSE: %.2f" % np.sqrt(np.mean((clf.predict(xtest) - ytest) ** 2))









    



RMSE: 1.05



In [23]:

    
TestMatrix.head(2)









    Out[23]:






  
    
      
      business_id
      review_id
      rev_type
      user_id
      bus_categories
      bus_city
      bus_full_address
      bus_latitude
      bus_longitude
      bus_name
      bus_neighborhoods
      bus_open
      bus_review_count
      bus_stars
      bus_state
      bus_type
      cool
      funny
      gender
      useful
      user_average_stars
      user_name
      user_review_count
      user_type
      user_votes
    
  
  
    
      0
       -sC66z4SO3tR7nFCjfQwuQ
       Wv-4SQr9UUztIBnjzHu9-g
       review
       dqeFcKq2L2wiOg9LFT9-UA
                                [Mexican, Restaurants]
       Phoenix
       401 W Clarendon Ave\nPhoenix, AZ 85013
       33.491120
      -112.079081
       Gallo Blanco Cafe
       []
       True
       549
       4.0
       AZ
       business
       6
       8
      -1
       20
       4.05
       Ty
       63
       user
       {u'funny': 8, u'useful': 20, u'cool': 6}
    
    
      1
       qw93CjlAZ6a4ff11Z-hF3Q
       ZN-kVEwrIQouWyLp-d6R5A
       review
       dqeFcKq2L2wiOg9LFT9-UA
       [Arts & Entertainment, Nightlife, Music Venues]
       Phoenix
             308 N 2nd Ave\nPhoenix, AZ 85003
       33.451716
      -112.076437
       Crescent Ballroom
       []
       True
       145
       4.5
       AZ
       business
       6
       8
      -1
       20
       4.05
       Ty
       63
       user
       {u'funny': 8, u'useful': 20, u'cool': 6}



In [24]:

    
TestMatrix.bus_full_address = [v[-5:] for v in TestMatrix.bus_full_address.values]
TestMatrix.head(2)









    Out[24]:






  
    
      
      business_id
      review_id
      rev_type
      user_id
      bus_categories
      bus_city
      bus_full_address
      bus_latitude
      bus_longitude
      bus_name
      bus_neighborhoods
      bus_open
      bus_review_count
      bus_stars
      bus_state
      bus_type
      cool
      funny
      gender
      useful
      user_average_stars
      user_name
      user_review_count
      user_type
      user_votes
    
  
  
    
      0
       -sC66z4SO3tR7nFCjfQwuQ
       Wv-4SQr9UUztIBnjzHu9-g
       review
       dqeFcKq2L2wiOg9LFT9-UA
                                [Mexican, Restaurants]
       Phoenix
       85013
       33.491120
      -112.079081
       Gallo Blanco Cafe
       []
       True
       549
       4.0
       AZ
       business
       6
       8
      -1
       20
       4.05
       Ty
       63
       user
       {u'funny': 8, u'useful': 20, u'cool': 6}
    
    
      1
       qw93CjlAZ6a4ff11Z-hF3Q
       ZN-kVEwrIQouWyLp-d6R5A
       review
       dqeFcKq2L2wiOg9LFT9-UA
       [Arts & Entertainment, Nightlife, Music Venues]
       Phoenix
       85003
       33.451716
      -112.076437
       Crescent Ballroom
       []
       True
       145
       4.5
       AZ
       business
       6
       8
      -1
       20
       4.05
       Ty
       63
       user
       {u'funny': 8, u'useful': 20, u'cool': 6}



In [25]:

    
t = TestMatrix.groupby('bus_full_address').aggregate(np.mean)
t.fillna(3.67)
t.head(2)









    Out[25]:






  
    
      
      bus_latitude
      bus_longitude
      bus_open
      bus_review_count
      bus_stars
      cool
      funny
      gender
      useful
      user_average_stars
      user_review_count
    
    
      bus_full_address
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      85003
       33.455410
      -112.077656
       0.996870
       195.846635
       3.936929
       168.607595
       135.949367
       0.123418
       239.664557
       3.750158
       61.397727
    
    
      85004
       33.455338
      -112.071558
       0.983278
       154.765329
       3.832871
       208.756571
       158.617143
       0.093714
       301.728000
       3.814171
       77.067291



In [26]:

    
# START CF
TestMatrix2 = testreviews.merge(trainingbusiness,on="business_id")
TestMatrix2 = TestMatrix2.merge(trainingusers,on="user_id")
TestMatrix2.head(2)









    Out[26]:






  
    
      
      business_id
      review_id
      rev_type
      user_id
      bus_categories
      bus_city
      bus_full_address
      bus_latitude
      bus_longitude
      bus_name
      bus_neighborhoods
      bus_open
      bus_review_count
      bus_stars
      bus_state
      bus_type
      user_average_stars
      user_name
      user_review_count
      user_type
      user_votes
      gender
      cool
      useful
      funny
    
  
  
    
      0
       -sC66z4SO3tR7nFCjfQwuQ
       Wv-4SQr9UUztIBnjzHu9-g
       review
       dqeFcKq2L2wiOg9LFT9-UA
                                [Mexican, Restaurants]
       Phoenix
       401 W Clarendon Ave\nPhoenix, AZ 85013
       33.491120
      -112.079081
       Gallo Blanco Cafe
       []
       True
       549
       4.0
       AZ
       business
       4.05
       Ty
       63
       user
       {u'funny': 8, u'useful': 20, u'cool': 6}
      -1
       6
       20
       8
    
    
      1
       qw93CjlAZ6a4ff11Z-hF3Q
       ZN-kVEwrIQouWyLp-d6R5A
       review
       dqeFcKq2L2wiOg9LFT9-UA
       [Arts & Entertainment, Nightlife, Music Venues]
       Phoenix
             308 N 2nd Ave\nPhoenix, AZ 85003
       33.451716
      -112.076437
       Crescent Ballroom
       []
       True
       145
       4.5
       AZ
       business
       4.05
       Ty
       63
       user
       {u'funny': 8, u'useful': 20, u'cool': 6}
      -1
       6
       20
       8



In [27]:

    
smallidf=TrainMatrix[(TrainMatrix.user_review_count > 80) & (TrainMatrix.bus_review_count > 200)]
smalldf=recompute_frame(smallidf)
print len(smalldf.index)
smalldf.head(2)









    



13510






    Out[27]:






  
    
      
      user_id
      business_id
      rev_date
      review_id
      rev_stars
      rev_text
      rev_type
      rev_votes
      bus_categories
      bus_city
      bus_full_address
      bus_latitude
      bus_longitude
      bus_name
      bus_neighborhoods
      bus_open
      bus_review_count
      bus_stars
      bus_state
      bus_type
      cool
      funny
      gender
      useful
      user_average_stars
      user_name
      user_review_count
      user_type
      user_votes
      business_avg
      business_review_count
      user_avg
    
  
  
    
      0
       rLtl8ZkDX5vH5nAx9C3q5Q
       b5cEoKR8iQliq-yT2_O0LQ
       2010-05-05
       j67R8BK-_IIgH18TX0U4Kw
       3
       I went here last night with a party of 4.  It ...
       review
       {u'funny': 0, u'useful': 0, u'cool': 0}
       [Nightlife, Breakfast & Brunch, Mexican, Bars,...
       Scottsdale
       3313 N Hayden Rd\nScottsdale, AZ 85251
       33.486918
      -111.908738
         Carlsbad Tavern
       []
       True
       262
       4
       AZ
       business
       322
       331
      -1
       1034
       3.72
       Jason
       49
       user
       {u'funny': 331, u'useful': 1034, u'cool': 322}
       3.572917
        96
       3.714286
    
    
      1
       rLtl8ZkDX5vH5nAx9C3q5Q
       qjmCVYkwP-HDa35jwYucbQ
       2009-02-13
       dWNb4oNditcsD1_IsEHBag
       4
       This place is very interesting and there is no...
       review
       {u'funny': 0, u'useful': 0, u'cool': 0}
                           [American (New), Restaurants]
          Phoenix
       2502 E Camelback Rd\nPhoenix, AZ 85016
       33.510207
      -112.027204
       True Food Kitchen
       []
       True
       434
       4
       AZ
       business
       322
       331
      -1
       1034
       3.72
       Jason
       49
       user
       {u'funny': 331, u'useful': 1034, u'cool': 322}
       3.744000
       125
       3.714286



In [28]:

    
from scipy.stats.stats import pearsonr
def pearson_sim(rest1_reviews, rest2_reviews, n_common):
    """
    Given a subframe of restaurant 1 reviews and a subframe of restaurant 2 reviews,
    where the reviewers are those who have reviewed both restaurants, return 
    the pearson correlation coefficient between the user average subtracted ratings.
    The case for zero common reviewers is handled separately. Its
    ok to return a NaN if any of the individual variances are 0.
    """
    if n_common==0:
        rho=0.
    else:
        diff1=rest1_reviews['rev_stars']-rest1_reviews['user_average_stars']
        diff2=rest2_reviews['rev_stars']-rest2_reviews['user_average_stars']
        rho=pearsonr(diff1, diff2)[0]
    return rho



In [29]:

    
def get_restaurant_reviews(restaurant_id, df, set_of_users):
    """
    given a resturant id and a set of reviewers, return the sub-dataframe of their
    reviews.
    """
    mask = (df.user_id.isin(set_of_users)) & (df.business_id==restaurant_id)
    reviews = df[mask]
    reviews = reviews[reviews.user_id.duplicated()==False]
    return reviews



In [30]:

    
"""
Function
--------
calculate_similarity

Parameters
----------
rest1 : string
    The id of restaurant 1
rest2 : string
    The id of restaurant 2
df : DataFrame
  A dataframe of reviews, such as the smalldf above
similarity_func : func
  A function like pearson_sim above which takes two dataframes of individual
  restaurant reviews made by a common set of reviewers, and the number of
  common reviews. This function returns the similarity of the two restaurants
  based on the common reviews.
  
Returns
--------
A tuple
  The first element of the tuple is the similarity and the second the
  common support n_common. If the similarity is a NaN, set it to 0
"""
#your code here
def calculate_similarity(rest1, rest2, df, similarity_func):
    # find common reviewers
    rest1_reviewers = df[df.business_id==rest1].user_id.unique()
    rest2_reviewers = df[df.business_id==rest2].user_id.unique()
    common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers)
    n_common=len(common_reviewers)
    #get reviews
    rest1_reviews = get_restaurant_reviews(rest1, df, common_reviewers)
    rest2_reviews = get_restaurant_reviews(rest2, df, common_reviewers)
    sim=similarity_func(rest1_reviews, rest2_reviews, n_common)
    if np.isnan(sim):
        return 0, n_common
    return sim, n_common



In [31]:

    
class Database:
    "A class representing a database of similaries and common supports"
    
    def __init__(self, df):
        "the constructor, takes a reviews dataframe like smalldf as its argument"
        database={}
        self.df=df
        self.uniquebizids={v:k for (k,v) in enumerate(df.business_id.unique())}
        keys=self.uniquebizids.keys()
        l_keys=len(keys)
        self.database_sim=np.zeros([l_keys,l_keys])
        self.database_sup=np.zeros([l_keys, l_keys], dtype=np.int)
        
    def populate_by_calculating(self, similarity_func):
        """
        a populator for every pair of businesses in df. takes similarity_func like
        pearson_sim as argument
        """
        items=self.uniquebizids.items()
        for b1, i1 in items:
            for b2, i2 in items:
                if i1 < i2:
                    sim, nsup=calculate_similarity(b1, b2, self.df, similarity_func)
                    self.database_sim[i1][i2]=sim
                    self.database_sim[i2][i1]=sim
                    self.database_sup[i1][i2]=nsup
                    self.database_sup[i2][i1]=nsup
                elif i1==i2:
                    nsup=self.df[self.df.business_id==b1].user_id.count()
                    self.database_sim[i1][i1]=1.
                    self.database_sup[i1][i1]=nsup
                    

    def get(self, b1, b2):
        "returns a tuple of similarity,common_support given two business ids"
        sim=self.database_sim[self.uniquebizids[b1]][self.uniquebizids[b2]]
        nsup=self.database_sup[self.uniquebizids[b1]][self.uniquebizids[b2]]
        return (sim, nsup)



In [32]:

    
db=Database(smalldf)
db.populate_by_calculating(pearson_sim)









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-32-6c257d8d42ad> in <module>()
      1 db=Database(smalldf)
----> 2 db.populate_by_calculating(pearson_sim)

<ipython-input-31-7292897fbdd7> in populate_by_calculating(self, similarity_func)
     21             for b2, i2 in items:
     22                 if i1 < i2:
---> 23                     sim, nsup=calculate_similarity(b1, b2, self.df, similarity_func)
     24                     self.database_sim[i1][i2]=sim
     25                     self.database_sim[i2][i1]=sim

<ipython-input-30-b976b1b42a7d> in calculate_similarity(rest1, rest2, df, similarity_func)
     27 def calculate_similarity(rest1, rest2, df, similarity_func):
     28     # find common reviewers
---> 29     rest1_reviewers = df[df.business_id==rest1].user_id.unique()
     30     rest2_reviewers = df[df.business_id==rest2].user_id.unique()
     31     common_reviewers = set(rest1_reviewers).intersection(rest2_reviewers)

//anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in wrapper(self, other)
    562 
    563             # scalars
--> 564             res = na_op(values, other)
    565             if np.isscalar(res):
    566                 raise TypeError('Could not compare %s type with Series'

//anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in na_op(x, y)
    525                     result = lib.vec_compare(x, y, op)
    526             else:
--> 527                 result = lib.scalar_compare(x, y, op)
    528         else:
    529 

KeyboardInterrupt:



In [ ]:

    
def shrunk_sim(sim, n_common, reg=3.):
    "takes a similarity and shrinks it down by using the regularizer"
    ssim=(n_common*sim)/(n_common+reg)
    return ssim



In [ ]:

    
"""
Function
--------
knearest

Parameters
----------
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
set_of_restaurants : array
    The set of restaurants from which we want to find the nearest neighbors
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get the similarity
  of two businessed. e.g. dbase.get(rid1,rid2)
k : int
    the number of nearest neighbors desired, default 7
reg: float
    the regularization.
    
  
Returns
--------
A sorted list
    of the top k similar restaurants. The list is a list of tuples
    (business_id, shrunken similarity, common support).
"""
#your code here
from operator import itemgetter
def knearest(restaurant_id, set_of_restaurants, dbase, k=7, reg=3.):
    """
    Given a restaurant_id, dataframe, and database, get a sorted list of the
    k most similar restaurants from the entire database.
    """
    similars=[]
    for other_rest_id in set_of_restaurants:
        if other_rest_id!=restaurant_id:
            sim, nc=dbase.get(restaurant_id, other_rest_id)
            ssim=shrunk_sim(sim, nc, reg=reg)
            similars.append((other_rest_id, ssim, nc ))
    similars=sorted(similars, key=itemgetter(1), reverse=True)
    return similars[0:k]



In [ ]:

    
testbizid = 'b5cEoKR8iQliq-yT2_O0LQ'
tops=knearest(testbizid, smalldf.business_id.unique(), db, k=7, reg=3.)



In [ ]:

    
tops



In [ ]:

    
x = [1,2,3,4,5]



In [ ]:

    
x[1:2] + x[3::]
1+1



In [ ]:

    
missing_none_df = TestMatrix.iloc[missing_none, :]
missing_none_df.head(2)



In [ ]:

    
R = TrainMatrix[TrainMatrix.business_id.isin(missing_none_df.business_id.values)]
R = R[TrainMatrix.user_id.isin(missing_none_df.user_id.values)]
print len(R.index)
R.head(2)



In [ ]:

    
R = R.pivot(index = 'business_id', columns= 'user_id', values = 'rev_stars')
R_bool = R.notnull()
R_bool[R_bool == True] = 1
R_bool[R_bool == False] = 0
R_bool.fillna(value=0, inplace=True)
R.fillna(value=0, inplace=True)
R_matrix = np.matrix(R.values)
R_matrix



In [ ]:

    
import scipy.io
scipy.io.savemat('R_matrix.mat', mdict={'Y': R_matrix})



In [ ]:

    
results = scipy.io.loadmat('R_matrix.mat')



In [ ]:

    
predictions = {}
for i in missing_none_df:
    predictions[i.review_id] = results[i.user_id][i.business_id]
pd.DataFrame(predictions).to_csv('cf.csv', index = False)



In [ ]:



In [ ]:

	user_average_stars	user_name	user_review_count	user_type	user_id	user_votes	gender	cool	useful	funny
0	5	Jim	6	user	CR2y7yEm4X035ZMzrTtN9Q	{u'funny': 0, u'useful': 7, u'cool': 0}	-1	0	7	0
1	1	Kelle	2	user	_9GXoHhdxc30ujPaQwh6Ew	{u'funny': 0, u'useful': 1, u'cool': 0}	1	0	1	0

	bus_categories	bus_city	bus_full_address	bus_latitude	bus_longitude	bus_name	bus_neighborhoods	bus_open	bus_review_count	bus_stars	bus_state	bus_type	business_id
2795	[Banks & Credit Unions, Financial Services]	Chandler	1949 W Ray Rd Ste 34\nChandler, AZ 85224	33.319395	-111.874965	Desert Schools Federal Credit Union	[]	True	3	NaN	AZ	business	IpiCTbW1u04ytps6tFF4QQ
2796	[Mexican, Restaurants]	Phoenix	3400 E Sky Harbor Blvd\nPhoenix, AZ 85034	33.436527	-111.998996	Barrio Cafe	[]	True	73	NaN	AZ	business	LbcDWyqGgQdlwWfGO-whMw

	business_id	rev_date	review_id	rev_stars	rev_text	rev_type	user_id	rev_votes	bus_categories	bus_city	bus_full_address	bus_latitude	bus_longitude	bus_name	bus_neighborhoods	bus_open	bus_review_count	bus_stars	bus_state	bus_type	cool	funny	gender	useful	user_average_stars	user_name	user_review_count	user_type	user_votes
229905	x59g-quKyKqh7VDAWhV-vQ	2008-05-26	T39VKXR2UWzDMniI2dpl3A	5	Home of Randi Rhodes and Mike Molloy progressi...	review	l1eXX3p2WL_02FWw5TXhsA	{u'funny': 1, u'useful': 3, u'cool': 2}	[Mass Media, Radio Stations]	Phoenix	Phoenix, AZ 85034	33.438279	-112.017829	Nova M Radio Network	[]	False	3	4.5	AZ	business	2780	1687	-1	3393	3.81	David	805	user	{u'funny': 1687, u'useful': 3393, u'cool': 2780}
229906	qsZpOYEttt8spp4n7YWZyQ	2012-08-21	Q-W-NxtCTI3pnKcELJXI8w	5	This shop was recommended by Ric's Auto Body, ...	review	Lexfy0u3R9y9qrJo9qOywg	{u'funny': 0, u'useful': 0, u'cool': 0}	[Automotive, Auto Parts & Supplies]	Scottsdale	8260 E Raintree Dr\nSte 2\nScottsdale, AZ 85260	33.619639	-111.902600	Unique Upholstery	[]	True	3	4.5	AZ	business	1	0	-1	4	4.20	Ian	5	user	{u'funny': 0, u'useful': 4, u'cool': 1}

	business_id	review_id	rev_type	user_id	bus_categories	bus_city	bus_full_address	bus_latitude	bus_longitude	bus_name	bus_neighborhoods	bus_open	bus_review_count	bus_stars	bus_state	bus_type	cool	funny	gender	useful	user_average_stars	user_name	user_review_count	user_type	user_votes
32770	7QSYBp2-AOdyUJXEaLnbgA	T8CXO1Ct0FNsMmWVRJGQEQ	review	ngQAmiYfy9QWSrSIH2gXtw	[Steakhouses, American (New), Restaurants]	Scottsdale	3821 N Scottsdale Rd\nScottsdale, AZ 85251	33.492381	-111.925947	Bandera	[]	True	188	4	AZ	business	NaN	NaN	NaN	NaN	NaN	Mike	3	user	NaN
32771	7QSYBp2-AOdyUJXEaLnbgA	FbsL_xluHuqgMps2w7weug	review	NpIuE6NDGhBET1_Q1j3rXQ	[Steakhouses, American (New), Restaurants]	Scottsdale	3821 N Scottsdale Rd\nScottsdale, AZ 85251	33.492381	-111.925947	Bandera	[]	True	188	4	AZ	business	NaN	NaN	NaN	NaN	NaN	Jenny	2	user	NaN

	business_id	review_id	rev_type	user_id	bus_categories	bus_city	bus_full_address	bus_latitude	bus_longitude	bus_name	bus_neighborhoods	bus_open	bus_review_count	bus_stars	bus_state	bus_type	cool	funny	gender	useful	user_average_stars	user_name	user_review_count	user_type	user_votes
0	-sC66z4SO3tR7nFCjfQwuQ	Wv-4SQr9UUztIBnjzHu9-g	review	dqeFcKq2L2wiOg9LFT9-UA	[Mexican, Restaurants]	Phoenix	401 W Clarendon Ave\nPhoenix, AZ 85013	33.491120	-112.079081	Gallo Blanco Cafe	[]	True	549	4.0	AZ	business	6	8	-1	20	4.05	Ty	63	user	{u'funny': 8, u'useful': 20, u'cool': 6}
1	qw93CjlAZ6a4ff11Z-hF3Q	ZN-kVEwrIQouWyLp-d6R5A	review	dqeFcKq2L2wiOg9LFT9-UA	[Arts & Entertainment, Nightlife, Music Venues]	Phoenix	308 N 2nd Ave\nPhoenix, AZ 85003	33.451716	-112.076437	Crescent Ballroom	[]	True	145	4.5	AZ	business	6	8	-1	20	4.05	Ty	63	user	{u'funny': 8, u'useful': 20, u'cool': 6}

	bus_latitude	bus_longitude	bus_open	bus_review_count	bus_stars	cool	funny	gender	useful	user_average_stars	user_review_count
bus_full_address
85003	33.455410	-112.077656	0.996870	195.846635	3.936929	168.607595	135.949367	0.123418	239.664557	3.750158	61.397727
85004	33.455338	-112.071558	0.983278	154.765329	3.832871	208.756571	158.617143	0.093714	301.728000	3.814171	77.067291

	user_id	business_id	rev_date	review_id	rev_stars	rev_text	rev_type	rev_votes	bus_categories	bus_city	bus_full_address	bus_latitude	bus_longitude	bus_name	bus_neighborhoods	bus_open	bus_review_count	bus_stars	bus_state	bus_type	cool	funny	gender	useful	user_average_stars	user_name	user_review_count	user_type	user_votes	business_avg	business_review_count	user_avg
0	rLtl8ZkDX5vH5nAx9C3q5Q	b5cEoKR8iQliq-yT2_O0LQ	2010-05-05	j67R8BK-_IIgH18TX0U4Kw	3	I went here last night with a party of 4. It ...	review	{u'funny': 0, u'useful': 0, u'cool': 0}	[Nightlife, Breakfast & Brunch, Mexican, Bars,...	Scottsdale	3313 N Hayden Rd\nScottsdale, AZ 85251	33.486918	-111.908738	Carlsbad Tavern	[]	True	262	4	AZ	business	322	331	-1	1034	3.72	Jason	49	user	{u'funny': 331, u'useful': 1034, u'cool': 322}	3.572917	96	3.714286
1	rLtl8ZkDX5vH5nAx9C3q5Q	qjmCVYkwP-HDa35jwYucbQ	2009-02-13	dWNb4oNditcsD1_IsEHBag	4	This place is very interesting and there is no...	review	{u'funny': 0, u'useful': 0, u'cool': 0}	[American (New), Restaurants]	Phoenix	2502 E Camelback Rd\nPhoenix, AZ 85016	33.510207	-112.027204	True Food Kitchen	[]	True	434	4	AZ	business	322	331	-1	1034	3.72	Jason	49	user	{u'funny': 331, u'useful': 1034, u'cool': 322}	3.744000	125	3.714286