San Diego Burrito Analytics

Scott Cole

23 April 2016

This notebook contains analyses on the burrito ratings in San Diego, including:

  • How each metric correlates with one another.
  • Linear model of how each dimension contributes to the overall rating

Default imports


In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
sns.set_style("white")


C:\Users\Scott\Anaconda2\lib\site-packages\matplotlib\__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Load data


In [180]:
filename="burrito_current.csv"
df = pd.read_csv(filename)
N = df.shape[0]

Metric correlations


In [3]:
dfcorr = df.corr()

In [73]:
from tools.misc import pearsonp
metricscorr = ['Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
M = len(metricscorr)

Mcorrmat = np.zeros((M,M))
Mpmat = np.zeros((M,M))
for m1 in range(M):
    for m2 in range(M):
        if m1 != m2:
            Mcorrmat[m1,m2] = dfcorr[metricscorr[m1]][metricscorr[m2]]
            Mpmat[m1,m2] = pearsonp(Mcorrmat[m1,m2],N)

In [84]:
from matplotlib import cm

clim1 = (-1,1)
plt.figure(figsize=(12,10))
cax = plt.pcolor(range(M+1), range(M+1), Mcorrmat, cmap=cm.bwr)
cbar = plt.colorbar(cax, ticks=(-1,-.5,0,.5,1))
cbar.ax.set_ylabel('Pearson correlation (r)', size=30)
plt.clim(clim1)
cbar.ax.set_yticklabels((-1,-.5,0,.5,1),size=20)
#plt.axis([2, M+1, floall[0],floall[-1]+10])
ax = plt.gca()
ax.set_yticks(np.arange(M)+.5)
ax.set_yticklabels(metricscorr,size=25)
ax.set_xticks(np.arange(M)+.5)
ax.set_xticklabels(metricscorr,size=25)
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.xlim((0,M))
plt.ylim((0,M))

figname = 'metriccorrmat'
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')



In [6]:
sp.stats.pearsonr(df.Hunger,df.overall)


Out[6]:
(0.1969167162116611, 0.045117643507527491)

In [7]:
print Mpmat[0]
print Mcorrmat[0]


[ 0.          0.42936818  0.77239476  0.10259077  0.07085038  0.17569026
  0.01958646  0.13872519  0.1804102   0.26842165  0.05236806  0.84900154
  0.04511764]
[ 0.          0.07832013  0.02870377  0.16097189  0.17786999  0.13380405
  0.22860805  0.14616602  0.13236916  0.10951817  0.19079653 -0.01889624
  0.19691672]

Negative correlation: Cost and volume


In [155]:
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='Cost',y='Volume',ax=ax,**{'s':40,'color':'k'})
plt.xlabel('Cost ($)',size=20)
plt.ylabel('Volume (L)',size=20)
plt.xticks(np.arange(5,11),size=15)
plt.yticks(np.arange(.6,1.2,.1),size=15)
plt.tight_layout()
print df.corr()['Cost']['Volume']
from tools.misc import pearsonp
print pearsonp(df.corr()['Cost']['Volume'],len(df[['Cost','Volume']].dropna()))

figname = 'corr-volume-cost'
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')


-0.379087488241
0.0425542151887

In [9]:
# Visualize some correlations

from tools.plt import scatt_corr
scatt_corr(df['overall'].values,df['Meat'].values,
          xlabel = 'overall rating', ylabel='meat rating', xlim = (-.5,5.5),ylim = (-.5,5.5),xticks=range(6),yticks=range(6))
          #showline = True)
    
scatt_corr(df['overall'].values,df['Wrap'].values,
          xlabel = 'overall rating', ylabel='wrap integrity rating', xlim = (-.5,5.5),ylim = (-.5,5.5),xticks=range(6),yticks=range(6))
          #showline = True)


Linear regression: ingredients


In [10]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

# Get all ingredient keys
startingredients = 29
ingredientkeys = df.keys()[startingredients:]
# Get all ingredient keys with at least 10 burritos
Nlim = 10
ingredientkeys = ingredientkeys[df.count()[startingredients:].values>=Nlim]
# Make a dataframe for all ingredient keys
dfing = df[ingredientkeys]

# For each key, make binary
for k in dfing.keys():
    dfing[k] = dfing[k].map({'x':1,'X':1,1:1})
    dfing[k] = dfing[k].fillna(0)


C:\Users\Scott\Anaconda2\lib\site-packages\ipykernel\__main__.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\Scott\Anaconda2\lib\site-packages\ipykernel\__main__.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [21]:
import statsmodels.api as sm
X = sm.add_constant(dfing)
y = df.overall
lm = sm.GLM(y,X)
res = lm.fit()
print(res.summary())
origR2 = 1 - np.var(res.resid_pearson) / np.var(y)
print origR2


                  Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                overall   No. Observations:                  104
Model:                            GLM   Df Residuals:                       93
Model Family:                Gaussian   Df Model:                           10
Link Function:               identity   Scale:                   0.56401902106
Method:                          IRLS   Log-Likelihood:                -111.98
Date:                Thu, 19 May 2016   Deviance:                       52.454
Time:                        14:24:52   Pearson chi2:                     52.5
No. Iterations:                     4                                         
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          3.5362      0.230     15.358      0.000         3.085     3.987
Beef          -0.0077      0.249     -0.031      0.975        -0.496     0.481
Pico          -0.0203      0.186     -0.109      0.913        -0.385     0.344
Guac          -0.0581      0.185     -0.314      0.753        -0.420     0.304
Cheese         0.1772      0.207      0.856      0.392        -0.228     0.583
Fries         -0.0657      0.224     -0.293      0.770        -0.506     0.374
Sour cream     0.3432      0.212      1.618      0.106        -0.072     0.759
Pork          -0.0618      0.302     -0.204      0.838        -0.654     0.530
Rice          -0.1220      0.316     -0.386      0.700        -0.742     0.498
Beans         -0.4188      0.284     -1.475      0.140        -0.975     0.138
Sauce          0.0329      0.284      0.116      0.908        -0.523     0.589
==============================================================================
0.117933999958

In [22]:
Nsurr = 1000
randr2 = np.zeros(Nsurr)
for n in range(Nsurr):
    Xrand = np.random.rand(X.shape[0],X.shape[1])
    Xrand[:,0] = np.ones(X.shape[0])
    lm = sm.GLM(y,Xrand)
    res = lm.fit()
    randr2[n] = 1 - np.var(res.resid_pearson) / np.var(y)
print 'p = ' , np.mean(randr2>origR2)


p =  0.271

In [23]:
# Is this a null result? let's do t-tests
for k in dfing.keys():
    withk = df.overall[dfing[k].values==1].values
    nok = df.overall[dfing[k].values==0].values
    print k
    print sp.stats.ttest_ind(withk,nok)


Beef
Ttest_indResult(statistic=1.0891771583077212, pvalue=0.27864241519037963)
Pico
Ttest_indResult(statistic=-0.28267673336890714, pvalue=0.77799749213626745)
Guac
Ttest_indResult(statistic=0.19211886327208949, pvalue=0.84803106376561266)
Cheese
Ttest_indResult(statistic=2.2706732593962577, pvalue=0.025268640435202969)
Fries
Ttest_indResult(statistic=1.9956571700500678, pvalue=0.048637556836881514)
Sour cream
Ttest_indResult(statistic=2.638488394917581, pvalue=0.0096327087865926107)
Pork
Ttest_indResult(statistic=-1.3310563095868717, pvalue=0.18613805315019669)
Rice
Ttest_indResult(statistic=-1.3060790466877805, pvalue=0.19446342540246225)
Beans
Ttest_indResult(statistic=-2.2415811731663067, pvalue=0.027154571614590502)
Sauce
Ttest_indResult(statistic=-0.33710046699686136, pvalue=0.73673382272805088)

Taco Stand case study: Meat-fillings correlation


In [61]:
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='Meat',y='Fillings',ax=ax,**{'s':40,'color':'k','alpha':.1})
plt.xlabel('Meat rating',size=20)
plt.ylabel('Non-meat rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
print df.corr()['Meat']['Fillings']
from tools.misc import pearsonp
print pearsonp(df.corr()['Meat']['Fillings'],len(df[['Meat','Fillings']].dropna()))

figname = 'corr-meat-filling'
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')


0.616481931196
5.27321497898e-12

In [170]:
# How many burritos at taco stand?
restrictCali = False
import re
reTS = re.compile('.*taco stand.*', re.IGNORECASE)
reCali = re.compile('.*cali.*', re.IGNORECASE)
locTS = np.ones(len(df))
for i in range(len(df)):
    mat = reTS.match(df['Location'][i])
    if mat is None:
        locTS[i] = 0
    else:
        if restrictCali:
            mat = reCali.match(df['Burrito'][i])
            if mat is None:
                locTS[i] = 0
print sum(locTS)
temp = np.arange(len(df))
dfTS = df.loc[temp[locTS==1]]


16.0

In [171]:
plt.figure(figsize=(4,4))
ax = plt.gca()
dfTS.plot(kind='scatter',x='Meat',y='Fillings',ax=ax,**{'s':40,'color':'k','alpha':.1})
plt.xlabel('Meat rating',size=20)
plt.ylabel('Non-meat rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
print dfTS.corr()['Meat']['Fillings']
from tools.misc import pearsonp
print pearsonp(dfTS.corr()['Meat']['Fillings'],len(dfTS[['Meat','Fillings']].dropna()))

figname = 'corr-meat-filling-TS'
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')


0.753794967197
0.000744707419522

In [172]:
# Spearman correlation
dfMF = df[['Meat','Fillings']].dropna()
dfTSMF = dfTS[['Meat','Fillings']].dropna()
print sp.stats.spearmanr(dfMF.Meat,dfMF.Fillings)
print sp.stats.spearmanr(dfTSMF.Meat,dfTSMF.Fillings)


SpearmanrResult(correlation=0.57015785654260809, pvalue=3.9722535491729561e-10)
SpearmanrResult(correlation=0.64876240286196585, pvalue=0.0065514677363974136)

Hunger level slightly correlated to overall


In [63]:
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='Hunger',y='overall',ax=ax,**{'s':40,'color':'k'})
plt.xlabel('Hunger',size=20)
plt.ylabel('Overall rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
print df.corr()['Hunger']['overall']
from tools.misc import pearsonp
print pearsonp(df.corr()['Hunger']['overall'],len(df[['Hunger','overall']].dropna()))

figname = 'corr-hunger-overall'
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')


0.196916716212
0.0451176435075

Model overall as a function of each main dimension


In [159]:
# GLM for 
# Remove NANs
mainD = ['Hunger','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Wrap']
dffull = df[np.hstack((mainD,'overall'))].dropna()
X = sm.add_constant(dffull[mainD])
y = dffull['overall']
import statsmodels.api as sm
my_glm = sm.GLM(y,X)
res = my_glm.fit()
print(res.summary())
print 1 - np.var(res.resid_pearson) / np.var(y)


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                overall   No. Observations:                   89
Model:                            GLM   Df Residuals:                       78
Model Family:                Gaussian   Df Model:                           10
Link Function:               identity   Scale:                  0.152583346326
Method:                          IRLS   Log-Likelihood:                -36.753
Date:                Fri, 20 May 2016   Deviance:                       11.902
Time:                        17:23:00   Pearson chi2:                     11.9
No. Iterations:                     4                                         
================================================================================
                   coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
const           -0.1148      0.431     -0.267      0.790        -0.959     0.729
Hunger           0.0156      0.051      0.309      0.757        -0.083     0.115
Cost             0.0118      0.041      0.285      0.776        -0.069     0.093
Tortilla         0.0774      0.069      1.129      0.259        -0.057     0.212
Temp             0.0200      0.042      0.472      0.637        -0.063     0.103
Meat             0.2485      0.068      3.676      0.000         0.116     0.381
Fillings         0.3426      0.069      4.981      0.000         0.208     0.477
Meat:filling     0.1097      0.048      2.275      0.023         0.015     0.204
Uniformity       0.0702      0.041      1.696      0.090        -0.011     0.151
Salsa            0.1411      0.052      2.717      0.007         0.039     0.243
Wrap             0.0166      0.039      0.429      0.668        -0.059     0.093
================================================================================
0.710613100599

In [150]:
# Linear regression
# Note that this matches GLM above :D
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X,y)
print lm.intercept_
print lm.coef_

print 'R2 = ' + np.str(lm.score(X,y))


-0.114770195977
[ 0.          0.01562018  0.01180215  0.07741117  0.01995426  0.24847114
  0.34258428  0.10968035  0.07023424  0.14107771  0.0166265 ]
R2 = 0.710613100599

In [153]:
# Visualize coefficients
from tools.plt import bar
newidx = np.argsort(-res.params.values)
temp = np.arange(len(newidx))
newidx = np.delete(newidx,temp[newidx==0])
bar(res.params[newidx],res.bse[newidx],X.keys()[newidx],'Overall rating\nLinear model\ncoefficient',
    ylim =(0,.5),figsize=(11,3))
plt.plot()

figname = 'overall_metric_linearmodelcoef'
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')



In [45]:
sp.stats.pearsonr(X['Synergy'],y)[0]**2


Out[45]:
0.56482206397865531

Yelp and Google


In [233]:
# Average each metric over each Location
# Avoid case issues; in the future should avoid article issues
df.Location = df.Location.str.lower()
m_Location = ['Location','N','Yelp','Google','Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']

tacoshops = df.Location.unique()
TS = len(tacoshops)
dfmean = pd.DataFrame(np.nan, index=range(TS), columns=m_Location)
dfmean.Location = tacoshops
for ts in range(TS):
    dfmean['N'][ts] = sum(df.Location == tacoshops[ts])
    for m in m_Location[2:]:
        dfmean[m][ts] = df[m].loc[df.Location==tacoshops[ts]].mean()


C:\Users\Scott\Anaconda2\lib\site-packages\ipykernel\__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\Scott\Anaconda2\lib\site-packages\ipykernel\__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [237]:
metricscorr = ['Yelp','Google','Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
M = len(metricscorr)
dfmeancorr = dfmean.corr()
Mcorrmat = np.zeros((M,M))
Mpmat = np.zeros((M,M))
for m1 in range(M):
    for m2 in range(M):
        if m1 != m2:
            Mcorrmat[m1,m2] = dfmeancorr[metricscorr[m1]][metricscorr[m2]]
            Mpmat[m1,m2] = pearsonp(Mcorrmat[m1,m2],N)
            
clim1 = (-1,1)
plt.figure(figsize=(10,10))
cax = plt.pcolor(range(M+1), range(M+1), Mcorrmat, cmap=cm.bwr)
cbar = plt.colorbar(cax, ticks=(-1,-.5,0,.5,1))
cbar.ax.set_ylabel('Pearson correlation (r)', size=30)
plt.clim(clim1)
cbar.ax.set_yticklabels((-1,-.5,0,.5,1),size=20)
#plt.axis([2, M+1, floall[0],floall[-1]+10])
ax = plt.gca()
ax.set_yticks(np.arange(M)+.5)
ax.set_yticklabels(metricscorr,size=25)
ax.set_xticks(np.arange(M)+.5)
ax.set_xticklabels(metricscorr,size=9)
plt.tight_layout()



In [174]:
print Mcorrmat[0]
print Mpmat[0]


[ 0.          0.66250115  0.04234896  0.18431205 -0.1011487   0.36989643
 -0.02637736  0.31735503  0.36340765  0.06366544  0.00253354  0.32584581
  0.34068519  0.06612881  0.3418045 ]
[  0.00000000e+00   1.86616172e-14   6.69489926e-01   6.10705682e-02
   3.06935072e-01   1.11256808e-04   7.90399093e-01   1.02865079e-03
   1.49592730e-04   5.20829480e-01   9.79636293e-01   7.36969991e-04
   4.01934498e-04   5.04795116e-01   3.83494450e-04]

In [238]:
# GLM for Yelp
mainDo = ['Hunger','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
dffull = dfmean[np.hstack((mainDo,'Yelp'))].dropna()
X = sm.add_constant(dffull[mainDo])
y = dffull['Yelp']
import statsmodels.api as sm
my_glm = sm.GLM(y,X)
res = my_glm.fit()
print(res.summary())
print(res.pvalues)
print 1 - np.var(res.resid_pearson) / np.var(y)


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                   Yelp   No. Observations:                   28
Model:                            GLM   Df Residuals:                       15
Model Family:                Gaussian   Df Model:                           12
Link Function:               identity   Scale:                  0.199135164543
Method:                          IRLS   Log-Likelihood:                -8.3993
Date:                Sat, 21 May 2016   Deviance:                       2.9870
Time:                        14:33:37   Pearson chi2:                     2.99
No. Iterations:                     4                                         
================================================================================
                   coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
const            3.5667      1.317      2.708      0.007         0.986     6.148
Hunger          -0.0374      0.266     -0.140      0.888        -0.559     0.484
Cost            -0.0716      0.129     -0.555      0.579        -0.325     0.181
Tortilla         0.2793      0.182      1.537      0.124        -0.077     0.635
Temp            -0.1240      0.162     -0.768      0.443        -0.441     0.193
Meat             0.0504      0.251      0.200      0.841        -0.442     0.543
Fillings         0.2697      0.305      0.883      0.377        -0.329     0.868
Meat:filling    -0.1489      0.200     -0.746      0.456        -0.540     0.242
Uniformity      -0.1037      0.149     -0.694      0.488        -0.397     0.189
Salsa           -0.0889      0.195     -0.455      0.649        -0.472     0.294
Synergy         -0.0872      0.314     -0.278      0.781        -0.702     0.528
Wrap             0.0851      0.134      0.636      0.525        -0.177     0.347
overall          0.1553      0.464      0.334      0.738        -0.755     1.065
================================================================================
const           0.006763
Hunger          0.888343
Cost            0.579169
Tortilla        0.124393
Temp            0.442655
Meat            0.841142
Fillings        0.377306
Meat:filling    0.455614
Uniformity      0.487682
Salsa           0.649001
Synergy         0.781186
Wrap            0.524682
overall         0.738118
dtype: float64
0.435001221995

In [68]:
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='Tortilla',y='Yelp',ax=ax,**{'s':40,'color':'k','alpha':.1})
plt.xlabel('Tortilla rating',size=20)
plt.ylabel('Yelp rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
plt.ylim((2,5))
print df.corr()['Yelp']['Tortilla']
from tools.misc import pearsonp
print pearsonp(df.corr()['Yelp']['Tortilla'],len(df[['Yelp','Tortilla']].dropna()))

figname = 'corr-Yelp-tortilla'
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')


0.369896428766
0.000120282868787

In [67]:
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='overall',y='Yelp',ax=ax,**{'s':40,'color':'k','alpha':.1})
plt.xlabel('Overall rating',size=20)
plt.ylabel('Yelp rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
plt.ylim((2,5))
print df.corr()['Yelp']['overall']
from tools.misc import pearsonp
print pearsonp(df.corr()['Yelp']['overall'],len(df[['Yelp','overall']].dropna()))

figname = 'corr-Yelp-overall'
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')


0.341804497442
0.000409857545661

In [49]:
# GLM for Google
mainDo = ['Hunger','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
dffull = df[np.hstack((mainDo,'Google'))].dropna()
X = sm.add_constant(dffull[mainDo])
y = dffull['Google']
import statsmodels.api as sm
my_glm = sm.GLM(y,X)
res = my_glm.fit()
print(res.summary())
print(res.pvalues)
print 1 - np.var(res.resid_pearson) / np.var(y)


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                 Google   No. Observations:                   86
Model:                            GLM   Df Residuals:                       73
Model Family:                Gaussian   Df Model:                           12
Link Function:               identity   Scale:                 0.0774221968148
Method:                          IRLS   Log-Likelihood:                -4.9668
Date:                Thu, 19 May 2016   Deviance:                       5.6518
Time:                        14:34:55   Pearson chi2:                     5.65
No. Iterations:                     4                                         
================================================================================
                   coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
const            3.3710      0.315     10.699      0.000         2.753     3.989
Hunger          -0.0064      0.038     -0.167      0.868        -0.081     0.069
Cost             0.0558      0.031      1.819      0.069        -0.004     0.116
Tortilla         0.1191      0.050      2.402      0.016         0.022     0.216
Temp            -0.0089      0.032     -0.275      0.784        -0.073     0.055
Meat            -0.0466      0.053     -0.885      0.376        -0.150     0.057
Fillings         0.0982      0.058      1.699      0.089        -0.015     0.212
Meat:filling     0.0216      0.037      0.587      0.557        -0.050     0.093
Uniformity      -0.0502      0.031     -1.614      0.106        -0.111     0.011
Salsa            0.0249      0.040      0.615      0.539        -0.054     0.104
Synergy          0.0118      0.058      0.203      0.839        -0.102     0.126
Wrap            -0.0059      0.028     -0.209      0.834        -0.062     0.050
overall         -0.0075      0.092     -0.082      0.935        -0.187     0.172
================================================================================
const           1.033212e-26
Hunger          8.676318e-01
Cost            6.895084e-02
Tortilla        1.629015e-02
Temp            7.835342e-01
Meat            3.761281e-01
Fillings        8.932973e-02
Meat:filling    5.568710e-01
Uniformity      1.064496e-01
Salsa           5.386564e-01
Synergy         8.391618e-01
Wrap            8.341880e-01
overall         9.347095e-01
dtype: float64
0.255929594631

Cali burritos vs. other burritos


In [106]:
# Identify california burritos
def caliburritoidx(x):
    import re
    idx = []
    for b in range(len(x)):
        re4str = re.compile('.*cali.*', re.IGNORECASE)
        if re4str.match(x[b]) is not None:
            idx.append(b)
    return idx

caliidx = caliburritoidx(df.Burrito)
Ncaliidx = np.arange(len(df))
Ncaliidx = np.delete(Ncaliidx,caliidx)

In [127]:
met_Cali = ['Hunger','Volume','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
for k in met_Cali:
    Mcali = df[k][caliidx].dropna()
    MNcali = df[k][Ncaliidx].dropna()
    print k
    print sp.stats.ttest_ind(Mcali,MNcali)


Hunger
Ttest_indResult(statistic=-0.54866656852575646, pvalue=0.58443327656113375)
Volume
Ttest_indResult(statistic=-1.181744265547761, pvalue=0.24761126388319155)
Cost
Ttest_indResult(statistic=0.37231937202870713, pvalue=0.71042753039024831)
Tortilla
Ttest_indResult(statistic=1.3534252950906041, pvalue=0.17891156228638563)
Temp
Ttest_indResult(statistic=2.0312156922031797, pvalue=0.044968980930911479)
Meat
Ttest_indResult(statistic=0.48418124016338993, pvalue=0.629305501608172)
Fillings
Ttest_indResult(statistic=0.44318178753735071, pvalue=0.65858208289853648)
Meat:filling
Ttest_indResult(statistic=-0.84491122336642899, pvalue=0.4001378956638737)
Uniformity
Ttest_indResult(statistic=-0.59884832788395115, pvalue=0.55061482575184373)
Salsa
Ttest_indResult(statistic=1.1872485701483237, pvalue=0.23818603397623547)
Synergy
Ttest_indResult(statistic=2.1762909303446163, pvalue=0.031887921544908597)
Wrap
Ttest_indResult(statistic=1.4829359548164081, pvalue=0.14123622153899251)
overall
Ttest_indResult(statistic=2.1782291207448741, pvalue=0.031692667848336825)

Independence of each dimension


In [146]:
df_Scott = df[df.Reviewer=='Scott']
idx_Scott = df2.index.values
idx_NScott = np.arange(len(df))
idx_NScott = np.delete(idx_NScott,idx_Scott)
burritos_Scott = df.loc[df2.index.values]['Burrito']

In [144]:
dfScorr = df_Scott.corr()

metricscorr = ['Yelp','Google','Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
M = len(metricscorr)

Mcorrmat = np.zeros((M,M))
Mpmat = np.zeros((M,M))
for m1 in range(M):
    for m2 in range(M):
        if m1 != m2:
            Mcorrmat[m1,m2] = dfcorr[metricscorr[m1]][metricscorr[m2]]
            Mpmat[m1,m2] = pearsonp(Mcorrmat[m1,m2],N)
            
clim1 = (-1,1)
plt.figure(figsize=(10,10))
cax = plt.pcolor(range(M+1), range(M+1), Mcorrmat, cmap=cm.bwr)
cbar = plt.colorbar(cax, ticks=(-1,-.5,0,.5,1))
cbar.ax.set_ylabel('Pearson correlation (r)', size=30)
plt.clim(clim1)
cbar.ax.set_yticklabels((-1,-.5,0,.5,1),size=20)
#plt.axis([2, M+1, floall[0],floall[-1]+10])
ax = plt.gca()
ax.set_yticks(np.arange(M)+.5)
ax.set_yticklabels(metricscorr,size=25)
ax.set_xticks(np.arange(M)+.5)
ax.set_xticklabels(metricscorr,size=9)
plt.tight_layout()



In [ ]:
# Try to argue that me sampling a bunch of burritos is equivalent to a bunch of people sampling burritos
# you would not be able to tell if a rated burrito was by me or someone else.

# Tests:
# 1. Means of each metric are the same
# 2. Metric correlations are the same (between each quality and overall)
# 3. Do I like Cali burritos more than other people?

In [147]:
# 1. Metric means are the same: I give my meat and meat:filling lower ratings
met_Scott = ['Hunger','Volume','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
for k in met_Scott:
    Msc = df[k][idx_Scott].dropna()
    MNsc = df[k][idx_NScott].dropna()
    print k
    print sp.stats.ttest_ind(Msc,MNsc)


Hunger
Ttest_indResult(statistic=-0.35515944428387269, pvalue=0.72320349253131666)
Volume
Ttest_indResult(statistic=0.32569543702126352, pvalue=0.74716554989761197)
Cost
Ttest_indResult(statistic=0.6418895400638277, pvalue=0.52238485954369307)
Tortilla
Ttest_indResult(statistic=-1.538189907305789, pvalue=0.12709866873400824)
Temp
Ttest_indResult(statistic=0.16871961167475821, pvalue=0.86636851309604335)
Meat
Ttest_indResult(statistic=-1.9864640647412954, pvalue=0.049691125740403012)
Fillings
Ttest_indResult(statistic=-1.6244282575689253, pvalue=0.10740127315147346)
Meat:filling
Ttest_indResult(statistic=-2.2264795472792414, pvalue=0.028181317840999365)
Uniformity
Ttest_indResult(statistic=-0.39351532077518508, pvalue=0.69476838917168504)
Salsa
Ttest_indResult(statistic=-0.18213800618467818, pvalue=0.85587515152365046)
Synergy
Ttest_indResult(statistic=-1.4317030200764307, pvalue=0.15534695808339646)
Wrap
Ttest_indResult(statistic=-1.3313416641041877, pvalue=0.18610371213303115)
overall
Ttest_indResult(statistic=-0.82675435352834792, pvalue=0.41030616909325002)