In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_style("white")
In [180]:
# Load the current burrito-ratings spreadsheet into a DataFrame.
filename = "burrito_current.csv"
df = pd.read_csv(filename)
N = len(df)  # total number of burritos rated
In [3]:
dfcorr = df.corr()
In [73]:
from tools.misc import pearsonp
# Metrics to include in the correlation matrix (overall rating last).
metricscorr = ['Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
M = len(metricscorr)
Mcorrmat = np.zeros((M,M))  # pairwise Pearson r; diagonal deliberately left at 0
Mpmat = np.zeros((M,M))     # corresponding p-values
for m1 in range(M):
    for m2 in range(M):
        if m1 != m2:
            Mcorrmat[m1,m2] = dfcorr[metricscorr[m1]][metricscorr[m2]]
            # NOTE(review): uses the full row count N for every pair, but
            # df.corr() is pairwise-complete, so the effective n per pair may
            # be smaller — p-values here are slightly optimistic; confirm.
            Mpmat[m1,m2] = pearsonp(Mcorrmat[m1,m2],N)
In [84]:
# Heatmap of the metric-by-metric Pearson correlation matrix.
from matplotlib import cm
clim1 = (-1,1)  # color limits: full Pearson r range
plt.figure(figsize=(12,10))
cax = plt.pcolor(range(M+1), range(M+1), Mcorrmat, cmap=cm.bwr)
cbar = plt.colorbar(cax, ticks=(-1,-.5,0,.5,1))
cbar.ax.set_ylabel('Pearson correlation (r)', size=30)
plt.clim(clim1)
cbar.ax.set_yticklabels((-1,-.5,0,.5,1),size=20)
#plt.axis([2, M+1, floall[0],floall[-1]+10])
ax = plt.gca()
# Center tick labels on each cell (offset by .5)
ax.set_yticks(np.arange(M)+.5)
ax.set_yticklabels(metricscorr,size=25)
ax.set_xticks(np.arange(M)+.5)
ax.set_xticklabels(metricscorr,size=25)
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.xlim((0,M))
plt.ylim((0,M))
figname = 'metriccorrmat'
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')
In [6]:
sp.stats.pearsonr(df.Hunger,df.overall)
Out[6]:
In [7]:
# 'Hunger' row of the metric correlation matrix: p-values, then r values
print Mpmat[0]
print Mcorrmat[0]
In [155]:
# Scatter of burrito volume vs. cost, with Pearson r and its p-value.
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='Cost',y='Volume',ax=ax,**{'s':40,'color':'k'})
plt.xlabel('Cost ($)',size=20)
plt.ylabel('Volume (L)',size=20)
plt.xticks(np.arange(5,11),size=15)
plt.yticks(np.arange(.6,1.2,.1),size=15)
plt.tight_layout()
print df.corr()['Cost']['Volume']
from tools.misc import pearsonp
# p-value uses the pairwise-complete sample size for these two columns
print pearsonp(df.corr()['Cost']['Volume'],len(df[['Cost','Volume']].dropna()))
figname = 'corr-volume-cost'
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')
In [9]:
# Visualize some correlations
from tools.plt import scatt_corr
# overall rating vs. meat rating
scatt_corr(df['overall'].values,df['Meat'].values,
           xlabel = 'overall rating', ylabel='meat rating', xlim = (-.5,5.5),ylim = (-.5,5.5),xticks=range(6),yticks=range(6))
#showline = True)
# overall rating vs. wrap integrity rating
scatt_corr(df['overall'].values,df['Wrap'].values,
           xlabel = 'overall rating', ylabel='wrap integrity rating', xlim = (-.5,5.5),ylim = (-.5,5.5),xticks=range(6),yticks=range(6))
#showline = True)
In [10]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# Get all ingredient keys
startingredients = 29  # index of the first ingredient column in the spreadsheet
ingredientkeys = df.keys()[startingredients:]
# Keep only ingredients present in at least Nlim burritos
Nlim = 10
ingredientkeys = ingredientkeys[df.count()[startingredients:].values>=Nlim]
# Make a binary (0/1) presence matrix for the retained ingredients.
# BUGFIX: .copy() so the assignments below modify dfing itself, not a view of
# df (the original chained assignment triggers SettingWithCopyWarning and can
# silently fail or mutate df).
dfing = df[ingredientkeys].copy()
for k in dfing.keys():
    # spreadsheet marks presence with 'x'/'X' (or 1); anything else -> 0
    dfing[k] = dfing[k].map({'x':1,'X':1,1:1}).fillna(0)
In [21]:
import statsmodels.api as sm
# GLM (default Gaussian family -> ordinary linear regression) predicting the
# overall rating from the binary ingredient indicators.
X = sm.add_constant(dfing)
y = df.overall
lm = sm.GLM(y,X)
res = lm.fit()
print(res.summary())
# Pseudo-R2: fraction of variance in overall rating explained by the model
origR2 = 1 - np.var(res.resid_pearson) / np.var(y)
print origR2
In [22]:
# Surrogate control: fit the same GLM to Nsurr random design matrices to get a
# null distribution of R2; the p-value is the fraction of surrogate fits that
# explain at least as much variance as the real ingredient matrix.
# NOTE(review): no random seed is set, so the p-value varies between runs.
Nsurr = 1000
randr2 = np.zeros(Nsurr)
for n in range(Nsurr):
    Xrand = np.random.rand(X.shape[0],X.shape[1])
    Xrand[:,0] = np.ones(X.shape[0])  # keep the intercept column
    lm = sm.GLM(y,Xrand)
    res = lm.fit()
    randr2[n] = 1 - np.var(res.resid_pearson) / np.var(y)
print 'p = ' , np.mean(randr2>origR2)
In [23]:
# Is this a null result? let's do t-tests
# For each ingredient: unpaired t-test of overall rating, burritos with vs.
# without that ingredient.
# NOTE(review): many tests, no multiple-comparison correction applied.
for k in dfing.keys():
    withk = df.overall[dfing[k].values==1].values
    nok = df.overall[dfing[k].values==0].values
    print k
    print sp.stats.ttest_ind(withk,nok)
In [61]:
# Scatter of non-meat filling rating vs. meat rating (all reviews),
# with Pearson r and its p-value.
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='Meat',y='Fillings',ax=ax,**{'s':40,'color':'k','alpha':.1})
plt.xlabel('Meat rating',size=20)
plt.ylabel('Non-meat rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
print df.corr()['Meat']['Fillings']
from tools.misc import pearsonp
print pearsonp(df.corr()['Meat']['Fillings'],len(df[['Meat','Fillings']].dropna()))
figname = 'corr-meat-filling'
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')
In [170]:
# How many burritos at taco stand?
restrictCali = False
import re
reTS = re.compile('.*taco stand.*', re.IGNORECASE)
reCali = re.compile('.*cali.*', re.IGNORECASE)
locTS = np.ones(len(df))
for i in range(len(df)):
mat = reTS.match(df['Location'][i])
if mat is None:
locTS[i] = 0
else:
if restrictCali:
mat = reCali.match(df['Burrito'][i])
if mat is None:
locTS[i] = 0
print sum(locTS)
temp = np.arange(len(df))
dfTS = df.loc[temp[locTS==1]]
In [171]:
# Same meat-vs-fillings scatter, restricted to the Taco Stand subset.
plt.figure(figsize=(4,4))
ax = plt.gca()
dfTS.plot(kind='scatter',x='Meat',y='Fillings',ax=ax,**{'s':40,'color':'k','alpha':.1})
plt.xlabel('Meat rating',size=20)
plt.ylabel('Non-meat rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
print dfTS.corr()['Meat']['Fillings']
from tools.misc import pearsonp
print pearsonp(dfTS.corr()['Meat']['Fillings'],len(dfTS[['Meat','Fillings']].dropna()))
figname = 'corr-meat-filling-TS'
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')
In [172]:
# Spearman correlation
# Rank correlation between meat and fillings ratings: all reviews vs. the
# Taco Stand subset.
dfMF = df[['Meat','Fillings']].dropna()
dfTSMF = dfTS[['Meat','Fillings']].dropna()
print sp.stats.spearmanr(dfMF.Meat,dfMF.Fillings)
print sp.stats.spearmanr(dfTSMF.Meat,dfTSMF.Fillings)
In [63]:
# Scatter of overall rating vs. hunger level, with Pearson r and p-value.
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='Hunger',y='overall',ax=ax,**{'s':40,'color':'k'})
plt.xlabel('Hunger',size=20)
plt.ylabel('Overall rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
print df.corr()['Hunger']['overall']
from tools.misc import pearsonp
print pearsonp(df.corr()['Hunger']['overall'],len(df[['Hunger','overall']].dropna()))
figname = 'corr-hunger-overall'
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')
In [159]:
# GLM for
# Remove NANs
mainD = ['Hunger','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
'Uniformity','Salsa','Wrap']
dffull = df[np.hstack((mainD,'overall'))].dropna()
X = sm.add_constant(dffull[mainD])
y = dffull['overall']
import statsmodels.api as sm
my_glm = sm.GLM(y,X)
res = my_glm.fit()
print(res.summary())
print 1 - np.var(res.resid_pearson) / np.var(y)
In [150]:
# Linear regression
# Note that this matches GLM above :D
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X,y)
print lm.intercept_
print lm.coef_
print 'R2 = ' + np.str(lm.score(X,y))
In [153]:
# Visualize coefficients
from tools.plt import bar
# Sort coefficients in descending order, then drop index 0 (the intercept).
newidx = np.argsort(-res.params.values)
temp = np.arange(len(newidx))
newidx = np.delete(newidx,temp[newidx==0])
# Bar plot of coefficients with standard-error bars.
bar(res.params[newidx],res.bse[newidx],X.keys()[newidx],'Overall rating\nLinear model\ncoefficient',
    ylim =(0,.5),figsize=(11,3))
plt.plot()
figname = 'overall_metric_linearmodelcoef'
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')
In [45]:
sp.stats.pearsonr(X['Synergy'],y)[0]**2
Out[45]:
In [233]:
# Average each metric over each Location
# Avoid case issues; in the future should avoid article issues
df.Location = df.Location.str.lower()
m_Location = ['Location','N','Yelp','Google','Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
              'Uniformity','Salsa','Synergy','Wrap','overall']
tacoshops = df.Location.unique()
TS = len(tacoshops)
# One row per location: N = review count, remaining columns = metric means.
dfmean = pd.DataFrame(np.nan, index=range(TS), columns=m_Location)
dfmean.Location = tacoshops
for ts in range(TS):
    # BUGFIX: .loc assignment instead of chained dfmean['N'][ts] = ... —
    # chained indexing assigns through a temporary and triggers
    # SettingWithCopyWarning (and can silently fail to write).
    dfmean.loc[ts, 'N'] = sum(df.Location == tacoshops[ts])
    for m in m_Location[2:]:
        dfmean.loc[ts, m] = df[m].loc[df.Location==tacoshops[ts]].mean()
In [237]:
# Correlation matrix of the location-averaged metrics (one row per taco shop).
metricscorr = ['Yelp','Google','Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
M = len(metricscorr)
dfmeancorr = dfmean.corr()
Mcorrmat = np.zeros((M,M))  # pairwise Pearson r; diagonal left at 0
Mpmat = np.zeros((M,M))     # corresponding p-values
for m1 in range(M):
    for m2 in range(M):
        if m1 != m2:
            Mcorrmat[m1,m2] = dfmeancorr[metricscorr[m1]][metricscorr[m2]]
            # BUGFIX: the sample size here is the number of locations (TS),
            # not the number of burritos (N) — dfmean has one row per
            # location, so using N made every p-value far too small.
            Mpmat[m1,m2] = pearsonp(Mcorrmat[m1,m2],TS)
# Heatmap of the location-level correlation matrix.
clim1 = (-1,1)
plt.figure(figsize=(10,10))
cax = plt.pcolor(range(M+1), range(M+1), Mcorrmat, cmap=cm.bwr)
cbar = plt.colorbar(cax, ticks=(-1,-.5,0,.5,1))
cbar.ax.set_ylabel('Pearson correlation (r)', size=30)
plt.clim(clim1)
cbar.ax.set_yticklabels((-1,-.5,0,.5,1),size=20)
ax = plt.gca()
ax.set_yticks(np.arange(M)+.5)
ax.set_yticklabels(metricscorr,size=25)
ax.set_xticks(np.arange(M)+.5)
ax.set_xticklabels(metricscorr,size=9)
plt.tight_layout()
In [174]:
# 'Yelp' row of the location-averaged correlation matrix: r values, then p-values
print Mcorrmat[0]
print Mpmat[0]
In [238]:
# GLM for Yelp
mainDo = ['Hunger','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
'Uniformity','Salsa','Synergy','Wrap','overall']
dffull = dfmean[np.hstack((mainDo,'Yelp'))].dropna()
X = sm.add_constant(dffull[mainDo])
y = dffull['Yelp']
import statsmodels.api as sm
my_glm = sm.GLM(y,X)
res = my_glm.fit()
print(res.summary())
print(res.pvalues)
print 1 - np.var(res.resid_pearson) / np.var(y)
In [68]:
# Scatter of Yelp rating vs. tortilla rating, with Pearson r and p-value.
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='Tortilla',y='Yelp',ax=ax,**{'s':40,'color':'k','alpha':.1})
plt.xlabel('Tortilla rating',size=20)
plt.ylabel('Yelp rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
plt.ylim((2,5))
print df.corr()['Yelp']['Tortilla']
from tools.misc import pearsonp
print pearsonp(df.corr()['Yelp']['Tortilla'],len(df[['Yelp','Tortilla']].dropna()))
figname = 'corr-Yelp-tortilla'
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')
In [67]:
# Scatter of Yelp rating vs. overall burrito rating, with Pearson r and p-value.
plt.figure(figsize=(4,4))
ax = plt.gca()
df.plot(kind='scatter',x='overall',y='Yelp',ax=ax,**{'s':40,'color':'k','alpha':.1})
plt.xlabel('Overall rating',size=20)
plt.ylabel('Yelp rating',size=20)
plt.xticks(np.arange(0,6),size=15)
plt.yticks(np.arange(0,6),size=15)
plt.ylim((2,5))
print df.corr()['Yelp']['overall']
from tools.misc import pearsonp
print pearsonp(df.corr()['Yelp']['overall'],len(df[['Yelp','overall']].dropna()))
figname = 'corr-Yelp-overall'
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine
plt.savefig('C:/Users/Scott/Google Drive/qwm/burritos/figs/'+figname + '.png')
In [49]:
# GLM for Google
mainDo = ['Hunger','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
'Uniformity','Salsa','Synergy','Wrap','overall']
dffull = df[np.hstack((mainDo,'Google'))].dropna()
X = sm.add_constant(dffull[mainDo])
y = dffull['Google']
import statsmodels.api as sm
my_glm = sm.GLM(y,X)
res = my_glm.fit()
print(res.summary())
print(res.pvalues)
print 1 - np.var(res.resid_pearson) / np.var(y)
In [106]:
# Identify california burritos
def caliburritoidx(x):
    """Return the positions of entries in x containing 'cali' (case-insensitive).

    Parameters
    ----------
    x : sequence of str
        Burrito names, indexable by 0..len(x)-1 (e.g. a list or a
        default-indexed pandas Series).

    Returns
    -------
    list of int
        Positions whose name matches 'cali' anywhere in the string.
    """
    import re
    # Hoisted out of the loop: the original recompiled this regex on every
    # iteration. Same pattern and match semantics as before.
    re4str = re.compile('.*cali.*', re.IGNORECASE)
    return [b for b in range(len(x)) if re4str.match(x[b]) is not None]
# Row indices of California burritos, and the complement (all other rows).
caliidx = caliburritoidx(df.Burrito)
Ncaliidx = np.setdiff1d(np.arange(len(df)), caliidx)
In [127]:
# Unpaired t-tests: each metric for California burritos vs. all other burritos.
met_Cali = ['Hunger','Volume','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
            'Uniformity','Salsa','Synergy','Wrap','overall']
for k in met_Cali:
    Mcali = df[k][caliidx].dropna()
    MNcali = df[k][Ncaliidx].dropna()
    print k
    print sp.stats.ttest_ind(Mcali,MNcali)
In [146]:
# Split the reviews into Scott's vs. everyone else's.
df_Scott = df[df.Reviewer=='Scott']
# BUGFIX: the original referenced 'df2', which is never defined anywhere in
# this notebook (NameError on a fresh run); the intended frame is df_Scott.
idx_Scott = df_Scott.index.values
idx_NScott = np.arange(len(df))
idx_NScott = np.delete(idx_NScott,idx_Scott)
burritos_Scott = df_Scott['Burrito']
In [144]:
# Correlation matrix restricted to Scott's reviews.
dfScorr = df_Scott.corr()
NS = len(df_Scott)  # sample size for the p-values: Scott's review count
metricscorr = ['Yelp','Google','Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
               'Uniformity','Salsa','Synergy','Wrap','overall']
M = len(metricscorr)
Mcorrmat = np.zeros((M,M))
Mpmat = np.zeros((M,M))
for m1 in range(M):
    for m2 in range(M):
        if m1 != m2:
            # BUGFIX: the original read dfcorr (the all-reviewer matrix), so
            # dfScorr was computed but never used and the plot showed the
            # wrong data.
            Mcorrmat[m1,m2] = dfScorr[metricscorr[m1]][metricscorr[m2]]
            # BUGFIX: p-values must use the Scott-only sample size, not the
            # full dataset's N.
            Mpmat[m1,m2] = pearsonp(Mcorrmat[m1,m2],NS)
# Heatmap of Scott's correlation matrix.
clim1 = (-1,1)
plt.figure(figsize=(10,10))
cax = plt.pcolor(range(M+1), range(M+1), Mcorrmat, cmap=cm.bwr)
cbar = plt.colorbar(cax, ticks=(-1,-.5,0,.5,1))
cbar.ax.set_ylabel('Pearson correlation (r)', size=30)
plt.clim(clim1)
cbar.ax.set_yticklabels((-1,-.5,0,.5,1),size=20)
ax = plt.gca()
ax.set_yticks(np.arange(M)+.5)
ax.set_yticklabels(metricscorr,size=25)
ax.set_xticks(np.arange(M)+.5)
ax.set_xticklabels(metricscorr,size=9)
plt.tight_layout()
In [ ]:
# Try to argue that my sampling a bunch of burritos is equivalent to a bunch of
# people each sampling burritos: you should not be able to tell whether a given
# rating came from me or from someone else.
# Tests:
# 1. Means of each metric are the same
# 2. Metric correlations are the same (between each quality and overall)
# 3. Do I like Cali burritos more than other people do?
In [147]:
# 1. Metric means are the same: I give my meat and meat:filling lower ratings
# Unpaired t-tests: Scott's ratings vs. everyone else's, per metric.
# NOTE(review): idx_Scott comes from a cell with an undefined 'df2' reference —
# confirm it holds the intended Scott row indices before trusting these tests.
met_Scott = ['Hunger','Volume','Cost','Tortilla','Temp','Meat','Fillings','Meat:filling',
             'Uniformity','Salsa','Synergy','Wrap','overall']
for k in met_Scott:
    Msc = df[k][idx_Scott].dropna()
    MNsc = df[k][idx_NScott].dropna()
    print k
    print sp.stats.ttest_ind(Msc,MNsc)