Scott Cole
21 May 2016
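This notebook characterizes the collection of San Diego burrito reviews and their reviewers, including:

- counts of burritos, restaurants, and reviewers
- the number of burritos of each type
- the cumulative number of burritos rated over time
- distributions of hunger level, cost, volume, and each rating dimension
- overall recommendations
- a principal component analysis of the rating dimensions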
In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_style("white")
In [3]:
# Load the burrito review table through the project-local helper module.
import util

df = util.load_burritos()
N = len(df)  # row count of the loaded table
df.head()
In [6]:
# Headline counts for the dataset.
print('Number of burritos:', df.shape[0])
print('Number of restaurants:', len(df.Location.unique()))
print('Number of reviewers:', len(df.Reviewer.unique()))
print('Number of reviews by Scott:', df.Reviewer.value_counts()['Scott'])

# Index of the first row seen for each distinct Location, i.e. one row per restaurant.
uniqlocidx = df.Location.drop_duplicates().index
# A restaurant whose Chips entry is missing on that row is counted as "no free chips".
# The builtin float replaces np.float, an alias that was removed in NumPy 1.24.
print('Percentage of taco shops with free chips:', np.round(100 - 100*df.Chips[uniqlocidx].isnull().sum()/float(len(df.Location.unique())),1))
In [8]:
# How many burritos each reviewer has rated
df.Reviewer.value_counts()
Out[8]:
In [31]:
# Number of each type of burrito
def burritotypes(x, types = {'California':'cali', 'Carnitas':'carnita', 'Carne asada':'carne asada',
                             'Chicken':'chicken', 'Surf & Turf':'surf.*turf', 'Adobada':'adobad', 'Al Pastor':'pastor'}):
    """Count burritos by type by matching their names against regex fragments.

    Parameters
    ----------
    x : iterable
        Burrito names, one per burrito.
    types : dict
        Maps a type label to a regular-expression fragment. Matching is
        case-insensitive. A name is assigned to the first label, in the
        dict's iteration order, whose fragment occurs in it.

    Returns
    -------
    dict
        Maps each label that occurred at least once to its count. Names that
        match no fragment, and entries that are not text (e.g. NaN), are
        counted under 'other'. Keys are inserted in order of first occurrence.
    """
    import re

    # Compile every pattern once instead of once per (burrito, type) pair.
    patterns = [(label, re.compile('.*' + fragment + '.*', re.IGNORECASE))
                for label, fragment in types.items()]

    Nmatches = {}
    for b in x:
        label = 'other'
        try:
            for candidate, pattern in patterns:
                if pattern.match(b) is not None:
                    label = candidate
                    break  # first matching type wins
        except TypeError:
            # Non-text entry such as a missing name: leave it as 'other'.
            pass
        Nmatches[label] = Nmatches.get(label, 0) + 1
    return Nmatches
typecounts = burritotypes(df.Burrito)

plt.figure(figsize=(7,7))
ax = plt.axes([0.1, 0.1, 0.65, 0.65])

# The slices will be ordered and plotted counter-clockwise,
# starting on the positive x-axis (startangle=0).
labels = list(typecounts.keys())
fracs = np.array(list(typecounts.values()))
explode = [.1]*len(typecounts)
# autopct is handed each slice's percentage; scale it back to the raw count for the label.
patches, texts, autotexts = plt.pie(fracs, explode=explode, labels=labels,
                autopct=lambda p: '{:.0f}'.format(p * np.sum(fracs) / 100), shadow=False, startangle=0)
plt.title('Types of burritos',size=30)
for t in texts:
    t.set_size(20)
for t in autotexts:
    t.set_size(20)
# Count labels on slices 0 and 6 are drawn in white (presumably those wedges are
# dark in the default colour cycle -- confirm). Indices beyond the number of
# slices are skipped so fewer than seven categories no longer raises IndexError.
for t in [autotexts[i] for i in (0, 6) if i < len(autotexts)]:
    t.set_color('w')

figname = 'burritotypes'
plt.savefig('/gh/fig/burrito/'+figname + '.png')
In [35]:
# Time series of ratings
import math
def dates2ts(dates):
    """Convert 'month/day/year' date strings to day offsets from 1 Jan 2016.

    Parameters
    ----------
    dates : iterable of str
        Dates formatted as '%m/%d/%Y', e.g. '5/21/2016'.

    Returns
    -------
    numpy.ndarray of int
        Whole days between 1/1/2016 and each date, in input order. Dates
        before 2016 give negative offsets.

    Raises
    ------
    ValueError
        If a string does not match the '%m/%d/%Y' format.
    """
    from datetime import datetime
    start = datetime.strptime('1/1/2016','%m/%d/%Y')
    # Iterate over the values of the argument itself. The previous version
    # looked up the global df.Date by label and ignored `dates`.
    return np.array([(datetime.strptime(d,'%m/%d/%Y') - start).days for d in dates],
                    dtype=int)
def cumburritos(days):
    """Cumulative number of burritos rated on or before each day.

    Parameters
    ----------
    days : array-like of int
        Day offset of each review, in any order.

    Returns
    -------
    t : numpy.ndarray
        Consecutive days 0, 1, ..., max(days). Empty if `days` is empty.
    counts : numpy.ndarray of float
        For each entry of `t`, how many entries of `days` are <= that day.
    """
    days = np.sort(np.asarray(days))
    if days.size == 0:
        return np.arange(0), np.zeros(0)
    # After sorting, the last element is the maximum, so the time axis no
    # longer depends on the reviews arriving in chronological order.
    t = np.arange(days[-1]+1)
    # side='right' counts every review dated on or before each day.
    return t, np.searchsorted(days, t, side='right').astype(float)
def datelabels(startdate = '1/1/2016', M = 12):
    """Build first-of-month tick labels and their day offsets from `startdate`.

    Parameters
    ----------
    startdate : str
        Reference date formatted as '%m/%d/%Y'. Offsets are measured from it,
        and the labelled months begin in January of its year.
    M : int
        Number of consecutive months to label.

    Returns
    -------
    datestrs : list of str
        Labels of the form 'month/1', e.g. '1/1', '2/1', ...
    ts : numpy.ndarray of float
        Days between `startdate` and the first of each labelled month.
    """
    from datetime import datetime
    start = datetime.strptime(startdate,'%m/%d/%Y')
    datestrs = []
    ts = np.zeros(M)
    for m in range(M):
        # Take the year from startdate instead of a hard-coded 2016, and roll
        # over into the following year(s) once m passes December.
        year = start.year + m // 12
        month = m % 12 + 1
        datestrs.append(str(month) + '/1')
        ts[m] = (datetime(year, month, 1) - start).days
    return datestrs, ts
# Day offset of every review, the running total per day, and monthly tick positions
burrdays = dates2ts(df.Date)
t, burrcdf = cumburritos(burrdays)
datestrs, datets = datelabels()

# Cumulative count of rated burritos as a black line, with one x tick per month
plt.figure(figsize=(5, 5))
plt.plot(t, burrcdf, 'k-')
plt.xlabel('Date (2016)', size=20)
plt.ylabel('# burritos rated', size=15)
plt.xticks(datets, datestrs, size=10, rotation='vertical')
plt.yticks(size=10)
plt.tight_layout()

figname = 'burritoprogress'
plt.savefig('/gh/fig/burrito/' + figname + '.png')
In [37]:
# Histogram of reported hunger level
plt.figure(figsize=(4, 4))
# Bins are 0.5 wide and centred on 0, 0.5, ..., 5
n, _, _ = plt.hist(df.Hunger.dropna(), bins=np.arange(-.25, 5.5, .5), color='k')
plt.xlabel('Hunger level', size=20)
plt.xticks(np.arange(0, 5.5, .5), size=10)
plt.xlim((-.25, 5.25))
plt.ylabel('Count', size=20)
# Label the y-axis only at zero and at the tallest bin's count rounded up to a multiple of 5
plt.yticks((0, 5 * int(math.ceil(n.max() / 5.))), size=10)
plt.tight_layout()

figname = 'hungerleveldist'
plt.savefig('/gh/fig/burrito/' + figname + '.png')
In [39]:
# Histogram of burrito cost, followed by the mean cost
plt.figure(figsize=(4, 4))
# Bins are $0.50 wide, with edges from $4 to $10
n, _, _ = plt.hist(df.Cost.dropna(), bins=np.arange(4, 10.25, .5), color='k')
plt.xlabel('Cost ($)', size=20)
plt.xticks(np.arange(4, 11, 1), size=15)
plt.xlim((4, 10))
plt.ylabel('Count', size=20)
# Label the y-axis only at zero and at the tallest bin's count rounded up to a multiple of 5
plt.yticks((0, 5 * int(math.ceil(n.max() / 5.))), size=15)
plt.tight_layout()

figname = 'costdist'
plt.savefig('/gh/fig/burrito/' + figname + '.png')

# Mean cost, ignoring missing entries
print(np.nanmean(df.Cost))
In [40]:
# Histogram of burrito volume, followed by the mean volume
plt.figure(figsize=(5, 5))
# Bins are 0.05 L wide, starting at 0.5 L
n, _, _ = plt.hist(df.Volume.dropna(), bins=np.arange(0.5, 1.3, .05), color='k')
plt.xlabel('Volume (L)', size=20)
plt.xticks(np.arange(0.5, 1.3, .1), size=15)
plt.xlim((0.5, 1.2))
plt.ylabel('Count', size=20)
# Label the y-axis only at zero and at the tallest bin's count rounded up to a multiple of 5
plt.yticks((0, 5 * int(math.ceil(n.max() / 5.))), size=15)
plt.tight_layout()

figname = 'volumedist'
plt.savefig('/gh/fig/burrito/' + figname + '.png')

print(np.mean(df.Volume))
In [41]:
def metrichist(metricname):
    """Draw and save a histogram of one rating column of the global `df`.

    Parameters
    ----------
    metricname : str
        Column of `df` to plot. Missing values are dropped.

    The figure is written to '/gh/fig/burrito/<metricname>dist.png'. The
    column 'Meat:filling' is saved under the name 'meattofilling' instead.
    """
    plt.figure(figsize=(5, 5))
    # Bins are 0.5 wide and centred on 0, 0.5, ..., 5
    halfpoint_edges = np.arange(-.25, 5.5, .5)
    counts, _, _ = plt.hist(df[metricname].dropna(), halfpoint_edges, color='k')

    plt.xlabel(metricname + ' rating', size=20)
    plt.xticks(np.arange(0, 5.5, .5), size=15)
    plt.xlim((-.25, 5.25))

    plt.ylabel('Count', size=20)
    # Label the y-axis only at zero and at the tallest bin's count rounded up to a multiple of 5
    top_tick = 5 * int(math.ceil(np.max(counts) / 5.))
    plt.yticks((0, top_tick), size=15)
    plt.tight_layout()

    # The one column name containing ':' gets a substitute file stem
    stem = 'meattofilling' if metricname == 'Meat:filling' else metricname
    plt.savefig('/gh/fig/burrito/' + (stem + 'dist') + '.png')
In [42]:
# Rating columns to plot; one histogram is drawn and saved per column
m_Hist = [
    'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling',
    'Uniformity', 'Salsa', 'Synergy', 'Wrap', 'overall',
]
for m in m_Hist:
    metrichist(m)
In [44]:
# Pie chart of the Yes/No answers in the Rec column
plt.figure(figsize=(6, 6))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])

labels = ['Yes', 'No']
# Number of reviews giving each answer
fracs = np.array([np.sum(df.Rec == answer) for answer in labels])
explode = [.01 for _ in labels]

# Slices are laid out counter-clockwise; startangle=90 rotates the start
# of the first slice from the positive x-axis to the positive y-axis.
# autopct is handed each slice's percentage, scaled back here to the raw count.
patches, texts, autotexts = plt.pie(
    fracs, explode=explode, labels=labels,
    autopct=lambda p: '{:.0f}'.format(p * np.sum(fracs) / 100),
    shadow=False, startangle=90)
plt.title('Would you recommend this burrito?', size=30)

for t in texts:
    t.set_size(20)
# Both count labels are enlarged and drawn in white
for t in autotexts:
    t.set_size(30)
    t.set_color('w')

figname = 'recspie'
plt.savefig('/gh/fig/burrito/' + figname + '.png')
In [45]:
# Columns that enter the principal component analysis
dfpca = df.loc[:, ['Volume', 'Tortilla', 'Temp', 'Meat', 'Fillings',
                   'Meat:filling', 'Uniformity', 'Salsa', 'Synergy', 'Wrap']]
In [46]:
# Replace each missing entry with the mean of its column
dfpca = dfpca.fillna(value=dfpca.mean())
In [47]:
# Z-score every column: subtract its mean, then divide by its standard deviation
dfpca = dfpca.sub(dfpca.mean()).div(dfpca.std())
dfpca
Out[47]:
In [48]:
# Color: Taco Stand, Lucha, Los Primos
In [50]:
from sklearn.decomposition import PCA

# Fit a two-component PCA to the normalized table, then show the component
# vectors and the fraction of variance each one explains
pca = PCA(n_components=2).fit(dfpca)
print(pca.components_)
print(pca.explained_variance_ratio_)
In [52]:
# Project every review onto the two components; row 0 is PC 1 and row 1 is PC 2
dfpca_proj = np.dot(pca.components_, dfpca.values.T)
In [55]:
# PC 1 coordinates of the reviews whose Location is 'taco stand'
dfpca_proj[0][(df.Location == 'taco stand').values]
Out[55]:
In [56]:
# Quick look: every review in the plane of the two components
plt.plot(dfpca_proj[0, :], dfpca_proj[1, :], 'k.')
Out[56]:
In [57]:
plt.figure(figsize=(8,8))

# Three highlighted restaurants, each with its own marker shape and size
shops = ['taco stand','lucha libre north park','los primos mexican food']
shops_marker = ['*','^','s']
shops_ms = [20,12,12]
# Overall-score bands [cutoff, next cutoff), one colour per band
overallcutoffs = [-.1, 3, 4, 5.1]
overallcolors = ['r','k','g']

for o in range(len(overallcolors)):
    # Reviews whose overall score falls inside the current band
    orange = (df.overall >= overallcutoffs[o]) & (df.overall < overallcutoffs[o+1])

    # Reviews in this band from restaurants other than the highlighted three: faint dots
    notshops = np.where(~df.Location.isin(shops) & orange)
    plt.plot(dfpca_proj[0][notshops], dfpca_proj[1][notshops],
             '.', color=overallcolors[o], alpha=.5, ms=20)

    # Reviews in this band from each highlighted restaurant: that shop's marker
    for s in range(len(shops)):
        burridx = np.where((df.Location == shops[s]) & orange)
        plt.plot(dfpca_proj[0][burridx], dfpca_proj[1][burridx],
                 shops_marker[s], color=overallcolors[o], ms=shops_ms[s], label = shops[s])

plt.xlim((-8,4.5))
plt.ylim((-3,4))
plt.xlabel('PC 1',size=20)
plt.ylabel('PC 2',size=20)
plt.xticks([])
plt.yticks([])
plt.legend(loc='best')
Out[57]:
In [59]:
# Meat vs. salsa rating for three restaurants: colour identifies the shop,
# marker shape and size identify the overall-score band.
shopsalpha = [.2,.2,.2]
shops = ['taco stand','lucha libre north park','los primos mexican food']
overall_marker = ['v','.','*']
overall_ms = [12,25,20]
overallcutoffs = [-.1, 3, 4, 5.1]
shopscolors = ['g','b','r']

plt.figure(figsize=(8,8))
# Loop over this cell's own overall_marker list (one marker per band) rather
# than overallcolors, which exists only if an earlier cell has been run.
for o in range(len(overall_marker)):
    # Reviews whose overall score lies in [cutoffs[o], cutoffs[o+1])
    orange = np.logical_and(df.overall>=overallcutoffs[o],df.overall<overallcutoffs[o+1])
    for s in range(len(shops)):
        # np.where returns row positions, so select with .iloc, not by label
        burridx = np.where(np.logical_and(df.Location==shops[s],orange))[0]
        plt.plot(df.Meat.iloc[burridx],df.Salsa.iloc[burridx],
                 overall_marker[o],color=shopscolors[s],ms=overall_ms[o],alpha=shopsalpha[s],label=shops[s])

plt.xlim((0,5.5))
plt.ylim((0,5.5))
plt.xlabel('Meat flavor',size=20)
plt.ylabel('Salsa flavor',size=20)
plt.xticks(np.arange(1,6),size=20)
plt.yticks(np.arange(1,6),size=20)
plt.legend(loc='best',fontsize=12)
plt.savefig('/gh/fig/burrito/superscatter.png')
In [ ]: