Scott Cole
1 July 2016
This notebook applies nonlinear techniques to analyze the contributions of burrito dimensions to the overall burrito rating.
NOTE: A neural network is not recommended because we should have 30x as many examples as weights. For a 3-layer neural network with 4 nodes in each of the first 2 layers and 1 in the last layer, that would be 4×4 + 4×1 = 20 weights, so we would need 600 burritos. One option would be to artificially create data.
In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import pandasql
import seaborn as sns
sns.set_style("white")
In [2]:
# Load the burrito dataset via the project-local util module.
import util
df = util.load_burritos()  # presumably one row per burrito review — TODO confirm against util.load_burritos
N = df.shape[0]  # number of rows (burrito entries) in the dataset
In [3]:
def vitalness(df, dim, rating_cutoff=2,
              metrics=['Hunger', 'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling',
                       'Uniformity', 'Salsa', 'Wrap']):
    """Test whether a LOW score on one dimension drags down the overall rating.

    Fits a GLM predicting 'overall' from all `metrics`, then runs a paired
    t-test between actual and GLM-predicted overall ratings, restricted to
    burritos whose `dim` score is <= `rating_cutoff`.

    Parameters
    ----------
    df : pandas.DataFrame
        Burrito data; must contain every column in `metrics` plus 'overall'.
    dim : str
        Column name of the dimension under examination.
    rating_cutoff : float, optional
        Keep only rows where df[dim] <= rating_cutoff (default 2).
    metrics : list of str, optional
        Predictor columns for the GLM. The default now uses the actual
        'Meat:filling' column name (the previous default said 'Meatfilling',
        which is not a column in the data, so the default was unusable).

    Returns
    -------
    scipy.stats t-test result
        Paired t-test of actual vs. predicted overall ratings on the
        low-scoring subset.
    """
    # Fit a GLM on all complete rows to get predicted overall ratings.
    # .copy() prevents SettingWithCopyWarning when adding 'overallpred' below.
    dffull = df[np.hstack((metrics, 'overall'))].dropna().copy()
    X = sm.add_constant(dffull[metrics])
    y = dffull['overall']
    res = sm.GLM(y, X).fit()
    dffull['overallpred'] = res.fittedvalues

    # Plain pandas boolean indexing replaces the old pandasql query, which
    # required renaming 'Meat:filling' and building SQL with np.str (an alias
    # removed from numpy >= 1.24).
    low = dffull[dffull[dim] <= rating_cutoff]

    # Paired t-test: does the actual overall differ from the GLM prediction
    # on these low-scoring burritos?
    return sp.stats.ttest_rel(low.overall, low.overallpred)
In [4]:
vital_metrics = ['Hunger', 'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling',
                 'Uniformity', 'Salsa', 'Wrap']
for metric in vital_metrics:
    # print() with a single argument is valid in both Python 2 and 3.
    print(metric)
    # NOTE(review): 'Volume' is not in vital_metrics, so this branch is
    # currently dead; kept so the special cutoff applies if Volume is added.
    if metric == 'Volume':
        rating_cutoff = .7
    else:
        rating_cutoff = 1
    print(vitalness(df, metric, rating_cutoff=rating_cutoff, metrics=vital_metrics))
In [5]:
def savior(df, dim, rating_cutoff=2,
           metrics=['Hunger', 'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling',
                    'Uniformity', 'Salsa', 'Wrap']):
    """Test whether a HIGH score on one dimension lifts the overall rating.

    Mirror image of `vitalness`: fits a GLM predicting 'overall' from all
    `metrics`, then runs a paired t-test between actual and GLM-predicted
    overall ratings, restricted to burritos whose `dim` score is
    >= `rating_cutoff`.

    Parameters
    ----------
    df : pandas.DataFrame
        Burrito data; must contain every column in `metrics` plus 'overall'.
    dim : str
        Column name of the dimension under examination.
    rating_cutoff : float, optional
        Keep only rows where df[dim] >= rating_cutoff (default 2).
    metrics : list of str, optional
        Predictor columns for the GLM. The default now uses the actual
        'Meat:filling' column name (the previous default said 'Meatfilling',
        which is not a column in the data, so the default was unusable).

    Returns
    -------
    scipy.stats t-test result
        Paired t-test of actual vs. predicted overall ratings on the
        high-scoring subset.
    """
    # Fit a GLM on all complete rows to get predicted overall ratings.
    # .copy() prevents SettingWithCopyWarning when adding 'overallpred' below.
    dffull = df[np.hstack((metrics, 'overall'))].dropna().copy()
    X = sm.add_constant(dffull[metrics])
    y = dffull['overall']
    res = sm.GLM(y, X).fit()
    dffull['overallpred'] = res.fittedvalues

    # Plain pandas boolean indexing replaces the old pandasql query (which
    # needed the 'Meat:filling' rename workaround and the removed np.str).
    # The stray debug `print len(df2)` from the original is also dropped.
    high = dffull[dffull[dim] >= rating_cutoff]

    # Paired t-test: does the actual overall differ from the GLM prediction
    # on these high-scoring burritos?
    return sp.stats.ttest_rel(high.overall, high.overallpred)
In [11]:
vital_metrics = ['Hunger', 'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling',
                 'Uniformity', 'Salsa', 'Wrap']
for metric in vital_metrics:
    # print() with a single argument is valid in both Python 2 and 3.
    print(metric)
    print(savior(df, metric, rating_cutoff=5, metrics=vital_metrics))

# Volume is scored on a different scale, so it gets its own cutoff and is
# appended as a predictor only for its own test.
print('Volume')
vital_metrics = vital_metrics + ['Volume']
print(savior(df, 'Volume', rating_cutoff=.9, metrics=vital_metrics))