In [ ]:
%matplotlib inline
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from tqdm import tqdm
In [ ]:
# pretty print nutriments
def pretty_nutriments(nuts):
    """Print one line per nutriment: name, quantity, unit and variance.

    Args:
        nuts: mapping of nutriment name to a mapping that has 'quantity',
            'unit' and 'variance' entries.
    """
    row_format = '{:25}{:.2f}\t{}\t{:.2f}'
    rows = []
    for label, info in nuts.items():
        rows.append(row_format.format(label, info['quantity'], info['unit'], info['variance']))
    print('\n'.join(rows))
In [ ]:
# Elasticsearch connection shared by every Search(...) query in this notebook.
# NOTE(review): the host is just 'http://' with no hostname - it looks
# truncated or redacted; set the real address before running.
client = Elasticsearch(hosts='http://')
In [ ]:
# Load every document of the 'analysis' index into memory.
analysis = list(Search(using=client, index='analysis').scan())
In [ ]:
# Index the analysis documents by their `id` field for direct lookup.
restaurant_analysis = {doc.id: doc for doc in analysis}
In [ ]:
# (restaurant, analysis) pairs for restaurants matching city 'genève';
# restaurants with no analysis entry are skipped.
geneva = [
    (restaurant, restaurant_analysis.get(restaurant.meta.id))
    for restaurant in (
        Search(using=client, index='restaurants')
        .query('match', city='genève')
        .scan()
    )
    if restaurant_analysis.get(restaurant.meta.id)
]
len(geneva)
In [ ]:
# (restaurant, analysis) pairs for restaurants matching city 'lausanne';
# restaurants with no analysis entry are skipped.
lausanne = [
    (restaurant, restaurant_analysis.get(restaurant.meta.id))
    for restaurant in (
        Search(using=client, index='restaurants')
        .query('match', city='lausanne')
        .scan()
    )
    if restaurant_analysis.get(restaurant.meta.id)
]
len(lausanne)
In [ ]:
# (restaurant, analysis) pairs for restaurants matching city 'paris';
# restaurants with no analysis entry are skipped.
paris = [
    (restaurant, restaurant_analysis.get(restaurant.meta.id))
    for restaurant in (
        Search(using=client, index='restaurants')
        .query('match', city='paris')
        .scan()
    )
    if restaurant_analysis.get(restaurant.meta.id)
]
len(paris)
In [ ]:
# (restaurant, analysis) pairs for restaurants matching city 'lyon';
# restaurants with no analysis entry are skipped.
lyon = [
    (restaurant, restaurant_analysis.get(restaurant.meta.id))
    for restaurant in (
        Search(using=client, index='restaurants')
        .query('match', city='lyon')
        .scan()
    )
    if restaurant_analysis.get(restaurant.meta.id)
]
len(lyon)
In [ ]:
# (restaurant, analysis) pairs for restaurants matching city 'bordeaux';
# restaurants with no analysis entry are skipped.
bordeaux = [
    (restaurant, restaurant_analysis.get(restaurant.meta.id))
    for restaurant in (
        Search(using=client, index='restaurants')
        .query('match', city='bordeaux')
        .scan()
    )
    if restaurant_analysis.get(restaurant.meta.id)
]
len(bordeaux)
In [ ]:
# City label -> list of (restaurant, analysis) pairs collected for that city.
cities = dict(
    geneva=geneva,
    lausanne=lausanne,
    paris=paris,
    lyon=lyon,
    bordeaux=bordeaux,
)
In [ ]:
# For each city, flatten every analysed restaurant into one row of nutriment
# quantities (plus its city and country) so the data can be filtered and
# plotted with pandas.
units = {}  # nutriment name -> unit string taken from the analysis totals
rests = []  # one dict per restaurant; converted to a DataFrame below
for city, group in cities.items():
    print(city)  # progress: one line per city
    for _restaurant, agg in group:
        # Restaurants whose `analysis` is empty contribute no row.
        if not len(agg.analysis):
            continue
        totals = agg.total.to_dict()  # convert once, reuse for units and quantities
        row = {}
        for nutriment, total in totals.items():
            units[nutriment] = total['unit']  # units are all the same
            row[nutriment] = total['quantity']
        row['city'] = city
        # Geneva and Lausanne are the Swiss cities; every other city is French.
        row['country'] = 'CH' if city in ('geneva', 'lausanne') else 'FR'
        rests.append(row)
rests = pd.DataFrame(rests)
In [ ]:
# remove extreme outliers as they come from errors in data entries (e.g. price of one meal > 2K)
# A row is dropped when any numeric column lies 3 or more standard deviations
# from that column's mean. The test is restricted to numeric columns because
# the text columns (city, country) have no mean/std. Missing values and
# constant columns give a NaN z-score, and `NaN >= 3` is False, so they are
# never counted as outliers.
numeric = rests.select_dtypes(include=[np.number])
z_scores = (numeric - numeric.mean()) / numeric.std()
rests_robust = rests[~(z_scores.abs() >= 3).any(axis=1)]
In [ ]:
def country_criteria(name):
    """Plot nutriment `name` per country and compare CH with FR.

    Draws a seaborn bar plot of column `name` of the global `rests_robust`
    grouped by country. The title shows the unit (looked up in the global
    `units`), the CH/FR sample sizes and the p-value of a two-sample t-test
    that does not assume equal variances.

    Args:
        name: column of `rests_robust`, and key of `units`, to compare.
    """
    # create bar plot for countries
    sns.barplot(x='country', y=name, data=rests_robust)
    # Missing values are dropped before testing: ttest_ind propagates NaN by
    # default, so a single restaurant without this nutriment would turn the
    # p-value into NaN, and len() would count it in the sample size.
    a = rests_robust.loc[rests_robust.country == 'CH', name].dropna()
    b = rests_robust.loc[rests_robust.country == 'FR', name].dropna()
    test = stats.ttest_ind(a, b, equal_var=False)  # perform 2-sample t-test
    pval = ', pval = {:.2f}'.format(test.pvalue)
    count = ', CH/FR = {}/{}'.format(len(a), len(b))
    plt.title('{} ({}) per country{}{}'.format(name, units[name], count, pval))
    plt.show()
In [ ]:
def city_criteria(name):
    """Show a bar plot of nutriment `name` per city.

    Reads the global `rests_robust` for the data and the global `units` for
    the unit shown in the title.

    Args:
        name: column of `rests_robust`, and key of `units`, to plot.
    """
    # create bar plot for cities
    axes = sns.barplot(data=rests_robust, x='city', y=name)
    title = '{} ({}) per city'.format(name, units[name])
    axes.set_title(title)
    plt.show()
In [ ]:
def criteria(name):
    """Plot nutriment `name` first per country, then per city."""
    for plot in (country_criteria, city_criteria):
        plot(name)
In [ ]:
# One cell per nutriment: each call draws the per-country and per-city plots.
criteria('Énergie (kCal)')
In [ ]:
criteria('Magnésium')
In [ ]:
criteria('Matières grasses')
In [ ]:
criteria('Sel')
In [ ]:
criteria('Calcium')
In [ ]:
criteria('Protéines')
In [ ]:
criteria('Acides gras saturées')
In [ ]:
criteria('Fibres alimentaires')
In [ ]:
criteria('Glucides')
In [ ]:
criteria('Sucres')
In [ ]:
criteria('Fer')
In [ ]:
criteria('Chlore')
In [ ]:
In [ ]: