notebook.community

Edit and run



In [ ]:

    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline



In [ ]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from tqdm import tqdm



In [ ]:

    
def pretty_nutriments(nuts):
    """Print one aligned line per nutriment.

    Args:
        nuts: mapping of nutriment name to a mapping that provides the
            'quantity', 'unit' and 'variance' keys.

    Each line shows the name padded to 25 characters, the quantity with two
    decimals, the unit, and the variance with two decimals (tab-separated).
    Nothing is returned; the text is written to stdout.
    """
    row_template = '{:25}{:.2f}\t{}\t{:.2f}'
    rows = []
    for label, info in nuts.items():
        rows.append(row_template.format(label, info['quantity'], info['unit'], info['variance']))
    # Joined and printed in a single call, so an empty mapping still emits one blank line.
    print('\n'.join(rows))



In [ ]:

    
# Elasticsearch connection passed as `using=client` to every search in this notebook.
# NOTE(review): 'http://' contains no host or port; it looks like a scrubbed
# placeholder. Supply the real cluster address before running.
client = Elasticsearch(hosts='http://')



In [ ]:

    
# Materialise every document of the 'analysis' index; scan() yields them lazily.
analysis = list(Search(using=client, index='analysis').scan())



In [ ]:

    
# Index the analysis documents by their `id` field for direct lookup.
restaurant_analysis = dict((doc.id, doc) for doc in analysis)



In [ ]:

    
# Restaurants whose `city` matches 'genève', paired with their analysis document.
# Restaurants with no (or a falsy) analysis entry are left out.
geneva = [
    (hit, report)
    for hit in Search(using=client, index='restaurants').query('match', city='genève').scan()
    for report in [restaurant_analysis.get(hit.meta.id)]  # single lookup per restaurant
    if report
]
len(geneva)



In [ ]:

    
# Restaurants whose `city` matches 'lausanne', paired with their analysis document.
# Restaurants with no (or a falsy) analysis entry are left out.
lausanne = [
    (hit, report)
    for hit in Search(using=client, index='restaurants').query('match', city='lausanne').scan()
    for report in [restaurant_analysis.get(hit.meta.id)]  # single lookup per restaurant
    if report
]
len(lausanne)



In [ ]:

    
# Restaurants whose `city` matches 'paris', paired with their analysis document.
# Restaurants with no (or a falsy) analysis entry are left out.
paris = [
    (hit, report)
    for hit in Search(using=client, index='restaurants').query('match', city='paris').scan()
    for report in [restaurant_analysis.get(hit.meta.id)]  # single lookup per restaurant
    if report
]
len(paris)



In [ ]:

    
# Restaurants whose `city` matches 'lyon', paired with their analysis document.
# Restaurants with no (or a falsy) analysis entry are left out.
lyon = [
    (hit, report)
    for hit in Search(using=client, index='restaurants').query('match', city='lyon').scan()
    for report in [restaurant_analysis.get(hit.meta.id)]  # single lookup per restaurant
    if report
]
len(lyon)



In [ ]:

    
# Restaurants whose `city` matches 'bordeaux', paired with their analysis document.
# Restaurants with no (or a falsy) analysis entry are left out.
bordeaux = [
    (hit, report)
    for hit in Search(using=client, index='restaurants').query('match', city='bordeaux').scan()
    for report in [restaurant_analysis.get(hit.meta.id)]  # single lookup per restaurant
    if report
]
len(bordeaux)



In [ ]:

    
# City label -> list of (restaurant, analysis) pairs collected for that city.
cities = dict(
    geneva=geneva,
    lausanne=lausanne,
    paris=paris,
    lyon=lyon,
    bordeaux=bordeaux,
)



In [ ]:

    
units = {}       # nutriment name -> unit string, used later for plot titles
rest_rows = []   # one flat dict per restaurant, turned into a DataFrame below

# For each city, transform the data into a pandas-friendly format for further filtering.
for city, group in cities.items():
    print(city)
    # Geneva and Lausanne are labelled 'CH'; every other city is labelled 'FR'.
    country = 'CH' if city in ('geneva', 'lausanne') else 'FR'
    for _restaurant, agg in group:
        if not len(agg.analysis):
            continue  # nothing was analysed for this restaurant
        totals = agg.total.to_dict()  # converted once and reused for units and quantities
        row = {}
        for nutriment, info in totals.items():
            # Last write wins; the original author notes units are all the same.
            units[nutriment] = info['unit']
            row[nutriment] = info['quantity']
        row['city'] = city
        row['country'] = country
        rest_rows.append(row)

rests = pd.DataFrame(rest_rows)



In [ ]:

    
# Remove extreme outliers, as they come from errors in data entries (e.g. price
# of one meal > 2K): a row is dropped when any numeric column lies 3 or more
# standard deviations away from that column's mean.
# Only numeric columns are scored; 'city' and 'country' are text labels.
numeric_cols = rests.select_dtypes(include=[np.number])
z_scores = (numeric_cols - numeric_cols.mean()) / numeric_cols.std()
# A NaN z-score (missing value or zero-variance column) compares False against
# the threshold, so it never marks a row as an outlier.
is_outlier = (z_scores.abs() >= 3).any(axis=1)
rests_robust = rests[~is_outlier]



In [ ]:

    
def country_criteria(name):
    """Plot the mean of one nutriment per country and test CH against FR.

    Args:
        name: column of `rests_robust` (and key of `units`) to compare.

    Draws a seaborn bar plot grouped by 'country', then runs a two-sample
    t-test with unequal variances (Welch) between the 'CH' and 'FR' values.
    The title reports the unit, both sample sizes and the p-value.
    """
    ax = sns.barplot(x='country', y=name, data=rests_robust)
    # Drop missing values first: ttest_ind propagates NaN by default, so a
    # single restaurant lacking this nutriment would turn the p-value into NaN.
    swiss = rests_robust.loc[rests_robust.country == 'CH', name].dropna()
    french = rests_robust.loc[rests_robust.country == 'FR', name].dropna()
    test = stats.ttest_ind(swiss, french, equal_var=False)  # perform 2-sample t-test
    pval = ', pval = {:.2f}'.format(test.pvalue)
    # Counts reflect the samples actually fed to the test.
    count = ', CH/FR = {}/{}'.format(len(swiss), len(french))
    ax.set_title('{} ({}) per country{}{}'.format(name, units[name], count, pval))
    plt.show()



In [ ]:

    
def city_criteria(name):
    """Draw a bar plot of one nutriment grouped by city.

    Args:
        name: column of `rests_robust` (and key of `units`) to plot.

    The title shows the nutriment name followed by its unit.
    """
    axes = sns.barplot(data=rests_robust, x='city', y=name)
    heading = '{} ({}) per city'.format(name, units[name])
    axes.set_title(heading)
    plt.show()



In [ ]:

    
def criteria(name):
    """Show both comparison plots for one nutriment: per country, then per city.

    Args:
        name: nutriment column forwarded unchanged to both plotting helpers.
    """
    for render in (country_criteria, city_criteria):
        render(name)



In [ ]:

    
# Energy (kCal): per-country and per-city comparison.
criteria('Énergie (kCal)')



In [ ]:

    
# Magnesium: per-country and per-city comparison.
criteria('Magnésium')



In [ ]:

    
# Fat ("matières grasses"): per-country and per-city comparison.
criteria('Matières grasses')



In [ ]:

    
# Salt ("sel"): per-country and per-city comparison.
criteria('Sel')



In [ ]:

    
# Calcium: per-country and per-city comparison.
criteria('Calcium')



In [ ]:

    
# Proteins ("protéines"): per-country and per-city comparison.
criteria('Protéines')



In [ ]:

    
# Saturated fatty acids ("acides gras saturées"): per-country and per-city comparison.
criteria('Acides gras saturées')



In [ ]:

    
# Dietary fibre ("fibres alimentaires"): per-country and per-city comparison.
criteria('Fibres alimentaires')



In [ ]:

    
# Carbohydrates ("glucides"): per-country and per-city comparison.
criteria('Glucides')



In [ ]:

    
# Sugars ("sucres"): per-country and per-city comparison.
criteria('Sucres')



In [ ]:

    
# Iron ("fer"): per-country and per-city comparison.
criteria('Fer')



In [ ]:

    
# Chlorine ("chlore"): per-country and per-city comparison.
criteria('Chlore')



In [ ]:



In [ ]: