In [ ]:
%matplotlib inline

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from tqdm import tqdm

In [ ]:
def pretty_nutriments(nuts):
    """Print one aligned row per nutriment: name, quantity, unit and variance.

    ``nuts`` maps a nutriment name to a mapping with the keys ``'quantity'``,
    ``'unit'`` and ``'variance'``. The name is padded to 25 characters and the
    two numbers are shown with two decimals.
    """
    row_template = '{:25}{:.2f}\t{}\t{:.2f}'
    rows = []
    for label, info in nuts.items():
        rows.append(row_template.format(label, info['quantity'], info['unit'], info['variance']))
    # A single print call: an empty mapping still yields exactly one blank line.
    print('\n'.join(rows))

In [ ]:
# Elasticsearch connection passed to every Search(...) query in this notebook.
# NOTE(review): the host URL 'http://' contains no hostname -- it looks like a
# redacted placeholder; fill in the real endpoint before running.
client = Elasticsearch(hosts='http://')

In [ ]:
# Load every hit yielded by scanning the 'analysis' index into a list.
analysis = list(Search(using=client, index='analysis').scan())

In [ ]:
# Lookup table from each analysis hit's `id` attribute to the hit itself
# (a later hit with the same id overwrites an earlier one).
restaurant_analysis = dict((entry.id, entry) for entry in analysis)

In [ ]:
# Restaurants whose city matches 'genève', each paired with its analysis entry.
# Restaurants without a (truthy) entry in `restaurant_analysis` are left out.
geneva_hits = Search(using=client, index='restaurants').query('match', city='genève').scan()
geneva_pairs = ((hit, restaurant_analysis.get(hit.meta.id)) for hit in geneva_hits)
geneva = [(hit, found) for hit, found in geneva_pairs if found]
len(geneva)

In [ ]:
# Restaurants whose city matches 'lausanne', each paired with its analysis entry.
# Restaurants without a (truthy) entry in `restaurant_analysis` are left out.
lausanne_hits = Search(using=client, index='restaurants').query('match', city='lausanne').scan()
lausanne_pairs = ((hit, restaurant_analysis.get(hit.meta.id)) for hit in lausanne_hits)
lausanne = [(hit, found) for hit, found in lausanne_pairs if found]
len(lausanne)

In [ ]:
# Restaurants whose city matches 'paris', each paired with its analysis entry.
# Restaurants without a (truthy) entry in `restaurant_analysis` are left out.
paris_hits = Search(using=client, index='restaurants').query('match', city='paris').scan()
paris_pairs = ((hit, restaurant_analysis.get(hit.meta.id)) for hit in paris_hits)
paris = [(hit, found) for hit, found in paris_pairs if found]
len(paris)

In [ ]:
# Restaurants whose city matches 'lyon', each paired with its analysis entry.
# Restaurants without a (truthy) entry in `restaurant_analysis` are left out.
lyon_hits = Search(using=client, index='restaurants').query('match', city='lyon').scan()
lyon_pairs = ((hit, restaurant_analysis.get(hit.meta.id)) for hit in lyon_hits)
lyon = [(hit, found) for hit, found in lyon_pairs if found]
len(lyon)

In [ ]:
# Restaurants whose city matches 'bordeaux', each paired with its analysis entry.
# Restaurants without a (truthy) entry in `restaurant_analysis` are left out.
bordeaux_hits = Search(using=client, index='restaurants').query('match', city='bordeaux').scan()
bordeaux_pairs = ((hit, restaurant_analysis.get(hit.meta.id)) for hit in bordeaux_hits)
bordeaux = [(hit, found) for hit, found in bordeaux_pairs if found]
len(bordeaux)

In [ ]:
# City label -> list of (restaurant, analysis) pairs collected above.
cities = dict(
    geneva=geneva,
    lausanne=lausanne,
    paris=paris,
    lyon=lyon,
    bordeaux=bordeaux,
)

In [ ]:
# For each city, flatten the (restaurant, analysis) pairs into one record per
# restaurant so the data can be filtered and plotted with pandas.
SWISS_CITIES = ('geneva', 'lausanne')  # every other city in `cities` is labelled 'FR'

units = {}    # nutriment name -> unit string; assumed identical across restaurants
records = []  # one dict per restaurant: nutriment quantities plus 'city' and 'country'

for city, group in cities.items():
    print(city)  # progress marker
    for _restaurant, agg in group:
        # Restaurants whose analysis list is empty contribute nothing.
        if not len(agg.analysis):
            continue

        totals = agg.total.to_dict()  # converted once, used for both units and quantities
        record = {}
        for nutriment, entry in totals.items():
            units[nutriment] = entry['unit']
            record[nutriment] = entry['quantity']

        record['city'] = city
        record['country'] = 'CH' if city in SWISS_CITIES else 'FR'
        records.append(record)

# Nutriments missing for a restaurant become NaN in the resulting frame.
rests = pd.DataFrame(records)

In [ ]:
def drop_outliers(frame, z_max=3.0):
    """Return the rows of ``frame`` that contain no extreme numeric value.

    A value is extreme when it lies ``z_max`` or more standard deviations away
    from the mean of its column. Only numeric columns are scored; label columns
    such as 'city' and 'country' are ignored.

    Comparisons against NaN are False, so a missing value, or a constant
    column whose standard deviation is zero, never flags a row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to filter.
    z_max : float, default 3.0
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` without outliers, original index preserved.
    """
    numeric = frame.select_dtypes(include=[np.number])
    z_scores = (numeric - numeric.mean()).abs() / numeric.std()
    is_outlier = (z_scores >= z_max).any(axis=1)
    return frame[~is_outlier]


# remove extreme outliers as they come from errors in data entries (e.g. price of one meal > 2K)
rests_robust = drop_outliers(rests)

In [ ]:
def country_criteria(name):
    """Compare Swiss ('CH') and French ('FR') restaurants on the column ``name``.

    Draws a bar plot of ``name`` grouped by country from ``rests_robust`` and
    runs a two-sample t-test with ``equal_var=False`` (Welch's test) between the
    two countries. The plot title shows the column name, its unit from ``units``,
    the two sample sizes and the p-value.

    Missing values are dropped before testing: ``ttest_ind`` propagates NaN by
    default, so a single missing value would turn the p-value into NaN. The
    sample sizes in the title therefore count only the values actually tested.
    """
    # create bar plot for countries
    sns.barplot(x='country', y=name, data=rests_robust)

    values = rests_robust[name]
    swiss = values[rests_robust.country == 'CH'].dropna()
    french = values[rests_robust.country == 'FR'].dropna()
    test = stats.ttest_ind(swiss, french, equal_var=False)  # perform 2-sample t-test

    pval = ', pval = {:.2f}'.format(test.pvalue)
    count = ', CH/FR = {}/{}'.format(len(swiss), len(french))
    plt.title('{} ({}) per country{}{}'.format(name, units[name], count, pval))
    plt.show()

In [ ]:
def city_criteria(name):
    """Draw a bar plot of the column ``name`` of ``rests_robust`` grouped by city.

    The title shows the column name followed by its unit looked up in ``units``.
    """
    axes = sns.barplot(x='city', y=name, data=rests_robust)
    unit = units[name]
    heading = '{} ({}) per city'.format(name, unit)
    axes.set_title(heading)
    plt.show()

In [ ]:
def criteria(name):
    """Show both comparisons for the column ``name``: per country, then per city."""
    for draw in (country_criteria, city_criteria):
        draw(name)

In [ ]:
# Energy (kcal): per-country and per-city comparison.
criteria('Énergie (kCal)')

In [ ]:
# Magnesium: per-country and per-city comparison.
criteria('Magnésium')

In [ ]:
# Fat ('Matières grasses'): per-country and per-city comparison.
criteria('Matières grasses')

In [ ]:
# Salt ('Sel'): per-country and per-city comparison.
criteria('Sel')

In [ ]:
# Calcium: per-country and per-city comparison.
criteria('Calcium')

In [ ]:
# Protein ('Protéines'): per-country and per-city comparison.
criteria('Protéines')

In [ ]:
# Saturated fatty acids ('Acides gras saturées'): per-country and per-city comparison.
criteria('Acides gras saturées')

In [ ]:
# Dietary fibre ('Fibres alimentaires'): per-country and per-city comparison.
criteria('Fibres alimentaires')

In [ ]:
# Carbohydrates ('Glucides'): per-country and per-city comparison.
criteria('Glucides')

In [ ]:
# Sugars ('Sucres'): per-country and per-city comparison.
criteria('Sucres')

In [ ]:
# Iron ('Fer'): per-country and per-city comparison.
criteria('Fer')

In [ ]:
# Chlorine ('Chlore'): per-country and per-city comparison.
criteria('Chlore')

In [ ]:


In [ ]: