In [ ]:
import pandas as pd
import numpy as np
import json
from elasticsearch import Elasticsearch
from elasticsearch import helpers as eshelper

In [ ]:
# Load every scraped batch; each parsed JSON file is keyed by URL, so
# transpose to get one restaurant per row.
df_ch, df_com0, df_com1, df_com2, df_com3, df_com4 = (
    pd.read_json("../data/parsed_{}.json".format(batch)).transpose()
    for batch in ("ch", "com0", "com1", "com2", "com3", "com4")
)

In [ ]:
# Stack all batches into one frame (indexes are distinct URLs, no overlap).
batch_frames = [df_ch, df_com0, df_com1, df_com2, df_com3, df_com4]
df = pd.concat(batch_frames)
df.head()

In [ ]:
df.shape

Clean-up

ID clean-up

URL structure after splitting on "/" : ['https:', '', 'www.lafourchette.com', 'restaurant', 'joe-s', '218659']


In [ ]:
# Each index entry is a scraped URL such as
# https://www.lafourchette.com/restaurant/<slug>/<id>; splitting on "/"
# yields the site domain at [2], the restaurant slug at [-2] and the
# numeric id at [-1].
split_links = [link.split("/") for link in df.index.values]
websites = [parts[2] for parts in split_links]
restaurant_names = [parts[-2] for parts in split_links]
restaurant_ids = [parts[-1] for parts in split_links]

In [ ]:
len(np.unique(restaurant_ids)) == len(restaurant_ids)

In [ ]:
# Re-index by the numeric restaurant id extracted from each URL.
df = df.set_index([restaurant_ids])

In [ ]:
# Sanity check on scraped domains — expect a single lafourchette host.
np.unique(websites)

In [ ]:
# Preview after re-indexing.
df.head()

In [ ]:
# Row count should be unchanged by re-indexing.
df.shape

Address clean-up

structure : {'country': 'Suisse', 'city': 'Renens', 'ZIP': '1020', 'street': 'Place du marché 1'}


In [ ]:
# Restaurants whose scraped page had no address block.
no_addr = df.loc[df["address"].isna()]
no_addr.shape

Remove missing address entries


In [ ]:
# Drop restaurants without an address. Reassignment instead of
# inplace=True keeps the cell idempotent and chain-friendly (inplace has
# no performance benefit and causes hidden-state bugs on re-run);
# errors='ignore' lets the cell re-run after the rows are already gone.
df = df.drop(no_addr.index, errors='ignore')
df.shape

In [ ]:
# Expand the address dicts ({'country', 'city', 'ZIP', 'street'}) into
# their own columns, aligned on the restaurant-id index.
address_records = df["address"].tolist()
addrs = pd.DataFrame.from_records(address_records, index=df.index)
addrs.head()

In [ ]:
addrs.ZIP.unique()[:100]

Remove the Swiss 'CH-' prefix and the 'CEDEX' (special French business mail) suffix


In [ ]:
def _strip_zip_affixes(z):
    """Strip the Swiss 'CH-' prefix and French ' CEDEX' suffix from a ZIP.

    Non-string values (e.g. NaN for a missing ZIP) are returned unchanged —
    the original lambdas would raise AttributeError on them.
    """
    if not isinstance(z, str):
        return z
    if z.startswith('CH-'):
        z = z[3:]
    if z.endswith(' CEDEX'):
        z = z[:-6]
    return z

addrs.ZIP = addrs.ZIP.apply(_strip_zip_affixes)

In [ ]:
np.unique(addrs.ZIP)[:100]

In [ ]:
addrs.ZIP = addrs.ZIP.astype(np.int, raise_on_error=False)

In [ ]:
# Replace the raw address-dict column with the expanded address columns.
without_address = df.drop(columns='address')
df = pd.concat([without_address, addrs], axis=1)
df.shape

In [ ]:
# How many restaurants per distinct value in the country field — reveals
# junk values alongside real country names.
for country in addrs.country.unique():
    n_rows = (addrs.country == country).sum()
    print("{}: {}".format(country, n_rows))

Keep only rows with a real country name, as the other values are not significant.


In [ ]:
df = df[addrs.country.isin(['Suisse', 'France', 'Belgique', 'Monaco', 'Belgique', 'Espagne', 'Italie'])]

In [ ]:
df.shape

GPS clean-up

structure : {'lng': 6.1227483, 'lat': 46.2393855}


In [ ]:
# Restaurants lacking GPS coordinates.
# NOTE(review): unlike the address section, these rows are never dropped
# from df — if no_gps is non-empty the gps expansion below receives null
# records; confirm whether a df.drop(no_gps.index) was intended here.
no_gps = df[df["gps"].isnull()]
no_gps.shape

In [ ]:
# Expand the gps dicts ({'lng', 'lat'}) into columns aligned on the
# restaurant-id index.
gps_records = df['gps'].tolist()
gps = pd.DataFrame.from_records(gps_records, index=df.index)
gps.head()

In [ ]:
# Ensure coordinates are float64. The original used np.float, which was
# removed in NumPy 1.24; the builtin float is the documented equivalent.
gps.lat = gps.lat.astype(float)
gps.lng = gps.lng.astype(float)

In [ ]:
# Replace the raw gps-dict column with the numeric lat/lng columns.
without_gps = df.drop(columns='gps')
df = pd.concat([without_gps, gps], axis=1)
df.head()

Insertion


In [ ]:
# Turn each restaurant row into an Elasticsearch bulk-index action.
# Maps the scraped French course names onto English field names, replacing
# three copy-pasted blocks from the original with a single loop.
COURSE_FIELDS = [('Entrée', 'starters'), ('Plat', 'mains'), ('Dessert', 'desserts')]

entries = []
for restaurant_id, resto in df.iterrows():
    # Rows without a menu hold NaN, which is a *truthy* float: the original
    # `if resto.menu:` passed it through and crashed on .get(). Guard on
    # the actual dict type instead.
    if not isinstance(resto.menu, dict) or not resto.menu:
        continue

    fmt = {
        '_index': 'restaurants',
        '_type': 'raw',  # NOTE(review): mapping types are deprecated in Elasticsearch 7+
        '_source': {
            'name': resto['name'],
            'street': resto.street,
            'zip': resto.ZIP,
            'city': resto.city,
            'country': resto.country,
            'lat': resto.lat,
            'lng': resto.lng
        }
    }

    for course, field in COURSE_FIELDS:
        dishes = resto.menu.get(course)  # fetch once instead of twice
        if dishes:
            fmt['_source'][field] = [{'name': e, 'price': p} for e, p in dishes.items()]

    # drinks is also NaN when missing — same guard as the menu.
    if isinstance(resto.drinks, dict) and resto.drinks:
        fmt['_source']['drinks'] = [{'name': e, 'price': p} for e, p in resto.drinks.items()]

    entries.append(fmt)

len(entries)

In [ ]:
client = Elasticsearch(hosts='http://')

In [ ]:


In [ ]: