In [ ]:
import pandas as pd
import numpy as np
import json
from elasticsearch import Elasticsearch
from elasticsearch import helpers as eshelper
In [ ]:
# Load the scraped restaurant dumps. Each JSON file maps URL -> record,
# so transpose to get one row per restaurant URL.
_dump_paths = [
    "../data/parsed_ch.json",
    "../data/parsed_com0.json",
    "../data/parsed_com1.json",
    "../data/parsed_com2.json",
    "../data/parsed_com3.json",
    "../data/parsed_com4.json",
]
df_ch, df_com0, df_com1, df_com2, df_com3, df_com4 = (
    pd.read_json(path).transpose() for path in _dump_paths
)
In [ ]:
# Stack all scraped sources into one DataFrame (rows keyed by URL).
frames = [df_ch, df_com0, df_com1, df_com2, df_com3, df_com4]
df = pd.concat(frames)
df.head()
In [ ]:
# (rows, columns) of the combined DataFrame.
df.shape
Index URL structure after splitting on '/': ['https:', '', 'www.lafourchette.com', 'restaurant', 'joe-s', '218659']
In [ ]:
# Split each index URL into its components, e.g.
# 'https://www.lafourchette.com/restaurant/joe-s/218659' ->
# ['https:', '', 'www.lafourchette.com', 'restaurant', 'joe-s', '218659'].
# Component 2 is the website host, the second-to-last is the restaurant
# name slug, and the last is the numeric restaurant id.
# Comprehensions replace the manual append loop (same order, same values).
url_parts = [link.split("/") for link in df.index.values]
websites = [parts[2] for parts in url_parts]
restaurant_names = [parts[-2] for parts in url_parts]
restaurant_ids = [parts[-1] for parts in url_parts]
In [ ]:
# True when every restaurant id is unique (safe to use as the index).
len(set(restaurant_ids)) == len(restaurant_ids)
In [ ]:
# Re-key the DataFrame by restaurant id (parsed from the URL) instead of
# the full URL. Passing the ids inside a list makes set_index use the
# list itself as the new index rather than look up a column name.
df = df.set_index([restaurant_ids])
In [ ]:
# Distinct website hosts the data was scraped from.
np.unique(websites)
In [ ]:
# Preview after re-indexing by restaurant id.
df.head()
In [ ]:
# (rows, columns) — re-indexing changes neither count.
df.shape
Address field structure: {'country': 'Suisse', 'city': 'Renens', 'ZIP': '1020', 'street': 'Place du marché 1'}
In [ ]:
# Rows whose address field is missing.
missing_addr_mask = df["address"].isna()
no_addr = df[missing_addr_mask]
no_addr.shape
Remove missing address entries
In [ ]:
# Discard the rows that have no address; errors='ignore' tolerates
# index labels that are already gone.
df = df.drop(index=no_addr.index, errors='ignore')
df.shape
In [ ]:
# Expand the address dicts into their own DataFrame, aligned with df.
address_records = df["address"].values
addrs = pd.DataFrame.from_records(address_records, index=df.index)
addrs.head()
In [ ]:
# Sample of raw ZIP values (note the 'CH-' prefixes and ' CEDEX' suffixes).
addrs.ZIP.unique()[:100]
Remove the Swiss 'CH-' prefix and the ' CEDEX' suffix (CEDEX is special business mail) from ZIP codes.
In [ ]:
def _clean_zip(zip_code):
    """Strip the Swiss 'CH-' prefix and the ' CEDEX' business-mail suffix."""
    if zip_code.startswith('CH-'):
        zip_code = zip_code[3:]
    if zip_code.endswith(' CEDEX'):
        zip_code = zip_code[:-6]
    return zip_code

# One pass over the Series instead of two separate apply() scans;
# the result is identical (prefix stripped first, then suffix).
addrs.ZIP = addrs.ZIP.apply(_clean_zip)
In [ ]:
# ZIP values after stripping the prefix/suffix.
np.unique(addrs.ZIP)[:100]
In [ ]:
# np.int was removed in NumPy 1.24 and astype's raise_on_error kwarg was
# removed from pandas; the builtin int plus errors='ignore' is the modern
# equivalent (leave values unchanged when conversion fails).
addrs.ZIP = addrs.ZIP.astype(int, errors='ignore')
In [ ]:
# Replace the raw 'address' column with its expanded columns.
without_addr = df.drop('address', axis=1)
df = pd.concat([without_addr, addrs], axis=1)
df.shape
In [ ]:
# Count restaurants per country. value_counts makes a single O(n) pass
# instead of re-scanning the whole column once per unique value.
# Note: output is now ordered by descending count rather than by first
# appearance; the printed pairs themselves are unchanged.
country_counts = addrs.country.value_counts(dropna=False)
for country, count in country_counts.items():
    print("{}: {}".format(country, count))
Keep only genuine country names, as the other values are not significant.
In [ ]:
# Keep only rows whose country is a real country name (drops junk values).
# The original list contained 'Belgique' twice; the duplicate is removed
# (isin is set membership, so behavior is identical).
df = df[addrs.country.isin(['Suisse', 'France', 'Belgique', 'Monaco', 'Espagne', 'Italie'])]
In [ ]:
# Row count after the country filter.
df.shape
GPS field structure: {'lng': 6.1227483, 'lat': 46.2393855}
In [ ]:
# Rows with no GPS coordinates.
missing_gps_mask = df["gps"].isna()
no_gps = df[missing_gps_mask]
no_gps.shape
In [ ]:
# Expand the gps dicts into a lng/lat DataFrame aligned with df.
# NOTE(review): unlike the address case, rows with a null gps are counted
# above but never dropped — if no_gps is non-empty, from_records may fail
# on the null records; confirm the data guarantees gps is always present.
gps = pd.DataFrame.from_records(df['gps'].values, index=df.index)
gps.head()
In [ ]:
# np.float was removed in NumPy 1.24; the builtin float is the exact
# equivalent dtype argument (it always aliased Python's float).
gps.lat = gps.lat.astype(float)
gps.lng = gps.lng.astype(float)
In [ ]:
# Swap the raw 'gps' column for the expanded lat/lng columns.
without_gps = df.drop('gps', axis=1)
df = pd.concat([without_gps, gps], axis=1)
df.head()
In [ ]:
# Build one Elasticsearch bulk-indexing action per restaurant that has a
# menu. The three copy-pasted menu-section branches are replaced by a
# data-driven loop over (menu key, _source field) pairs; dictionary keys
# and output structure are unchanged.
MENU_SECTIONS = [('Entrée', 'starters'), ('Plat', 'mains'), ('Dessert', 'desserts')]

def _price_list(section):
    # {'dish': price, ...} -> [{'name': 'dish', 'price': price}, ...]
    return [{'name': e, 'price': p} for e, p in section.items()]

entries = []
for i, resto in df.iterrows():
    if resto.menu:
        fmt = {
            '_index': 'restaurants',
            '_type': 'raw',
            '_source': {
                'name': resto['name'],
                'street': resto.street,
                'zip': resto.ZIP,
                'city': resto.city,
                'country': resto.country,
                'lat': resto.lat,
                'lng': resto.lng
            }
        }
        for menu_key, field in MENU_SECTIONS:
            section = resto.menu.get(menu_key)
            if section:
                fmt['_source'][field] = _price_list(section)
        if resto.drinks:
            fmt['_source']['drinks'] = _price_list(resto.drinks)
        entries.append(fmt)
len(entries)
In [ ]:
# NOTE(review): 'http://' is an incomplete placeholder — fill in the real
# Elasticsearch endpoint (e.g. 'http://host:9200') before bulk-indexing.
client = Elasticsearch(hosts='http://')
In [ ]:
In [ ]: