In [ ]:
import pandas as pd
import numpy as np
import json
import nltk
from elasticsearch import Elasticsearch
from elasticsearch import helpers as eshelper
In [ ]:
# Load the Marmiton recipe dump into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_json("../data/marmiton.json")
In [ ]:
# Preview the first rows to check which columns were loaded.
df.head()
In [ ]:
# Keep only the columns the parser consumes. "quantity" is the value that is
# later handed to parse() as its nbr_pers argument.
recipies = df.loc[:, ["title", "ingredients", "quantity"]]
In [ ]:
# Unit words recognised by parse(), either as the token following a leading
# number or glued to it.
mesures = np.array([
    'dl', 'l', 'ml', 'cl',
    'g', 'kg',
    'cuillère', 'cuillères', 'cuillerées',
    'pincée', 'poignée', 'pincées',
    'verre',
])

# Volume units ordered so that position i is worth 10**i millilitres.
litre_ml = np.array(["ml", "cl", "dl", "l"])

# Weight units ordered so that position i is worth 1000**i grams.
poids_g = np.array(["g", "kg"])

# Units that are followed by extra qualifier words, and (aligned by position)
# the token count parse() uses for each of them.
long = np.array(['cuillère', 'cuillères', 'cuillerées'])
long_num = np.full(long.shape, 3)

# Tokens removed from every ingredient line before it is analysed.
stop_word = np.array([
    'une', 'de',
    'petits', 'petites', 'petite', 'petit',
    'grand', 'gros', 'grosse',
    'beaux', 'beau',
])

# Connectors of an alternative or a range between two numbers.
mult = np.array(['ou', 'à'])

# Packaging words. Not read by any of the visible cells (parse() uses a local
# variable of the same name).
quant = np.array(['paquet', 'sachet'])

# Household measures -> (factor, base unit) used to convert them to ml or g.
mapping = {
    **dict.fromkeys(
        ('cuillères', 'cuillerées', 'cuillère', 'cuillerée'), (10, 'ml')),
    **dict.fromkeys(('pincées', 'pincée'), (3, 'g')),
    **dict.fromkeys(('poignées', 'poignée'), (50, 'g')),
    **dict.fromkeys(('verres', 'verre'), (300, 'ml')),
}
In [ ]:
def number(s):
    """Parse a quantity token into a float.

    Accepts plain numbers with either a decimal point or a decimal comma
    ("2", "1.5", "1,5") and simple fractions that start with a digit ("1/2").

    Args:
        s: the token to parse.

    Returns:
        The parsed value as a float, or ``False`` when the token is not a
        number. Callers test the result for truthiness, so a zero quantity is
        treated like "not a number".
    """
    s = s.replace(',', '.')
    try:
        return float(s)
    except ValueError:
        pass

    # Only "<digit…>/<…>" is considered a fraction candidate.
    if '/' not in s or not s[0].isdigit():
        return False

    parts = s.split('/')
    if len(parts) != 2:
        # Tokens such as "1/2/3" are not fractions; reject them instead of
        # failing on the two-name unpacking.
        return False

    numerator = number(parts[0])
    denominator = number(parts[1])
    # A falsy side means it was not numeric or was zero; rejecting a zero
    # denominator also avoids a ZeroDivisionError.
    if not numerator or not denominator:
        return False
    return numerator / denominator
In [ ]:
def _servings(nbr_pers):
    """Return nbr_pers as a finite, non-zero float, or None when unusable."""
    try:
        servings = float(nbr_pers)
    except (TypeError, ValueError):
        return None
    if servings == 0 or not np.isfinite(servings):
        return None
    return servings


def _to_base_unit(quant, mesure):
    """Convert a quantity to millilitres or grams when its unit is known."""
    if mesure in mapping:
        # Household measure (spoon, pinch, ...) -> factor and base unit.
        factor, mesure = mapping[mesure]
        quant *= factor
    if mesure in litre_ml:
        # litre_ml is ordered ml, cl, dl, l: each position is a factor of 10.
        quant *= 10 ** int(np.where(litre_ml == mesure)[0][0])
        mesure = "ml"
    if mesure in poids_g:
        # poids_g is ordered g, kg: each position is a factor of 1000.
        quant *= 1000 ** int(np.where(poids_g == mesure)[0][0])
        mesure = "g"
    return quant, mesure


def parse(title, nbr_pers, ingredients):
    """Turn the raw ingredient lines of one recipe into structured entries.

    Each line is tokenised, stripped of the words in ``stop_word`` and expected
    to start with a quantity, either as its own token ("2 cuillères ...") or
    glued to a unit ("200g ..."). Known units are converted to ml or g and the
    quantity is divided by the serving count when that count is usable.
    Lines without a recognised leading quantity are skipped.

    Args:
        title: recipe name, stored under ``'name'``.
        nbr_pers: serving count; anything that is not a finite, non-zero
            number leaves the quantities unscaled.
        ingredients: iterable of ingredient strings.

    Returns:
        ``{'name': title, 'ingredients': [...]}`` where every ingredient is a
        dict with the keys ``'quantity'``, ``'unit'`` (``None`` when no unit
        was recognised) and ``'content'`` (the remaining words).
    """
    res = {'name': title, 'ingredients': []}
    servings = _servings(nbr_pers)

    for ingredient in ingredients:
        tokens = nltk.word_tokenize(ingredient)
        if not tokens:
            # An empty list would become a float array, which cannot be
            # compared against the string stop words.
            continue
        arr = np.array(tokens)
        arr = arr[np.isin(arr, stop_word, invert=True)]
        if len(arr) == 0:
            continue

        quant = None
        mesure = None
        num = number(arr[0])

        if len(arr) > 1 and num:
            quant = num
            if arr[1] in mult:
                # "2 ou 3 ...": keep the first number and continue from the
                # second one so the unit lookup below sees the right token.
                arr = arr[2:]
            skip = 0
            # The length check is needed because the slice above may have
            # left fewer than two tokens.
            if len(arr) > 1 and arr[1].lower() in mesures:
                mesure = str(arr[1].lower())
                skip = 1
                if mesure in long:
                    # Units with trailing qualifier words: take the token
                    # count from long_num as a plain int so it can be used as
                    # a slice bound.
                    skip = int(long_num[long == mesure][0])
                # NOTE(review): mapping also lists forms that are absent from
                # mesures ('cuillerée', 'poignées', 'verres'); those never
                # reach this branch. Confirm whether they should be units.
            arr = arr[1 + skip:]
        elif arr[0][0].isdigit():
            # Quantity glued to its unit, e.g. "200g".
            glued = arr[0]
            for mes in mesures:
                if mes in glued:
                    value = number(glued.replace(mes, ''))
                    if value:
                        quant = value
                        mesure = str(mes)
            # Drop the quantity token so it does not end up in the content.
            arr = arr[1:]

        if '(' in arr:
            # Cut the content at the first opening parenthesis.
            arr = arr[:int(np.where(arr == '(')[0][0])]

        if quant is not None:
            if mesure is not None:
                quant, mesure = _to_base_unit(quant, mesure)
            if servings is not None:
                quant /= servings
            res['ingredients'].append({
                'quantity': quant,
                'unit': mesure,
                'content': " ".join(arr)
            })
    return res
In [ ]:
# Parse every recipe row. The columns arrive as (title, ingredients, quantity)
# while parse() takes the serving count as its second argument.
res = [
    parse(recipe_title, servings, ingredient_lines)
    for recipe_title, ingredient_lines, servings in recipies.values
]
In [ ]:
# Wrap each parsed recipe in a bulk action for the 'recipes' index.
# NOTE(review): mapping types ('_type') were removed in Elasticsearch 8 —
# confirm the target cluster version still accepts them.
entries = [
    {'_index': 'recipes', '_type': 'marmiton', '_source': parsed}
    for parsed in res
]
In [ ]:
# Connect to Elasticsearch and send every prepared action in bulk.
# NOTE(review): 'TODO' is a placeholder host — replace it with the real
# cluster address before running this cell.
client = Elasticsearch(hosts='TODO')
eshelper.bulk(client, entries)