In [ ]:
import pandas as pd
import numpy as np
import json
import nltk
from elasticsearch import Elasticsearch
from elasticsearch import helpers as eshelper

In [ ]:
# Load the recipe dump into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_json("../data/marmiton.json")

In [ ]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [ ]:
# Keep only the columns the parser consumes. The column order matters: the cell that
# builds `res` unpacks each row positionally as (title, ingredients, quantity).
recipies = df[["title", "ingredients", "quantity"]]

In [ ]:
# Unit words recognised by `parse` right after a quantity (compared lower-cased).
# Every key of `mapping` must also appear here, otherwise `parse` converts the
# quantity but leaves the unit word inside the ingredient text.
mesures = np.array([
    'dl', 'l', 'ml', 'cl', 'g', 'kg',
    'cuillère', 'cuillères', 'cuillerées', 'pincée', 'poignée', 'pincées', 'verre',
    # forms that `mapping` converts and that were missing from this vocabulary
    'cuillerée', 'poignées', 'verres',
])

# Volume units ordered so that the unit at index i is worth 10**i millilitres.
litre_ml = np.array(["ml", "cl", "dl", "l"])

# Weight units ordered so that the unit at index i is worth 1000**i grams.
poids_g = np.array(["g", "kg"])

# Units that come with a qualifier, and (aligned by position in `long_num`) how many
# tokens `parse` drops for them: the unit word plus the two tokens that follow it
# (presumably "à soupe" / "à café" -- confirm against the data).
long = np.array(['cuillère', 'cuillères', 'cuillerées', 'cuillerée'])
long_num = np.array([3, 3, 3, 3])

# Tokens removed from every ingredient line before any parsing happens.
stop_word = np.array(['une', 'de', 'petits', 'petites', 'petite', 'petit', 'grand', 'gros', 'grosse', 'beaux', 'beau'])

# Connectors of an alternative quantity ("2 ou 3", "2 à 3"): when one follows the
# first number, `parse` drops the connector together with the token after it.
mult = np.array(['ou', 'à'])

# NOTE(review): not referenced by any visible cell; `parse` uses a local of the same name.
quant = np.array(['paquet', 'sachet'])

# Approximate conversion of kitchen measures: word -> (factor, base unit).
mapping = {
    'cuillères': (10, 'ml'),
    'cuillerées': (10, 'ml'),
    'cuillère': (10, 'ml'),
    'cuillerée': (10, 'ml'),
    'pincées': (3, 'g'),
    'pincée': (3, 'g'),
    'poignées': (50, 'g'),
    'poignée': (50, 'g'),
    'verres': (300, 'ml'),
    'verre': (300, 'ml'),
}

In [ ]:
def number(s):
    """Parse a quantity token into a float.

    Accepts plain numbers with either a decimal point or a decimal comma
    ("2", "1.5", "1,5") and simple fractions that start with a digit ("1/2").

    Args:
        s: the token to parse (a string).

    Returns:
        The parsed value as a float, or ``False`` when the token is not a
        usable number. Callers test the result for truthiness, so a parsed
        zero is treated the same as "not a number".
    """
    s = s.replace(',', '.')
    try:
        value = float(s)
    except ValueError:
        # Only "a/b" with exactly one slash is a fraction; anything else
        # (e.g. "1/2/3") is rejected instead of failing on tuple unpacking.
        parts = s.split('/')
        if len(parts) != 2 or not s[0].isdigit():
            return False

        numerator, denominator = number(parts[0]), number(parts[1])
        # A zero or unparsable denominator comes back falsy, which also
        # rules out a division by zero.
        if not numerator or not denominator:
            return False

        return numerator / denominator

    # float() also accepts "nan" / "inf" / "infinity"; those are words here,
    # not quantities, and would end up as NaN in the indexed documents.
    return value if np.isfinite(value) else False

In [ ]:
def _servings(nbr_pers):
    """Return the serving count as a positive finite float, or None when it is unusable."""
    try:
        servings = float(nbr_pers)
    except (TypeError, ValueError):
        return None
    if not np.isfinite(servings) or servings <= 0:
        return None
    return servings


def _to_base_unit(quant, mesure):
    """Convert a (quantity, unit) pair to millilitres or grams when the unit is known."""
    if mesure is None:
        return quant, mesure

    # Kitchen measures (spoon, pinch, glass, ...) first become an approximate ml / g amount.
    if mesure in mapping:
        factor, mesure = mapping[mesure]
        quant *= factor

    # The position of a unit in its table is the exponent of the conversion factor.
    for units, base, step in ((litre_ml, "ml", 10), (poids_g, "g", 1000)):
        position = np.where(units == mesure)[0]
        if len(position):
            return quant * step ** int(position[0]), base

    return quant, mesure


def _split_glued(token, units):
    """Split a token such as '200g' or '1,5kg' into (quantity, unit); (None, None) if it is not one."""
    lowered = token.lower()
    for unit in units:
        if lowered.endswith(unit):
            value = number(lowered[:-len(unit)])
            if value:
                return value, unit
    return None, None


def parse(title, nbr_pers, ingredients):
    """Turn the raw ingredient lines of one recipe into structured entries.

    Each line is tokenised, stop words are removed, and a leading quantity
    (optionally followed by a unit, or glued to it as in "200g") is extracted.
    Known units are normalised to "ml" or "g", and the quantity is divided by
    the number of servings when that number is usable. Lines without a leading
    quantity are skipped.

    Args:
        title: recipe title, stored under ``'name'``.
        nbr_pers: number of servings; anything that is not a positive finite
            number leaves the quantities unscaled.
        ingredients: iterable of raw ingredient strings.

    Returns:
        ``{'name': title, 'ingredients': [...]}`` where every ingredient is a
        dict with the keys ``'quantity'``, ``'unit'`` (``None`` when no unit
        was recognised) and ``'content'``.
    """
    servings = _servings(nbr_pers)
    # Longest units first so that "kg" is tried before "g" and "ml" before "l".
    glued_units = sorted({str(unit) for unit in mesures} | set(mapping), key=len, reverse=True)

    res = {'name': title, 'ingredients': []}
    for ingredient in ingredients:
        tokens = [tok for tok in nltk.word_tokenize(ingredient) if tok not in stop_word]
        if not tokens:
            continue

        mesure = None
        num = number(tokens[0])

        if len(tokens) > 1 and num:
            quant = num
            rest = tokens[1:]

            # "2 ou 3 oeufs" / "2 à 3 oeufs": keep the first number and drop the
            # connector together with the alternative that follows it.
            if rest[0] in mult:
                rest = rest[2:]

            skip = 0
            if rest:
                candidate = rest[0].lower()
                if candidate in mesures or candidate in mapping:
                    mesure = candidate
                    skip = 1
                    # Spoon-like units carry a qualifier ("à soupe"); drop it only
                    # when it is really there so the ingredient name is not lost.
                    if mesure in long and rest[1:2] == ['à']:
                        skip = int(long_num[long == mesure][0])

            content = rest[skip:]
        elif tokens[0][0].isdigit():
            # Quantity glued to its unit ("200g"): the whole first token is consumed.
            quant, mesure = _split_glued(tokens[0], glued_units)
            content = tokens[1:]
        else:
            continue

        if quant is None:
            continue

        quant, mesure = _to_base_unit(quant, mesure)
        if servings:
            quant /= servings

        # Anything from the first opening parenthesis on is a side remark.
        if '(' in content:
            content = content[:content.index('(')]

        res['ingredients'].append({
            'quantity': quant,
            'unit': mesure,
            'content': " ".join(content)
        })

    return res

In [ ]:
# Parse every recipe. Rows come out in column order (title, ingredients, quantity),
# while `parse` expects (title, servings, ingredients).
res = [
    parse(recipe_title, servings, ingredient_lines)
    for recipe_title, ingredient_lines, servings in recipies.values
]

In [ ]:
# Wrap each parsed recipe in a bulk action for the 'recipes' index.
# NOTE(review): mapping types (`_type`) are no longer accepted by recent
# Elasticsearch releases -- confirm against the version of the target cluster.
entries = [
    {
        '_index': 'recipes',
        '_type': 'marmiton',
        '_source': parsed_recipe
    }
    for parsed_recipe in res
]

In [ ]:
# Connect to the cluster and send all actions in one bulk call.
# NOTE(review): 'TODO' is a placeholder host -- set the real address before running this cell.
client = Elasticsearch(hosts='TODO')
# As the last expression of the cell, the value returned by the bulk helper is displayed.
eshelper.bulk(client, entries)