In [ ]:
import json
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from elasticsearch import helpers as eshelper

In [ ]:
# Load the recipe dump; the path is resolved against the current working directory.
recipes_path = "../data/recipe1.json"
df = pd.read_json(recipes_path)
df.shape

In [ ]:
# Preview the first rows of the freshly loaded table.
df.head()

Quantity


In [ ]:
# List the distinct values of the quantity column to see which formats occur.
df.quantity.unique()

In [ ]:
# Keep only the rows whose quantity text mentions "pers" or "portion".
mentions_pers = df.quantity.str.contains('pers')
mentions_portion = df.quantity.str.contains('portion')
df = df[mentions_pers | mentions_portion]
df.shape

In [ ]:
def _leading_int(text):
    """Return the first whitespace-separated token of ``text`` as an int."""
    return int(text.split()[0])


# Numeric version of the quantity column (same index and name as the original).
quantity = df.quantity.map(_leading_int)

In [ ]:
# Swap the textual quantity column for the numeric one computed above;
# the numeric column ends up as the last column of the frame.
without_text_quantity = df.drop('quantity', axis=1)
df = pd.concat([without_text_quantity, quantity], axis=1)
df.shape

In [ ]:
# Look at the HTML content of the first remaining row.
# `.ix` was removed from pandas; `.iloc[0]` is positional, so it still works
# when the row labelled 0 has been filtered out.
df.iloc[0].content

In [ ]:
# Ordered (fraction, decimal) pairs applied one after the other by replace_frac.
# ASCII spellings come first, followed by the single-character Unicode forms.
_FRACTION_DECIMALS = (
    ('1/2', '0.5'),
    ('1/3', '0.3333'),
    ('2/3', '0.6666'),
    ('1/4', '0.25'),
    ('3/4', '0.75'),
    ('1/5', '0.2'),
    ('2/5', '0.4'),
    ('3/5', '0.6'),
    ('4/5', '0.8'),
    ('1/6', '0.1666'),
    ('5/6', '0.8333'),
    ('1/8', '0.125'),
    ('3/8', '0.375'),
    ('5/8', '0.625'),
    ('7/8', '0.875'),
    ('½', '0.5'),
    ('⅓', '0.3333'),
    ('⅔', '0.6666'),
    ('¼', '0.25'),
    ('¾', '0.75'),
    ('⅕', '0.2'),
    ('⅖', '0.4'),
    ('⅗', '0.6'),
    ('⅘', '0.8'),
    ('⅙', '0.1666'),
    ('⅚', '0.8333'),
    ('⅛', '0.125'),
    ('⅜', '0.375'),
    ('⅝', '0.625'),
    ('⅞', '0.875'),
)


def replace_frac(n):
    """Replace common fractions in a string with their decimal spelling.

    Both the ASCII form (``'3/4'``) and the Unicode vulgar-fraction
    character (``'¾'``) are handled. Thirds and sixths are truncated to
    four decimals.

    Args:
        n: The text to rewrite.

    Returns:
        A new string in which every listed fraction is replaced by its
        decimal value; other text is left untouched.
    """
    for fraction, decimal in _FRACTION_DECIMALS:
        n = n.replace(fraction, decimal)
    return n

In [ ]:
# Build one record per recipe: name, number of servings and the cleaned
# text of every <li> element found in the recipe's HTML content.
raw = []
# Raw strings: '\(' and '\s' are regex escapes, not Python string escapes.
delim_re = re.compile(r'\([^\)]+\)')  # parenthesised asides inside an ingredient line
un_re = re.compile(r'^une?\s')        # leading "un " / "une ", rewritten to "1 "

for i, r in df.iterrows():
    html = r.content
    # Skip rows whose content is not text (e.g. a missing value) instead of
    # crashing on the substring test; the 'li' test itself is kept as-is.
    if not isinstance(html, str) or 'li' not in html:
        continue
    items = BeautifulSoup(html, 'html.parser').find_all('li')
    ings = [li.get_text().strip() for li in items]
    raw.append({
        'name': r.recipe,
        'quantity': r.quantity,
        # lowercase -> drop parenthesised text -> "un/une" to "1" -> fractions to decimals
        'ingredients': [
            replace_frac(un_re.sub('1 ', delim_re.sub('', ing.lower())))
            for ing in ings
        ],
    })

In [ ]:
# Regex fragments are raw strings so that '\s' reaches the regex engine as
# written instead of relying on Python passing an invalid escape through.

# Optional whitespace followed by an integer or a '.'-decimal (captured).
float_re = r'\s?([0-9]*[.]?[0-9]+)'
# Everything after the unit is captured as the ingredient text.
ing_re = r'(.+)'

# One fragment per recognised unit; each captures the unit keyword.
# The final empty group matches a bare number with no unit, so it must
# stay last to act as the fallback.
unit_patterns = [
    r'\s?(dl)\s',
    r'\s?(l)\s',
    r'\s?(ml)\s',
    r'\s?(cl)\s',
    r'\s?(g)\s',
    r'\s?(kg)\s',
    r'\s?(c)\s?à\s[cs]\s',
    r'\s(cuillère)s?\s',
    r'\s(cuillerée)s?\s',
    r'\s(pincée)s?\s',
    r'\s(poignée)s?\s',
    r'\s(verre)s?\s',
    r'()',
]

# Compiled "<number><unit><ingredient>" patterns, tried in this order.
units_re = [
    re.compile('{}{}{}'.format(float_re, u, ing_re)) for u in unit_patterns
]

In [ ]:
# Captured unit keyword -> (multiplier, canonical unit).
# Volumes are normalised to millilitres and weights to grams; the entries
# for spoons, pinches, handfuls and glasses are rough approximations.
mapping = {
    'dl': (100, 'ml'),
    'l': (1000, 'ml'),
    'ml': (1, 'ml'),
    'cl': (10, 'ml'),
    'g': (1, 'g'),
    # 1 kg = 1000 g: the multiplier converts to grams, so the unit must be 'g'.
    'kg': (1000, 'g'),
    'c': (10, 'ml'),
    'cuillère': (10, 'ml'),
    'cuillerée': (10, 'ml'),
    'pincée': (3, 'g'),
    'poignée': (50, 'g'),
    'verre': (300, 'ml'),
}

In [ ]:
# Words marking ingredient lines that are not reported as unmatched when
# no quantity pattern applies to them.
too_vague = [
    'sel',
    'poivre',
    'huile',
    'beurre',
]

In [ ]:
def _parse_ingredient(text, servings):
    """Parse one cleaned ingredient line into a per-serving record.

    The patterns in ``units_re`` are tried in order and the first match wins.

    Args:
        text: Cleaned ingredient line.
        servings: Number of servings of the recipe the line belongs to.

    Returns:
        A dict with ``content``, ``quantity`` (per serving) and ``unit``
        (``None`` when the line carries no unit), or ``None`` when no
        pattern matches.
    """
    for pattern in units_re:
        match = pattern.search(text)
        if match is None:
            continue
        amount, unit, content = match.groups()
        amount = float(amount)
        if unit:
            # Convert to the canonical unit and scale down to one serving.
            coef, unit = mapping[unit]
            amount *= float(coef) / servings
        else:
            # Unit-less counts ("3 oeufs") are scaled per serving as well, so
            # every stored quantity is expressed for a single serving.
            amount /= servings
        return {
            'content': content.strip(),
            'quantity': float(amount),
            'unit': unit if len(unit) else None,
        }
    return None


recipes = []       # one {'name', 'ingredients'} dict per entry of `raw`
not_matched = []   # lines no pattern matched and that contain no too_vague word
c = 0              # lines either parsed or skipped because of a too_vague word

for r in raw:
    recipe = {
        'name': r['name'],
        'ingredients': []
    }
    for ing in r['ingredients']:
        parsed = _parse_ingredient(ing, r['quantity'])
        if parsed is not None:
            recipe['ingredients'].append(parsed)

        if parsed is None and not any(w in ing for w in too_vague):
            not_matched.append(ing)
        else:
            c += 1

    recipes.append(recipe)

In [ ]:
# Number of recipes produced by the parsing loop.
len(recipes)

In [ ]:
# Number of ingredient lines that no pattern matched and that contain no too_vague word.
len(not_matched)

In [ ]:
# Count of ingredient lines that were parsed, or left unparsed but contain a too_vague word.
c

In [ ]:
# Show a bounded sample of the unmatched lines rather than dumping the whole list.
not_matched[:50]

In [ ]:
# Inspect the first parsed recipe as a quick sanity check.
recipes[0]

In [ ]:
# Wrap every parsed recipe in a bulk action targeting the 'recipes' index.
# NOTE(review): '_type' may not be accepted by recent Elasticsearch versions - confirm against the target cluster.
entries = [
    {
        '_index': 'recipes',
        '_type': '750g',
        '_source': recipe,
    }
    for recipe in recipes
]

len(entries)

In [ ]:
# 'http://' on its own names no host, so the URL is read from the
# ELASTICSEARCH_URL environment variable; this also keeps the cluster
# address out of the notebook. The fallback targets a local node.
ES_URL = os.environ.get('ELASTICSEARCH_URL', 'http://localhost:9200')
client = Elasticsearch(hosts=ES_URL)

In [ ]:
# Send all prepared actions through the bulk helper; the call's return value is shown as the cell output.
eshelper.bulk(client, entries)

In [ ]: