In [ ]:
import pandas as pd
import numpy as np
import json
from bs4 import BeautifulSoup
import re
from elasticsearch import Elasticsearch
from elasticsearch import helpers as eshelper
In [ ]:
# Load the recipe dump into a DataFrame, then display its shape.
df = pd.read_json("../data/recipe1.json")
df.shape
In [ ]:
# Preview the first rows of the DataFrame.
df.head()
In [ ]:
# List the distinct values found in the 'quantity' column.
df.quantity.unique()
In [ ]:
# Keep only the rows whose 'quantity' text contains "pers" or "portion",
# then display the shape of what is left.
quantity_text = df['quantity'].str
df = df[quantity_text.contains('pers') | quantity_text.contains('portion')]
df.shape
In [ ]:
# Parse the first whitespace-separated token of each 'quantity' string as an
# integer.
quantity = df['quantity'].map(lambda label: int(label.split()[0]))
In [ ]:
# Swap the textual 'quantity' column for the parsed integer Series, then
# display the resulting shape.
without_quantity_text = df.drop('quantity', axis=1)
df = pd.concat([without_quantity_text, quantity], axis=1)
df.shape
In [ ]:
# Show the 'content' field of the first remaining row. Positional access is
# used because `.ix` no longer exists in current pandas, and because the row
# labelled 0 may have been removed by the filtering above.
df.iloc[0].content
In [ ]:
# (fraction text, decimal text) pairs, applied in this order. ASCII "a/b"
# spellings come first, followed by the Unicode vulgar-fraction glyphs.
_FRACTION_DECIMALS = (
    ('1/2', '0.5'),
    ('1/3', '0.3333'),
    ('2/3', '0.6666'),
    ('1/4', '0.25'),
    ('3/4', '0.75'),
    ('1/5', '0.2'),
    ('2/5', '0.4'),
    ('3/5', '0.6'),
    ('4/5', '0.8'),
    ('1/6', '0.1666'),
    ('5/6', '0.8333'),
    ('1/8', '0.125'),
    ('3/8', '0.375'),
    ('5/8', '0.625'),
    ('7/8', '0.875'),
    ('½', '0.5'),
    ('⅓', '0.3333'),
    ('⅔', '0.6666'),
    ('¼', '0.25'),
    ('¾', '0.75'),
    ('⅕', '0.2'),
    ('⅖', '0.4'),
    ('⅗', '0.6'),
    ('⅘', '0.8'),
    ('⅙', '0.1666'),
    ('⅚', '0.8333'),
    ('⅛', '0.125'),
    ('⅜', '0.375'),
    ('⅝', '0.625'),
    ('⅞', '0.875'),
)


def replace_frac(n):
    """Return ``n`` with common fractions rewritten as decimal strings.

    Every occurrence of a known fraction, written either as ASCII ``a/b``
    or as a single Unicode fraction character, is replaced by its decimal
    text (for example ``'1/2'`` and ``'½'`` both become ``'0.5'``). Text
    that contains no known fraction is returned unchanged.

    Args:
        n: The string to rewrite.

    Returns:
        The rewritten string.
    """
    for fraction, decimal in _FRACTION_DECIMALS:
        n = n.replace(fraction, decimal)
    return n
In [ ]:
# Build one dict per recipe holding its name, its quantity and the cleaned
# text of every <li> element found in its HTML content.
raw = []
# Matches a parenthesised aside such as "(facultatif)"; it is removed from
# each ingredient line.
delim_re = re.compile(r'\([^\)]+\)')
# Matches a leading "un " / "une "; it is rewritten to "1 " so the line
# starts with a number.
un_re = re.compile(r'^une?\s')
for _, r in df.iterrows():
    # NOTE(review): this is a plain substring test, so any content containing
    # the letters "li" passes, even when it holds no <li> element (the recipe
    # is then stored with an empty ingredient list).
    if 'li' in r.content:
        ings = [li.get_text().strip() for li in BeautifulSoup(r.content, 'html.parser').find_all('li')]
        raw.append({
            'name': r.recipe,
            'quantity': r.quantity,
            # Lower-case, drop parenthesised asides, normalise a leading
            # "un/une", then turn fractions into decimals.
            'ingredients': [replace_frac(un_re.sub('1 ', delim_re.sub('', ing.lower()))) for ing in ings],
        })
In [ ]:
# Regex fragments used to split an ingredient line into three groups:
# (number, unit, remaining text). Raw strings keep the backslash escapes
# intact without relying on Python's handling of unknown string escapes.
float_re = r'\s?([0-9]*[.]?[0-9]+)'
ing_re = r'(.+)'
# One compiled pattern per recognised unit. The final '()' entry has an empty
# unit group, so it matches any number followed by text.
units_re = [
    re.compile('{}{}{}'.format(float_re, u, ing_re)) for u in [
        r'\s?(dl)\s',
        r'\s?(l)\s',
        r'\s?(ml)\s',
        r'\s?(cl)\s',
        r'\s?(g)\s',
        r'\s?(kg)\s',
        r'\s?(c)\s?à\s[cs]\s',
        r'\s(cuillère)s?\s',
        r'\s(cuillerée)s?\s',
        r'\s(pincée)s?\s',
        r'\s(poignée)s?\s',
        r'\s(verre)s?\s',
        '()']
]
In [ ]:
# Maps each captured unit to (multiplier, target unit). A parsed quantity is
# multiplied by the multiplier and relabelled with the target unit, so every
# stored quantity ends up in either 'ml' or 'g'.
mapping = {
    'dl': (100, 'ml'),
    'l': (1000, 'ml'),
    'ml': (1, 'ml'),
    'cl': (10, 'ml'),
    'g': (1, 'g'),
    # The multiplier converts kilograms to grams, so the target unit is 'g'.
    'kg': (1000, 'g'),
    'c': (10, 'ml'),
    'cuillère': (10, 'ml'),
    'cuillerée': (10, 'ml'),
    'pincée': (3, 'g'),
    'poignée': (50, 'g'),
    'verre': (300, 'ml'),
}
In [ ]:
# Words (salt, pepper, oil, butter) whose ingredient lines are not reported in
# `not_matched` when no quantity pattern matches them.
too_vague = ['sel', 'poivre', 'huile', 'beurre']
In [ ]:
# Parse every raw ingredient line into {content, quantity, unit}, with the
# quantity divided by the recipe's own 'quantity' value.
#   recipes     -- one dict per raw recipe: its name and parsed ingredients
#   not_matched -- lines that matched no pattern and contain no too_vague word
#   c           -- count of all other lines (matched, or containing such a word)
recipes = []
not_matched = []
c = 0
for r in raw:
    recipe = {
        'name': r['name'],
        'ingredients': []
    }
    for ing in r['ingredients']:
        found = False
        # The first pattern that matches wins.
        for pttrn in units_re:
            res = pttrn.search(ing)
            if res:
                quantity, unit, content = res.groups()
                quantity = float(quantity)
                # A bare count ("3 oeufs") has an empty unit group: it needs
                # no unit conversion but is still divided like the others.
                coef = 1
                if unit:
                    coef, unit = mapping[unit]
                quantity *= float(coef) / r['quantity']
                recipe['ingredients'].append({
                    'content': content.strip(),
                    'quantity': quantity,
                    # An empty unit string is stored as None.
                    'unit': unit or None,
                })
                found = True
                break
        if not found and all(w not in ing for w in too_vague):
            not_matched.append(ing)
        else:
            c += 1
    recipes.append(recipe)
In [ ]:
# Number of recipe dicts that were built.
len(recipes)
In [ ]:
# Number of ingredient lines that were collected in not_matched.
len(not_matched)
In [ ]:
# Display the counter incremented for ingredient lines not added to not_matched.
c
In [ ]:
# Display the ingredient lines collected in not_matched.
not_matched
In [ ]:
# Display the first parsed recipe.
recipes[0]
In [ ]:
# Wrap each parsed recipe in a bulk action dict (index 'recipes', type '750g',
# the recipe itself as the document source), then display how many were built.
entries = [
    {
        '_index': 'recipes',
        '_type': '750g',
        '_source': recipe,
    }
    for recipe in recipes
]
len(entries)
In [ ]:
# Create the Elasticsearch client.
# NOTE(review): the hosts URL has a scheme but no host or port; supply the
# real endpoint before running.
client = Elasticsearch(hosts='http://')
In [ ]:
# Send every entry to Elasticsearch through the bulk helper.
eshelper.bulk(client, entries)
In [ ]: