In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch import helpers as eshelper
import nltk
from nltk.tag import StanfordPOSTagger
from tqdm import tqdm
In [ ]:
# Stanford POS tagger setup (see readme): both files are referenced through
# relative paths, and the model file is the French tagger.
model = '../data/stanford-postagger/models/french.tagger'
jar = '../data/stanford-postagger/stanford-postagger.jar'
pos_tagger = StanfordPOSTagger(
    model_filename=model,
    path_to_jar=jar,
    encoding='utf8',
)
In [ ]:
# Elasticsearch client shared by the queries of this notebook.
# NOTE(review): the hosts URL carries no hostname or port -- presumably it was
# redacted; confirm and fill in the real address before running.
client = Elasticsearch(hosts='http://')
In [ ]:
# Materialise every document yielded by a scan over the 'restaurants' index.
restaurants = list(Search(using=client, index='restaurants').scan())
len(restaurants)
In [ ]:
# some tests
In [ ]:
# Smoke test: recipes whose 'ingredients.content' field matches 'asperge'.
req = Search(using=client, index='recipes')
req = req.query('match', **{'ingredients.content': 'asperge'})
for hit in req.execute():
    # name, ingredients, then a blank separator line
    print(hit.name, hit.ingredients, '', sep='\n')
In [ ]:
# Smoke test: products whose name matches 'sachets de thé vert'.
req = Search(using=client, index='products')
req = req.query('match', name='sachets de thé vert')
for hit in req.execute():
    # name, match score, nutriment names, then a blank separator line
    print(hit.name, hit.meta.score, [x.name for x in hit.nutriments], '', sep='\n')
In [ ]:
# select restaurants that will be analyzed
In [ ]:
# Keep only the restaurants whose 'city' field matches 'lyon'.
lyon_search = Search(using=client, index='restaurants').query('match', city='lyon')
restaurants = list(lyon_search.scan())
len(restaurants)
In [ ]:
# overview of all naive query results:
# main dish -> first recipe hit by name -> product hits by recipe name -> nutriments
for restaurant in restaurants:
    print('- Restaurant: {}'.format(restaurant.name))
    if 'mains' not in restaurant:
        continue
    for main in restaurant.mains:
        print(' - Plat: {}'.format(main.name))
        # slicing the search with [0] limits it to its first hit
        recipe_search = Search(using=client, index='recipes').query('match', name=main.name)[0]
        recipes = list(recipe_search.execute())
        for recipe in recipes:
            print(' - Recette: {}'.format(recipe.name))
            product_search = Search(using=client, index='products').query('match', name=recipe.name)
            ings = list(product_search.execute())
            for ing in ings:
                print(' - Ingredient: {}'.format(ing.name))
                for nut in ing.nutriments:
                    print(" - Nutriment: {} {}{}".format(nut.name, nut.per_hundred, nut.unit))
In [ ]:
In [ ]:
# search recipes that match a given meal name and that have ingredients with units
def search_fielddata(index, term, fields=None, size=10):
    """Match ``term`` on the 'name' field and return the raw hits.

    Hits are post-filtered so that only documents in which the field
    'ingredients.unit' exists are returned.

    index  -- name of the Elasticsearch index to query
    term   -- text matched against the 'name' field
    fields -- field names requested through 'fielddata_fields'
              (defaults to an empty list)
    size   -- maximum number of hits requested
    Returns the list found under ['hits']['hits'] of the response.
    """
    # None sentinel instead of a shared mutable default list
    requested_fields = [] if fields is None else fields
    body = {
        'query': {
            'match': {
                'name': term
            }
        },
        'post_filter': {
            'exists': {
                'field': 'ingredients.unit'
            }
        },
        'size': size,
        'fielddata_fields': requested_fields
    }
    return client.search(index=index, body=body)['hits']['hits']
In [ ]:
# score and select the best recipes among all results
def recipes_enrichment(term, min_occurence=2, min_ing=5, prnt=False):
    """Search recipes by name and keep those made of frequently seen tokens.

    Up to 20 recipes are fetched with ``search_fielddata``. Every token of
    their 'ingredients.content' fielddata is counted across all hits; tokens
    seen at least ``min_occurence`` times are considered "trendy". Each recipe
    is then scored by the number of its tokens that are trendy, and the
    recipes scoring at least ``min_ing`` are returned, best score first.

    term          -- meal name to search for
    min_occurence -- minimum number of appearances for a token to be trendy
    min_ing       -- minimum number of trendy tokens a recipe must contain
    prnt          -- print the names of the selected recipes
    Returns the selected raw hits (possibly an empty list).
    """
    from collections import Counter

    recipes = search_fielddata('recipes', term, ['ingredients.content'], 20)

    # count how often each raw Elasticsearch token appears over all the hits
    token_counts = Counter(
        token
        for recipe in recipes
        for token in recipe['fields']['ingredients.content']
    )
    trendy_tokens = {
        token for token, count in token_counts.items() if count >= min_occurence
    }

    # score each recipe by the number of trendy tokens it contains
    scored = [
        (
            sum(1 for token in recipe['fields']['ingredients.content']
                if token in trendy_tokens),
            recipe,
        )
        for recipe in recipes
    ]
    # the sort is stable, so equally scored recipes keep the search order
    scored.sort(key=lambda pair: pair[0], reverse=True)
    recipes_trendy = [recipe for score, recipe in scored if score >= min_ing]

    if prnt:
        print([r['_source']['name'] for r in recipes_trendy])
    # return the selection: the thresholds were previously computed and then
    # ignored because the unfiltered search results were returned instead
    return recipes_trendy
In [ ]:
# search with boosting criteria: the term, having a unit, and a score divided by
# the name length (to avoid long partial matches)
def search_boost(index, term, unit, size=10):
    """Run a boosted name search and return the raw hits.

    The 'name' match on ``term`` is mandatory (boost 4). Two optional clauses
    can raise the score: a match of the 'unit' field on ``unit`` (an empty
    string when ``unit`` is falsy) and a 'quantity' between 1 and 250
    (boost 4). The resulting score is divided by ``doc["name"].length`` in a
    painless script -- presumably the number of indexed values of the name;
    confirm against the index mapping.

    index -- name of the Elasticsearch index to query
    term  -- text matched against the 'name' field
    unit  -- unit to favour, may be None or empty
    size  -- maximum number of hits requested
    Returns the list found under ['hits']['hits'] of the response.
    """
    name_clause = {
        'match': {
            'name': {
                'query': term,
                'boost': 4
            }
        }
    }
    unit_clause = {
        'match': {
            'unit': unit or ''
        }
    }
    quantity_clause = {
        'range': {
            'quantity': {
                'lte': 250,
                'gte': 1,
                'boost': 4
            }
        }
    }
    boosted_query = {
        'bool': {
            'must': [name_clause],
            'should': [unit_clause, quantity_clause],
            'minimum_should_match': 0
        }
    }
    length_penalty = {
        'script': {
            'lang': 'painless',
            'inline': '_score * 1.0 / doc["name"].length'
        }
    }
    body = {
        'query': {
            'function_score': {
                'query': boosted_query,
                'script_score': length_penalty
            }
        },
        'size': size
    }
    response = client.search(index=index, body=body)
    return response['hits']['hits']
In [ ]:
# estimate the quantity of an ingredient that comes without a unit
def estimate_quantity(index, term, percentile='50.0'):
    """Return a percentile of 'ingredients.quantity' for recipes using ``term``.

    Documents are selected with a nested query on 'ingredients' requiring a
    match of 'ingredients.content' on ``term`` and an existing
    'ingredients.unit'. A nested percentiles aggregation over
    'ingredients.quantity' is then read at the key ``percentile``.

    NOTE(review): the aggregation appears to cover every nested ingredient of
    the selected documents, not only the ones matching ``term`` -- confirm
    this is intended.

    index      -- name of the Elasticsearch index to query
    term       -- ingredient text to look for
    percentile -- key of the percentile to read in the response
    Returns the percentile value, or 0 when the response holds 'NaN'.
    """
    ingredient_filter = {
        "bool": {
            "must": [
                {
                    "match": {
                        "ingredients.content": term
                    }
                },
                {
                    "exists": {
                        "field": "ingredients.unit"
                    }
                }
            ]
        }
    }
    quantity_percentiles = {
        "quantity": {
            "percentiles": {
                "field": "ingredients.quantity"
            }
        }
    }
    body = {
        "query": {
            "nested": {
                "path": "ingredients",
                "query": ingredient_filter
            }
        },
        "aggs": {
            "average": {
                "nested": {
                    "path": "ingredients"
                },
                "aggs": quantity_percentiles
            }
        },
        "_source": ""
    }
    response = client.search(index=index, body=body)
    value = response['aggregations']['average']['quantity']['values'][percentile]
    if value == 'NaN':
        return 0
    return value
In [ ]:
# merge nutriment dicts, ignoring entries whose unit differs; can average
def merge_nutriments(arr, avg=False):
    """Sum (or average) several nutriment dicts into a single one.

    Falsy entries of ``arr`` (None, empty dicts) are dropped first. For each
    nutriment name, the unit is taken from the first entry providing a truthy
    unit; quantities of entries with another unit are skipped.

    arr -- iterable of dicts mapping a name to {'quantity': ..., 'unit': ...}
    avg -- divide every total by the number of kept entries instead of 1.0
    Returns a dict mapping each name to {'quantity': total, 'unit': unit}.
    """
    entries = [entry for entry in arr if entry]
    divisor = len(entries) if avg else 1.0

    names = set()
    for entry in entries:
        names.update(entry.keys())

    merged = {}
    for name in names:
        total = 0.0
        unit = None
        for entry in entries:
            if name not in entry:
                continue
            nutriment = entry[name]
            # the first truthy unit met becomes the reference unit
            unit = unit or nutriment['unit']
            if unit == nutriment['unit']:
                total += nutriment['quantity']
        merged[name] = {'quantity': total / divisor, 'unit': unit}
    return merged
In [ ]:
# pretty print
def pretty_nutriments(nuts):
    """Print one line per nutriment.

    Each line holds the name padded to 25 columns, the quantity with two
    decimals, a tab and the unit. An empty dict prints a single empty line.

    nuts -- dict mapping a name to {'quantity': ..., 'unit': ...}
    """
    rows = []
    for name, info in nuts.items():
        rows.append('{:25}{:.2f}\t{}'.format(name, info['quantity'], info['unit']))
    print('\n'.join(rows))
In [ ]:
# score and select the best products with a boosted search, estimating quantities when needed
def ingredients_enrichment(recipe, prnt=False, log=False):
    """Compute the nutriments of a recipe hit from its best-matching products.

    For every ingredient, the tokens tagged 'ADJ' by the POS tagger are
    removed (small/big...; this can hurt with colours but generally improves
    the results) and the cleaned text is searched in the 'products' index with
    ``search_boost``. Only the first hit is used: its per-hundred nutriment
    values are scaled by the ingredient quantity. Ingredients without a unit
    get their quantity from ``estimate_quantity``; ingredients without any
    product hit contribute nothing.

    recipe -- raw recipe hit exposing '_source' with 'name' and 'ingredients'
    prnt   -- print the intermediate values of every ingredient
    log    -- print the recipe name and the merged nutriments
    Returns the dict built by ``merge_nutriments`` (summed, not averaged).
    """
    source = recipe['_source']
    ingredients = source['ingredients']
    if log:
        print(source['name'])

    per_ingredient = []
    for ingredient in ingredients:
        content = ingredient['content']
        tagged = pos_tagger.tag(content.split())
        content_cleaned = ' '.join(word for word, tag in tagged if tag != 'ADJ')

        unit = ingredient['unit']
        quantity = ingredient['quantity']
        # nutriment quantities cannot be related to an ingredient without unit
        if not unit:
            quantity = estimate_quantity('recipes_nested', content)
        if prnt:
            print('======', tagged, content_cleaned, quantity, unit, '===', sep='\n')

        hits = search_boost('products', content_cleaned, unit, 5)
        if not hits:
            continue

        # only the best hit is used; it also drives the quantity conversion
        best = hits[0]
        product = best['_source']
        product_quantity = product['quantity']
        if product_quantity:
            factor = quantity * (100.0 / product_quantity)
        else:
            factor = quantity
        if prnt:
            print(product['name'], product_quantity, product['unit'], best['_score'], sep='\n')

        scaled = {
            n['name']: {
                'unit': n['unit'],
                'quantity': n['per_hundred'] / 100.0 * factor
            }
            for n in product['nutriments']
        }
        if prnt:
            print(scaled)
        per_ingredient.append(scaled)

    final_nutriments = merge_nutriments(per_ingredient)
    if log:
        pretty_nutriments(final_nutriments)
    return final_nutriments
In [ ]:
In [ ]:
# map a meal to a recipe and its ingredients to products to get the nutriments
def meal(name, min_occurence=2, min_ing=5):
    """Return the nutriments of the first recipe found for a meal name.

    name          -- meal name handed to ``recipes_enrichment``
    min_occurence -- forwarded to ``recipes_enrichment``
    min_ing       -- forwarded to ``recipes_enrichment``
    Returns a ``(nutriments, recipe_name)`` tuple built from the first recipe,
    or None when ``recipes_enrichment`` returns nothing.
    """
    candidates = recipes_enrichment(name, min_occurence=min_occurence, min_ing=min_ing)
    if not len(candidates):
        return None
    first = candidates[0]
    return ingredients_enrichment(first), first['_source']['name']
In [ ]:
In [ ]:
# aggregate the meals of a given restaurant
def resto(rest):
    """Build the nutritional summary of one restaurant.

    For each of the categories 'starters', 'mains' and 'desserts' present in
    ``rest``, every dish is resolved with ``meal`` and recorded under
    'analysis'; the category entry holds the average of its dishes and
    'total' the average over all dishes.

    rest -- restaurant hit exposing ``meta.id``, ``name`` and, optionally, the
            category lists whose items expose ``name`` and ``price``
    Returns a dict with the keys 'id', 'name', 'analysis', one key per
    category present, and 'total'.
    """
    bilan = {
        'id': rest.meta.id,
        'name': rest.name,
        'analysis': []
    }
    total = []
    for cat in ('starters', 'mains', 'desserts'):
        if cat not in rest:
            continue
        local = []
        for m in getattr(rest, cat):
            # meal() returns None when no recipe is found: record the dish
            # with empty results instead of failing on the tuple unpacking
            # (merge_nutriments drops the None entries)
            nutriments, match = meal(m['name']) or (None, None)
            local.append(nutriments)
            total.append(nutriments)
            bilan['analysis'].append({
                'name': m.name,
                'match': match,
                'price': m.price,
                'nutriments': nutriments,
                'cat': cat
            })
        bilan[cat] = merge_nutriments(local, avg=True)
    bilan['total'] = merge_nutriments(total, avg=True)
    return bilan
In [ ]:
# process the selected restaurants and store each result into elasticsearch
for r in tqdm(restaurants):
    try:
        # one bulk action per restaurant, written to index 'analysis' with type 'fst'
        fmt = {
            '_index': 'analysis',
            '_type': 'fst',
            '_source': resto(r)
        }
        eshelper.bulk(client, [fmt])
    except Exception as e:
        # best effort: report the failing restaurant and go on with the next one
        if 'name' in r:
            print(r.name)
        print(e, '===', sep='\n')
In [ ]: