notebook.community

Edit and run



In [ ]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch import helpers as eshelper
import nltk
from nltk.tag import StanfordPOSTagger
from tqdm import tqdm



In [ ]:

    
# Load the Stanford POS tagger utilities (see readme).
# `jar` is the path to the tagger jar and `model` the path to the French
# tagging model; both are relative paths.
jar = '../data/stanford-postagger/stanford-postagger.jar'
model = '../data/stanford-postagger/models/french.tagger'
# Tagger instance configured for UTF-8 text; ingredients_enrichment uses it to
# drop adjectives from ingredient descriptions.
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')



In [ ]:

    
# Elasticsearch client used by the queries in this notebook.
# NOTE(review): the URL below contains no hostname — presumably removed before
# publishing; set the real cluster address before running.
client = Elasticsearch(hosts='http://')



In [ ]:

    
# materialise every hit yielded by scanning the 'restaurants' index
restaurants = list(Search(using=client, index='restaurants').scan())
len(restaurants)



In [ ]:

    
# some tests



In [ ]:

    
# recipes whose ingredient text matches 'asperge'
req = (
    Search(using=client, index='recipes')
    .query('match', **{'ingredients.content': 'asperge'})
)

# one name line, one ingredients line and an empty separator line per hit
for hit in req.execute():
    print(hit.name, hit.ingredients, '', sep='\n')



In [ ]:

    
# products whose name matches 'sachets de thé vert'
req = (
    Search(using=client, index='products')
    .query('match', name='sachets de thé vert')
)

# name, relevance score and nutriment names of each hit, then an empty line
for hit in req.execute():
    print(hit.name, hit.meta.score, [x.name for x in hit.nutriments], '', sep='\n')



In [ ]:

    
# select restaurants that will be analyzed



In [ ]:

    
# keep only the restaurants whose 'city' field matches 'lyon'
restaurants = list(
    Search(using=client, index='restaurants')
    .query('match', city='lyon')
    .scan()
)
len(restaurants)



In [ ]:

    
# Overview of the naive query results: for every main course of every selected
# restaurant, print the recipe matched on the course name, then the products
# matched on that recipe name, then each product's nutriments.
for restaurant in restaurants:

    print('- Restaurant: {}'.format(restaurant.name))
    if 'mains' not in restaurant:
        # nothing to analyse for restaurants without main courses
        continue

    for main in restaurant.mains:

        print('  - Plat: {}'.format(main.name))
        # the [0] index on the Search restricts the request before it is executed
        recipes = list(
            Search(using=client, index='recipes')
            .query('match', name=main.name)[0]
            .execute()
        )
        recipes_match = recipes

        for recipe in recipes_match:

            print('    - Recette: {}'.format(recipe.name))
            # naive: products are looked up with the recipe name itself
            ings = list(
                Search(using=client, index='products')
                .query('match', name=recipe.name)
                .execute()
            )
            ings_match = ings

            for ing in ings_match:
                print('      - Ingredient: {}'.format(ing.name))

                for nut in ing.nutriments:
                    print("        - Nutriment: {} {}{}".format(nut.name, nut.per_hundred, nut.unit))



In [ ]:



In [ ]:

    
# search recipes that match given meal name and that have ingredients with units
def search_fielddata(index, term, fields=None, size=10):
    """Run a `match` query on `name` and return the raw hits.

    Hits are post-filtered so that only documents having an
    `ingredients.unit` field remain.

    Args:
        index: name of the Elasticsearch index to query.
        term: text matched against the `name` field.
        fields: field names sent as `fielddata_fields`; defaults to an empty
            list. `None` is used as the default sentinel so that no list
            object is shared between calls.
        size: maximum number of hits requested.

    Returns:
        The `['hits']['hits']` list of the search response.
    """
    body = {
        'query': {
            'match': {
                'name': term
            }
        },
        'post_filter': {
            'exists': {
                'field': 'ingredients.unit'
            }
        },
        'size': size,
        'fielddata_fields': [] if fields is None else fields,
    }
    return client.search(index=index, body=body)['hits']['hits']



In [ ]:

    
# score and select best recipes among all results
def recipes_enrichment(term, min_occurence=2, min_ing=5, prnt=False):
    """Fetch up to 20 recipes matching `term` and score them by shared tokens.

    Args:
        term: meal name searched in the 'recipes' index.
        min_occurence: minimum number of appearances for an ingredient token
            to be considered frequent.
        min_ing: minimum number of frequent tokens a recipe must contain to be
            part of the selection.
        prnt: when true, print the names of the selected recipes.

    Returns:
        The full list of hits as returned by `search_fielddata`. The scored
        selection is only used for the optional printout; it is not what is
        returned.
    """
    recipes = search_fielddata('recipes', term, ['ingredients.content'], 20)

    # number of appearances of each raw Elasticsearch token over all hits
    token_counts = {}
    for hit in recipes:
        for token in hit['fields']['ingredients.content']:
            token_counts[token] = token_counts.get(token, 0) + 1

    # tokens that appear often enough
    ings_trendy = {
        token for token, count in token_counts.items() if count >= min_occurence
    }

    # (number of frequent tokens, hit) for every recipe, best first
    scored = [
        (
            sum(1 for token in hit['fields']['ingredients.content'] if token in ings_trendy),
            hit,
        )
        for hit in recipes
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    recipes_trendy = [hit for matches, hit in scored if matches >= min_ing]

    if prnt:
        print([r['_source']['name'] for r in recipes_trendy])

    return recipes



In [ ]:

    
# search with boosting criteria on the term and on having a unit, then multiply
# by the inverse length (avoid long partial matches)
def search_boost(index, term, unit, size=10):
    """Search `index` for `term` with boosted optional clauses; return raw hits.

    The `name` field must match `term` (boost 4). Two optional clauses may
    raise the score: a match of `unit` on the `unit` field and a `quantity`
    between 1 and 250 (boost 4). The resulting score is divided by
    `doc["name"].length` in a painless script.

    Args:
        index: name of the Elasticsearch index to query.
        term: text matched against the `name` field.
        unit: unit text for the optional clause; a falsy value is sent as ''.
        size: maximum number of hits requested.

    Returns:
        The `['hits']['hits']` list of the search response.
    """
    name_clause = {
        'match': {
            'name': {
                'query': term,
                'boost': 4
            }
        }
    }
    unit_clause = {
        'match': {
            'unit': unit or ''
        }
    }
    quantity_clause = {
        'range': {
            'quantity': {
                'lte': 250,
                'gte': 1,
                'boost': 4
            }
        }
    }
    # the script rescales the score computed by the bool query
    length_script = {
        'script': {
            'lang': 'painless',
            'inline': '_score * 1.0 / doc["name"].length'
        }
    }

    body = {
        'query': {
            'function_score': {
                'query': {
                    'bool': {
                        'must': [name_clause],
                        'should': [unit_clause, quantity_clause],
                        'minimum_should_match': 0
                    }
                },
                'script_score': length_script
            }
        },
        'size': size
    }
    response = client.search(index=index, body=body)
    return response['hits']['hits']



In [ ]:

    
# estimate quantity of ingredient when having no unit
def estimate_quantity(index, term, percentile='50.0'):
    """Return a percentile of `ingredients.quantity` for recipes using `term`.

    A nested query on `ingredients` selects documents having an ingredient
    whose content matches `term` and that has a unit; a nested percentiles
    aggregation on `ingredients.quantity` is computed alongside it.

    Args:
        index: name of the Elasticsearch index to query.
        term: ingredient text matched against `ingredients.content`.
        percentile: key of the wanted percentile in the aggregation result,
            given as a string such as '50.0'.

    Returns:
        The value stored under `percentile`, or 0 when that value is 'NaN'.
    """
    matching_ingredient = {
        "bool": {
            "must": [
                {
                    "match": {
                        "ingredients.content": term
                    }
                },
                {
                    "exists": {
                        "field": "ingredients.unit"
                    }
                }
            ]
        }
    }
    quantity_percentiles = {
        "quantity": {
            "percentiles": {
                "field": "ingredients.quantity"
            }
        }
    }
    body = {
        "query": {
            "nested": {
                "path": "ingredients",
                "query": matching_ingredient
            }
        },
        "aggs": {
            "average": {
                "nested": {
                    "path": "ingredients"
                },
                "aggs": quantity_percentiles
            }
        },
        "_source": ""
    }

    res = client.search(index=index, body=body)
    per = res['aggregations']['average']['quantity']['values'][percentile]
    if per == 'NaN':
        return 0
    return per



In [ ]:

    
# merge nutriment dicts, ignoring values whose unit differs (if it occurs); can average
def merge_nutriments(arr, avg=False):
    """Sum several nutriment dicts into one, optionally averaging.

    Args:
        arr: iterable of dicts mapping a nutriment name to a dict with the
            keys 'quantity' and 'unit'. Falsy items (None, {}) are skipped.
        avg: when true, every summed quantity is divided by the number of
            non-falsy items of `arr`.

    Returns:
        A dict mapping each nutriment name to {'quantity', 'unit'}. The unit
        is the first truthy unit met for that nutriment; quantities expressed
        in another unit are left out of the sum.
    """
    entries = [item for item in arr if item]
    names = {nut for nutriments in entries for nut in nutriments}
    divisor = len(entries) if avg else 1.0

    merged = {}
    for nut_name in names:
        quantity = 0.0
        unit = None

        for nutriments in entries:
            if nut_name not in nutriments:
                continue
            entry = nutriments[nut_name]
            # keep the current unit once a truthy one has been seen
            unit = unit or entry['unit']
            if unit == entry['unit']:
                quantity += entry['quantity']

        merged[nut_name] = {'quantity': quantity / divisor, 'unit': unit}

    return merged



In [ ]:

    
# pretty print
def pretty_nutriments(nuts):
    """Print one line per nutriment: padded name, quantity (2 decimals), unit.

    Args:
        nuts: dict mapping a nutriment name to a dict with the keys
            'quantity' and 'unit'.
    """
    rows = []
    for label, entry in nuts.items():
        rows.append('{:25}{:.2f}\t{}'.format(label, entry['quantity'], entry['unit']))
    print('\n'.join(rows))



In [ ]:

    
# score and select best products by boosting search and quantity estimation when needed
def ingredients_enrichment(recipe, prnt=False, log=False):
    """Compute the merged nutriments of a recipe from its ingredients.

    For every ingredient, the best product returned by `search_boost` is
    selected and its per-hundred nutriment values are scaled by the ingredient
    quantity. The per-ingredient results are then summed with
    `merge_nutriments`.

    Args:
        recipe: raw Elasticsearch hit; its `_source` provides `name` and
            `ingredients` (each with `content`, `unit` and `quantity`).
        prnt: when true, print intermediate values for every ingredient.
        log: when true, print the recipe name and the final nutriments.

    Returns:
        The dict produced by `merge_nutriments` for all ingredients that had
        at least one matching product.
    """
    source = recipe['_source']
    ingredients = source['ingredients']

    if log:
        print(source['name'])

    ingredients_nutriments = []

    for ingredient in ingredients:

        # remove adjectives (such as small/big); can be an issue with colours
        # but generally improves the results
        content = ingredient['content']
        pos = pos_tagger.tag(content.split())
        content_cleaned = ' '.join(word for word, tag in pos if tag != 'ADJ')
        unit = ingredient['unit']
        quantity = ingredient['quantity']

        # nutriment quantities cannot be related to an ingredient without a
        # unit, so the quantity is estimated from other recipes instead
        if not unit:
            quantity = estimate_quantity('recipes_nested', content)

        if prnt:
            for shown in ('======', pos, content_cleaned, quantity, unit, '==='):
                print(shown)

        products = search_boost('products', content_cleaned, unit, 5)
        if not products:
            # no product found: this ingredient contributes nothing
            continue

        # only the first (best scored) product is used
        best = products[0]
        product = best['_source']

        # rescale by the product's own reference quantity when it has one
        nutirment_quantity = product['quantity']
        factor = quantity
        if nutirment_quantity:
            factor *= 100.0 / nutirment_quantity

        if prnt:
            print(product['name'])
            print(nutirment_quantity)
            print(product['unit'])
            print(best['_score'])

        selected_nutriments = {
            n['name']: {
                'unit': n['unit'],
                'quantity': n['per_hundred'] / 100.0 * factor
            }
            for n in product['nutriments']
        }

        if prnt:
            print(selected_nutriments)
        ingredients_nutriments.append(selected_nutriments)

    final_nutriments = merge_nutriments(ingredients_nutriments)

    if log:
        pretty_nutriments(final_nutriments)

    return final_nutriments



In [ ]:



In [ ]:

    
# map meal to recipe and ingredient to product for getting nutriment
def meal(name, min_occurence=2, min_ing=5):
    """Return the nutriments of the first recipe matching a meal name.

    Args:
        name: meal name looked up with `recipes_enrichment`.
        min_occurence: forwarded to `recipes_enrichment`.
        min_ing: forwarded to `recipes_enrichment`.

    Returns:
        A `(nutriments, recipe_name)` tuple. When no recipe is found the tuple
        is `(None, None)`, so callers that unpack two values keep working
        instead of failing on a bare `None`; `merge_nutriments` skips the
        `None` nutriments.
    """
    candidates = recipes_enrichment(name, min_occurence=min_occurence, min_ing=min_ing)
    if not candidates:
        return None, None
    best = candidates[0]
    return ingredients_enrichment(best), best['_source']['name']



In [ ]:



In [ ]:

    
# aggregate meal for given restaurant
def resto(rest):
    """Build the nutritional report of one restaurant.

    Args:
        rest: restaurant hit exposing `meta.id`, `name` and, optionally, the
            course lists `starters`, `mains` and `desserts`; each course has a
            `name` and a `price`.

    Returns:
        A dict with the restaurant `id` and `name`, an `analysis` list holding
        one entry per course, one averaged nutriment dict per course category
        present in `rest`, and a `total` averaged over all courses.
    """
    bilan = {
        'id': rest.meta.id,
        'name': rest.name,
        'analysis': []
    }

    total = []

    # same processing for each course category, in menu order
    for cat in ('starters', 'mains', 'desserts'):
        if cat not in rest:
            continue

        local = []
        for m in getattr(rest, cat):
            nutriments, match = meal(m['name'])

            local.append(nutriments)
            total.append(nutriments)
            bilan['analysis'].append({
                'name': m.name,
                'match': match,
                'price': m.price,
                'nutriments': nutriments,
                'cat': cat
            })
        # average over the courses of this category
        bilan[cat] = merge_nutriments(local, avg=True)

    bilan['total'] = merge_nutriments(total, avg=True)
    return bilan



In [ ]:

    
# Analyse each selected restaurant and index the report into Elasticsearch.
# A failure on one restaurant is printed and does not stop the others.
for r in tqdm(restaurants):
    try:
        src = resto(r)
        fmt = dict(_index='analysis', _type='fst', _source=src)
        eshelper.bulk(client, [fmt])
    except Exception as e:
        # report which restaurant failed (when it has a name), then the error
        if 'name' in r:
            print(r.name)
        print(e, '===', sep='\n')



In [ ]: