In [ ]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from tqdm import tqdm
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords

In [ ]:
# Elasticsearch client used by the scan below.
# NOTE(review): the host URL is blank ('http://') — presumably redacted;
# set the real host (ideally from an env var, not hardcoded) before running.
client = Elasticsearch(hosts='http://')

In [ ]:
# Materialise every document of the 'analysis' index in memory, using the
# scroll/scan API so pagination is handled for us (no relevance scoring).
analysis = list(Search(using=client, index='analysis').scan())

In [ ]:
# French-specific text normalisation helpers: the NLTK French stopword list
# (as a set, for O(1) membership tests) and a Snowball stemmer.
stop = set(stopwords.words('french'))
stemmer = FrenchStemmer()

In [ ]:
def meal_match_score(name, match, stem_fn, stopword_set):
    """Crude match-quality score in [0, 1] for one meal.

    Stems the full source name and each non-stopword token of the matched
    name, then returns the fraction of matched tokens whose stem appears as
    a substring of the stemmed source name.

    Returns None when the matched name yields no usable tokens (every word
    is a stopword, or the string is empty), so callers can skip the meal
    instead of computing 0/0 -> nan.
    """
    stemmed_name = stem_fn(name)
    tokens = [stem_fn(t) for t in match.split() if t not in stopword_set]
    if not tokens:
        return None
    hits = sum(1 for t in tokens if t in stemmed_name)
    return hits / len(tokens)


scores = []

# Crude estimation: stem the source name and the matched name, then count
# how many matched-name tokens are contained in the source one.
# tqdm gives progress feedback instead of flooding the output with one
# print per meal; meals with no usable tokens are skipped so their nan
# scores cannot poison the mean/std computed below.
for a in tqdm(analysis):
    for meal in a.analysis:
        score = meal_match_score(meal.name, meal.match, stemmer.stem, stop)
        if score is not None:
            scores.append(score)

scores = np.array(scores)

In [ ]:
# Average match quality across all scored meals (last expression -> displayed).
np.mean(scores)

In [ ]:
# Spread of the match-quality scores (population std, ddof=0 — same as ndarray.std()).
np.std(scores)

In [ ]: