In [ ]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from tqdm import tqdm
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords

In [ ]:
# Elasticsearch client used by the scan below.
# NOTE(review): the host URL is blank ('http://') — presumably redacted;
# set the real host (ideally from an env var, not hardcoded) before running.
client = Elasticsearch(hosts='http://')

In [ ]:
# Materialise every document of the 'analysis' index in memory, using the
# scroll/scan API so pagination is handled for us (no relevance scoring).
analysis = list(Search(using=client, index='analysis').scan())

In [ ]:
# French-specific text normalisation helpers: the NLTK French stopword list
# (as a set, for O(1) membership tests) and a Snowball stemmer.
stop = set(stopwords.words('french'))
stemmer = FrenchStemmer()

In [ ]:
def meal_match_score(name, match, stem_fn, stopword_set):
    """Crude match-quality score in [0, 1] for one meal.

    Stems the full source name and each non-stopword token of the matched
    name, then returns the fraction of matched tokens whose stem appears as
    a substring of the stemmed source name.

    Returns None when the matched name yields no usable tokens (every word
    is a stopword, or the string is empty), so callers can skip the meal
    instead of computing 0/0 -> nan.
    """
    stemmed_name = stem_fn(name)
    tokens = [stem_fn(t) for t in match.split() if t not in stopword_set]
    if not tokens:
        return None
    hits = sum(1 for t in tokens if t in stemmed_name)
    return hits / len(tokens)


scores = []

# Crude estimation: stem the source name and the matched name, then count
# how many matched-name tokens are contained in the source one.
# tqdm gives progress feedback instead of flooding the output with one
# print per meal; meals with no usable tokens are skipped so their nan
# scores cannot poison the mean/std computed below.
for a in tqdm(analysis):
    for meal in a.analysis:
        score = meal_match_score(meal.name, meal.match, stemmer.stem, stop)
        if score is not None:
            scores.append(score)

scores = np.array(scores)

In [ ]:
# Average match quality across all scored meals (last expression -> displayed).
np.mean(scores)

In [ ]:
# Spread of the match-quality scores (population std, ddof=0 — same as ndarray.std()).
np.std(scores)

In [ ]: