In [ ]:
%matplotlib inline
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from tqdm import tqdm
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords
In [ ]:
client = Elasticsearch(hosts='http://')
In [ ]:
analysis = [a for a in Search(using=client, index='analysis').scan()]
In [ ]:
# French Snowball stemmer plus the NLTK French stopword set, used to
# normalise tokens before comparing meal names with their matches.
stemmer = FrenchStemmer()
stop = {word for word in stopwords.words('french')}
In [ ]:
scores = []
# Crude estimation: stem the source name and the matched name, then count
# which matched (non-stopword) words are contained in the source name.


def _match_score(name, match, stem, stop_words):
    """Return the fraction of stemmed, non-stopword tokens of *match*
    that occur (as substrings) in the stemmed *name*.

    Parameters:
        name: source string the tokens are searched in.
        match: matched string; split on whitespace into tokens.
        stem: callable mapping a string to its stemmed form.
        stop_words: collection of stopwords filtered out of *match* tokens
            (filtering happens BEFORE stemming, as in the original code).

    Returns 0.0 when every token of *match* is a stopword — the naive
    division would yield nan (0/0) and silently poison mean()/std().
    """
    stemmed_name = stem(name)
    tokens = [stem(t) for t in match.split() if t not in stop_words]
    if not tokens:  # all-stopword match: avoid 0/0 -> nan
        return 0.0
    return sum(1 for t in tokens if t in stemmed_name) / len(tokens)


for a in analysis:
    for meal in a.analysis:
        scores.append(_match_score(meal.name, meal.match, stemmer.stem, stop))
        print(meal.name)
        print(meal.match)
scores = np.array(scores)
In [ ]:
scores.mean()
In [ ]:
scores.std()
In [ ]: