In [1]:
import pandas as pd
# Let pandas render up to 100 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 100)
In [2]:
from horacy import HoracyModel
# Build a HoracyModel for the 'model_4000/' directory (a relative path, so it is
# resolved against the notebook's working directory) and keep what load() returns.
# NOTE(review): presumably load() returns the loaded model itself — confirm.
model = HoracyModel('model_4000/').load()
In [25]:
# Query the model's tf-idf index for the term 'ventil' and tabulate the answer.
results = model.tfidf_query('ventil')
# zip(*results) regroups the returned sequences element-wise into rows.
# NOTE(review): assumes results holds three parallel sequences matching the
# column names below — confirm against HoracyModel.tfidf_query.
df = pd.DataFrame(zip(*results),columns=['id','token','df'])
# A bare last expression makes the notebook render the frame.
df
Out[25]:
In [30]:
# Query the model's tf-idf index for the term 'mechanical' and tabulate the answer.
# This overwrites the `results` and `df` names bound by any earlier query cell.
results = model.tfidf_query('mechanical')
# zip(*results) regroups the returned sequences element-wise into rows.
# NOTE(review): assumes results holds three parallel sequences matching the
# column names below — confirm against HoracyModel.tfidf_query.
df = pd.DataFrame(zip(*results),columns=['id','token','df'])
# A bare last expression makes the notebook render the frame.
df
Out[30]:
In [8]:
def multi_query(*args):
    """Return the ids common to the results of every query in ``args``.

    Each argument is a string of the form ``'query'`` or ``'query ~~ exclude'``.
    It is split on the first ``' ~~ '``; when the separator is absent the
    exclude part is the empty string. Both parts are handed to
    ``model.inverted_query``. The first query's result seeds the set and each
    later result is intersected into it.

    Returns a ``set``; it is empty when no arguments are given.
    """
    matched = set()
    for position, spec in enumerate(args):
        query, _, exclude = spec.partition(' ~~ ')
        hits = model.inverted_query(query, exclude)
        if position:
            # Every query after the first narrows the running result.
            matched.intersection_update(hits)
        else:
            # The first query provides the starting set.
            matched.update(hits)
    return matched
def show_results(ids):
    """Print the hit count, then the id and single-line text of each document.

    For every id in ``ids`` the document is fetched with ``model.get_doc`` and
    converted with ``model.doc_to_text``; newline and carriage-return
    characters are replaced by spaces before printing. Nothing is returned.
    """
    print(f"RESULTS: {len(ids)}\n\n")
    for doc_id in ids:
        document = model.get_doc(doc_id)
        raw_text = model.doc_to_text(document)
        # Flatten line breaks so each document prints as one line of text.
        flat_text = raw_text.replace('\n',' ').replace('\r',' ')
        print(f"id:{doc_id}\n{flat_text}\n\n")
import re
def filter_results(ids,query):
    """Print each sentence from the documents in ``ids`` that matches ``query``.

    ``query`` is a regular-expression pattern, compiled once up front. Each
    document's text is flattened (newlines and carriage returns become spaces),
    split with ``model.text_to_sentences``, and every sentence in which the
    pattern is found is printed next to the document id. Nothing is returned.
    """
    pattern = re.compile(query)
    for doc_id in ids:
        document = model.get_doc(doc_id)
        flat_text = model.doc_to_text(document).replace('\n',' ').replace('\r',' ')
        # Lazily keep only the sentences containing at least one match.
        matching = (
            candidate
            for candidate in model.text_to_sentences(flat_text)
            if pattern.search(candidate)
        )
        for sentence in matching:
            print(doc_id, sentence)
# Ids of the documents returned for both the 'mechanical' and the 'ventil' query.
ids = multi_query('mechanical','ventil')
# The pattern must be a raw string: in an ordinary literal '\b' is the backspace
# character (0x08), not the regex word-boundary anchor, so the filter would only
# match sentences containing a literal backspace before age/old/year.
filter_results(ids, r'\b(age|old|year)')
In [ ]: