In [1]:
import pandas as pd
# Allow up to 100 rows of a displayed frame before pandas truncates the output.
pd.options.display.max_rows = 100

In [2]:
from horacy import HoracyModel
# Build a HoracyModel for the relative path 'model_4000/' and keep whatever
# load() returns as the notebook-wide `model` used by every later cell.
# NOTE(review): the path is presumably resolved against the kernel's working
# directory, and the "load done in ..." line is presumably printed by load()
# itself — confirm against the horacy package.
model = HoracyModel('model_4000/').load()


load done in 2.69 seconds

Outcomes data for COVID-19 after mechanical ventilation adjusted for age

ventilation


In [25]:
# Look up vocabulary entries for the fragment 'ventil'.
# zip(*results) transposes the returned sequences into rows, which are then
# labelled (id, token, df). Judging by the rendered output, `df` is presumably
# the token's document frequency and matching is by substring (for example
# 'hypoventilation') — TODO confirm against HoracyModel.tfidf_query.
results = model.tfidf_query('ventil')
# NOTE(review): the frame variable `df` shares its name with the 'df' column.
df = pd.DataFrame(zip(*results),columns=['id','token','df'])
# Bare last expression so the notebook renders the frame as a table.
df


Out[25]:
id token df
0 3719 ventilation 804
1 23641 ventilated 155
2 9440 ventilator 94
3 29220 ventilators 85
4 18845 ventilatory 54
5 52285 ventilator-associated 50
6 79235 ventilation: 11
7 23642 ventilator-induced 10
8 87766 hypoventilation 8
9 67896 non-ventilated 7
10 37127 ventilator-free 6
11 46658 ventilator-days 6
12 77019 ventilation-induced 5
13 93910 hyperventilation 5
14 23006 well-ventilated 4
15 47508 ventilatorassociated 4
16 52963 ventilating 4
17 63628 ventilatorinduced 4
18 78097 nurse:ventilated 3
19 85030 g006ventilation 3

mechanical


In [30]:
# Look up vocabulary entries for the term 'mechanical'.
# zip(*results) transposes the returned sequences into rows, which are then
# labelled (id, token, df). Judging by the rendered output, `df` is presumably
# the token's document frequency and matching is by substring (for example
# 'biomechanical') — TODO confirm against HoracyModel.tfidf_query.
results = model.tfidf_query('mechanical')
# NOTE(review): this rebinds `results` and `df`; the frame variable `df` also
# shares its name with the 'df' column.
df = pd.DataFrame(zip(*results),columns=['id','token','df'])
# Bare last expression so the notebook renders the frame as a table.
df


Out[30]:
id token df
0 3701 mechanical 804
1 10943 mechanically 150
2 71211 biomechanical 9
3 84684 optomechanical 9
4 53365 electromechanical 4
5 84734 nanomechanical 4

In [8]:
def multi_query(*args):
    """Return the ids that every given query yields from ``model.inverted_query``.

    Each argument is a string. The part before the first ``' ~~ '`` is passed
    to ``model.inverted_query`` as the query and the part after it as the
    exclusion; when the separator is absent the exclusion is the empty string.
    How ``inverted_query`` interprets those two strings is defined by the
    model, not here.

    Returns:
        A ``set`` holding the intersection of the ids returned for each
        argument, or an empty set when called with no arguments.
    """
    common = None
    for spec in args:
        wanted, _sep, unwanted = spec.partition(' ~~ ')
        hits = model.inverted_query(wanted, unwanted)
        if common is None:
            # First query seeds the running result.
            common = set(hits)
        else:
            # Every later query can only narrow it.
            common.intersection_update(hits)
    return set() if common is None else common

def show_results(ids):
    """Print how many documents matched, then the text of each one.

    Args:
        ids: A sized iterable of document ids accepted by ``model.get_doc``.

    For every id the document is fetched with ``model.get_doc``, converted
    with ``model.doc_to_text``, and printed on a single line under an
    ``id:<id>`` header. Nothing is returned.
    """
    print(f"RESULTS: {len(ids)}\n\n")
    # `doc_id` rather than `id`, so the builtin id() is not shadowed.
    for doc_id in ids:
        doc = model.get_doc(doc_id)
        # Replace each line-break character with a space so the document
        # prints as one line (a '\r\n' pair therefore becomes two spaces).
        text = model.doc_to_text(doc).replace('\n', ' ').replace('\r', ' ')
        print(f"id:{doc_id}\n{text}\n\n")

import re
def filter_results(ids,query):
    """Print every sentence of the given documents that matches a regex.

    Args:
        ids: Iterable of document ids accepted by ``model.get_doc``.
        query: Regular expression handed to ``re.compile``. Pass patterns
            that use escapes such as ``\\b`` as raw strings.

    Each document is fetched, converted to text with line breaks replaced by
    spaces, and split with ``model.text_to_sentences``. A sentence is printed
    once, prefixed by its document id, if the pattern matches anywhere in it.
    Nothing is returned.
    """
    pattern = re.compile(query)
    # `doc_id` rather than `id`, so the builtin id() is not shadowed.
    for doc_id in ids:
        doc = model.get_doc(doc_id)
        text = model.doc_to_text(doc).replace('\n', ' ').replace('\r', ' ')
        for sentence in model.text_to_sentences(text):
            # search() stops at the first hit; only match existence matters
            # here, so there is no need to collect every match.
            if pattern.search(sentence):
                print(doc_id, sentence)
        
# Documents returned for both the 'mechanical' and the 'ventil' query.
ids = multi_query('mechanical','ventil')
# Print the sentences of those documents that contain a word starting with
# 'age', 'old' or 'year'. The pattern must be a raw string: in a plain literal
# '\b' is the backspace character (0x08), not the regex word-boundary escape,
# so the filter would never match ordinary text.
filter_results(ids, r'\b(age|old|year)')

In [ ]: