In [5]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from fastparquet import ParquetFile
import os
# from ctakes_xml import CtakesXmlParser
# from sklearn.feature_extraction.text import TfidfVectorizer
from stringdist import levenshtein_norm as lev_norm
# import matplotlib

# %matplotlib inline

Read the parquet file into a pandas dataframe. We use fastparquet here because pyarrow was unable to read a file of this size.


In [7]:
notes_file = 'synthnotes/data/note-events.parquet'
pq_root_path = 'synthnotes/data/xml_extracted'

In [8]:
pf = ParquetFile(notes_file)
df = pf.to_pandas()

Get the list of IDs from the processed XML files so we can select a subset of the MIMIC notes


In [9]:
xml_dir = 'synthnotes/data/xml_files'
xml_files = os.listdir(xml_dir)
ids = [int(f.split('.txt.xmi')[0]) for f in xml_files]

Select the subset of notes for which we have cTAKES XML output. Reset the index and drop some unnecessary columns


In [10]:
notes = df[df.ROW_ID.isin(ids)]
notes = notes.reset_index(drop=True)
notes = notes.drop(['CHARTDATE','CHARTTIME','STORETIME','CGID','ISERROR'],axis=1)


In [6]:
def get_notes_sample(df, n=100, category='Nursing'):
    notes = df[df['CATEGORY'] == category]
    notes = notes[notes['ISERROR'].isnull()]
    notes = notes[notes['DESCRIPTION'] == 'Generic Note']
    notes = notes.sample(n=n)
    notes = notes.reset_index(drop=True)
    return notes

Process the XML files and store the results in Parquet locally

TODO: switch this to a columnar format; this requires changing how we extract the different element types


In [7]:
# parser = CtakesXmlParser()
# schemas = list()
# for file in xml_files:
#     xml_out = parser.parse(f'{xml_dir}/{file}')
#     for k, v in xml_out.items():
#         feature_df = pd.DataFrame(list(v))  
#         if feature_df.shape[0] > 0:
#             table = pa.Table.from_pandas(feature_df)
#             pq.write_to_dataset(table, f'{pq_root_path}/{k}')
#         else:
#             print(f"{k} was empty for {file}")

Creating templates

The plan:
For each sentence in all documents:

1. Get the predicates for the sentence
2. Get the entities for the sentence
3. For each entity, append the CUI code from its UMLS concept to the end
4. Combine predicates and entities and sort them by their begin positions
5. Save to a dataframe (a minimal sketch of steps 4-5 follows)
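A minimal sketch of steps 4-5 on hypothetical data (tokens and offsets are made up for illustration; pandas is imported above):


In [ ]:
# Toy illustration only: combine predicate and entity tokens for one sentence,
# ordered by begin offset, then join them into a template string.
preds_demo = pd.DataFrame({'sent_id': ['s1', 's1'], 'begin': [10, 40],
                           'token': ['give', 'start']})
ents_demo = pd.DataFrame({'sent_id': ['s1', 's1'], 'begin': [0, 25],
                          'token': ['MedicationMention', 'SignSymptomMention']})
combined = pd.concat([preds_demo, ents_demo])
combined.sort_values('begin').groupby('sent_id')['token'].apply(' '.join)
# s1    MedicationMention give SignSymptomMention start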

Some helper functions:


In [29]:
def get_df_from_pq(root, name):
    return pq.read_table(f'{root}/{name}').to_pandas()

def transform_preds(df):
    df['frameset'] = df['frameset'].apply(lambda x: x.split('.')[0])
    return df

def transform_mentions(mentions):
    # Don't want this to fail if these have already been removed
    try:
        mentions = mentions.drop(
            ['conditional', 'history_of', 'generic', 'polarity', 'discovery_technique', 'subject'],
            axis=1)
    except KeyError:
        pass
    
    sorted_df = mentions.groupby(['sent_id', 'begin']) \
                        .apply(lambda x: x.sort_values(['begin', 'end']))
    
    # Drop the mentions that are parts of a larger span.  Only keep the containing span that holds multiple
    # mentions
    deduped = sorted_df.drop_duplicates(['sent_id', 'begin'], keep='last')
    deduped = deduped.drop_duplicates(['sent_id', 'end'], keep='first')
    return deduped.reset_index(drop=True)

def set_template_token(df, column):
    df['template_token'] = df[column]
    return df

def get_template_tokens(row):
    return pd.Series({
        'doc_id': row['doc_id'],
        'sent_id': row['sent_id'],
        'token': row['template_token'],
        'begin': row['begin'],
        'end': row['end']
        })    

# def merge_mentions_umls(mentions, umls):
#     umls['umls_xmi_id'] = umls['xmi_id']
#     mentions = mentions.merge(umls[['umls_xmi_id', 'cui']], on='umls_xmi_id')
#     return mentions

# def umls_dedup(umls):
#     return umls.drop_duplicates(subset=['cui'])

# def set_umls_join_key(umls):
#     umls['umls_xmi_id'] = umls['xmi_id']
#     return umls

def set_sentence_pos(df):
    df = df.groupby(["doc_id"]).apply(lambda x: x.sort_values(["begin"])).reset_index(drop=True)
    df['sentence_number'] = df.groupby("doc_id").cumcount()
    return df

def get_root_verb(row):
    pass

def extract_sent(row):
    begin = row['begin']
    end = row['end']
    row['TEXT'] = row['TEXT'][begin:end]
    return row

def write_notes(row):
    fn = f'raw_notes/{row["ROW_ID"]}'
    with open(fn, 'w') as f:
        f.write(row['TEXT'])
        
def get_text_from_sentence(row, notes):
    doc = notes[notes['ROW_ID'] == row['doc_id']]
    b = row['begin']
    e = row['end']
    return doc['TEXT'].iloc[0][b:e]        

def edit_dist(row, term2):
    term1 = row.loc['preferred_text']
    return lev_norm(term1, term2)
    
def get_cui(mention, umls_df):
    ont_arr = list(map(int, mention['ontology_arr'].split())) or None
    ment_text = mention['text']

    concepts = umls_df[umls_df['xmi_id'].isin(ont_arr)].loc[:, ['cui', 'preferred_text', 'xmi_id']]
    concepts['dist'] = concepts.apply(edit_dist, args=(ment_text,), axis=1)
    sorted_df = concepts.sort_values(by='dist', ascending=True).reset_index(drop=True)
    cui = sorted_df['cui'].iloc[0]
    xmi_id = sorted_df['xmi_id'].iloc[0]
    pref_text = sorted_df['preferred_text'].iloc[0]
    return cui, xmi_id, pref_text

Pull in the dataframes for elements we need for processing


In [12]:
preds = get_df_from_pq(pq_root_path, 'predicates')
mentions = get_df_from_pq(pq_root_path, 'mentions')
umls = get_df_from_pq(pq_root_path, 'umls_concepts')
sents = get_df_from_pq(pq_root_path, 'sentences')
tokens = get_df_from_pq(pq_root_path, 'tokens')

In [13]:
sents = sents.rename({'id': 'sent_id'}, axis=1)

sents.head()


Out[13]:
begin doc_id elem_type end sent_id sentence_number xmi_id
0 0 356714 Sentence 150 3cf2381f-5123-4b4c-bd6f-bea21d265ea5 0 25
1 154 356714 Sentence 279 c9d218b9-fdd3-4758-925d-ddab2e801893 0 31
2 280 356714 Sentence 360 16bf3399-a9b5-4499-875c-c1a2b3b138c3 0 37
3 365 356714 Sentence 418 fd20d6d0-4f8c-4ff1-a061-e4b78fa529a9 0 43
4 422 356714 Sentence 483 c2f28716-60c2-4d3c-b815-35939c1afc49 0 49

Prep sentences DF

Add raw text from notes to sentences


In [14]:
sents = sents.merge(notes[['ROW_ID', 'TEXT']],
                    left_on='doc_id', right_on='ROW_ID').drop('ROW_ID', axis=1)

sents = sents.apply(extract_sent, axis=1)
sents = sents.rename({'TEXT': 'text'}, axis=1)

Add position of sentence in document to sentences df


In [15]:
sents = set_sentence_pos(sents)

Remove sentences without entities


In [16]:
sents_with_mentions = sents[
    sents['sent_id'].isin(
        mentions.drop_duplicates(subset='sent_id')['sent_id']
    )
]

Prep UMLS DF

Remove UMLS concepts that don't have a preferred text field


In [17]:
umls = umls[~umls['preferred_text'].isna()]

Prep Mentions DF

Transform mentions

  1. Drop some unused fields
  2. Only keep the first UMLS code from the ontology array (no longer doing this, as it limits the CUI codes we can choose from in the UMLS concepts table)
  3. Sort by begin and end offsets, and remove mentions that end on the same offset. We only want to keep the full span rather than split entities up, which should give better semantic meaning (toy example below)
  4. Add raw text to mentions
  5. Add UMLS concept information (CUIs) to mentions. There are many possible CUIs for the text span of an entity. Here we use the edit distance between the original span and the UMLS preferred text, and for now simply choose the first UMLS concept with the best (lowest) score
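To make step 3 concrete, a toy example of the de-duplication done in transform_mentions (hypothetical offsets: the span 0-15 contains the sub-spans 0-4 and 11-15):


In [ ]:
toy = pd.DataFrame({'sent_id': ['s1', 's1', 's1'],
                    'begin': [0, 0, 11],
                    'end': [4, 15, 15]})
toy = toy.sort_values(['begin', 'end'])
toy = toy.drop_duplicates(['sent_id', 'begin'], keep='last')   # keep the longest span per begin
toy = toy.drop_duplicates(['sent_id', 'end'], keep='first')    # keep the containing span per end
toy  # only the containing span (0, 15) survives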

In [18]:
mentions = get_df_from_pq(pq_root_path, 'mentions')
mentions = transform_mentions(mentions)
mentions.head()


Out[18]:
begin doc_id elem_type end id mention_type ontology_arr sent_id xmi_id
0 252 441341 Mention 262 77412b7d-2ffc-42f8-8896-ad218c1acda4 MedicationMention 5460 5480 5450 5470 00351605-d93c-49ef-a9d3-fece550de1a0 5496
1 275 441341 Mention 278 b6b9f099-e2b1-4625-8371-33a9bac4df14 ProcedureMention 7236 7246 00351605-d93c-49ef-a9d3-fece550de1a0 7260
2 303 441341 Mention 311 33ab5b9e-0d9f-41dd-adc0-c5025e9c4f40 ProcedureMention 7184 7194 00351605-d93c-49ef-a9d3-fece550de1a0 7208
3 332 441341 Mention 335 31cc3ae1-5693-4467-a85d-744a2f2e02e9 MedicationMention 5330 5350 5320 5340 00351605-d93c-49ef-a9d3-fece550de1a0 5366
4 365 441341 Mention 385 1a9bcb72-afe8-49cd-8dbc-8d80b8d88daa SignSymptomMention 6755 00351605-d93c-49ef-a9d3-fece550de1a0 6768

Add original text to mentions


In [19]:
mentions['text'] = mentions.apply(get_text_from_sentence, args=(notes,), axis=1)
mentions.head()


Out[19]:
begin doc_id elem_type end id mention_type ontology_arr sent_id xmi_id text
0 252 441341 Mention 262 77412b7d-2ffc-42f8-8896-ad218c1acda4 MedicationMention 5460 5480 5450 5470 00351605-d93c-49ef-a9d3-fece550de1a0 5496 Creatinine
1 275 441341 Mention 278 b6b9f099-e2b1-4625-8371-33a9bac4df14 ProcedureMention 7236 7246 00351605-d93c-49ef-a9d3-fece550de1a0 7260 BUN
2 303 441341 Mention 311 33ab5b9e-0d9f-41dd-adc0-c5025e9c4f40 ProcedureMention 7184 7194 00351605-d93c-49ef-a9d3-fece550de1a0 7208 dialysis
3 332 441341 Mention 335 31cc3ae1-5693-4467-a85d-744a2f2e02e9 MedicationMention 5330 5350 5320 5340 00351605-d93c-49ef-a9d3-fece550de1a0 5366 PVC
4 365 441341 Mention 385 1a9bcb72-afe8-49cd-8dbc-8d80b8d88daa SignSymptomMention 6755 00351605-d93c-49ef-a9d3-fece550de1a0 6768 within normal limits

Add sentence position to mentions


In [20]:
mentions = mentions.merge(sents_with_mentions[['sent_id', 'sentence_number']],
              on='sent_id')
mentions.head()


Out[20]:
begin doc_id elem_type end id mention_type ontology_arr sent_id xmi_id text sentence_number
0 252 441341 Mention 262 77412b7d-2ffc-42f8-8896-ad218c1acda4 MedicationMention 5460 5480 5450 5470 00351605-d93c-49ef-a9d3-fece550de1a0 5496 Creatinine 10
1 275 441341 Mention 278 b6b9f099-e2b1-4625-8371-33a9bac4df14 ProcedureMention 7236 7246 00351605-d93c-49ef-a9d3-fece550de1a0 7260 BUN 10
2 303 441341 Mention 311 33ab5b9e-0d9f-41dd-adc0-c5025e9c4f40 ProcedureMention 7184 7194 00351605-d93c-49ef-a9d3-fece550de1a0 7208 dialysis 10
3 332 441341 Mention 335 31cc3ae1-5693-4467-a85d-744a2f2e02e9 MedicationMention 5330 5350 5320 5340 00351605-d93c-49ef-a9d3-fece550de1a0 5366 PVC 10
4 365 441341 Mention 385 1a9bcb72-afe8-49cd-8dbc-8d80b8d88daa SignSymptomMention 6755 00351605-d93c-49ef-a9d3-fece550de1a0 6768 within normal limits 10

Prep Predicates DF

Transform predicates

Simple transformation. Just trim the frameset string of everything after the '.' (e.g. 'give.01' becomes 'give')


In [21]:
preds = transform_preds(preds)

Remove predicates not in sentences with mentions


In [22]:
print(preds.shape)
preds = preds[
    preds['sent_id'].isin( sents_with_mentions['sent_id'] )
]
print(preds.shape)


(4608, 9)
(3882, 9)

Add original text to predicates


In [23]:
preds['text'] = preds.apply(get_text_from_sentence, args=(notes,), axis=1)

Linking CUI codes to entities (mentions)

Assign cui codes to mentions (entities)

cTAKES over-generates CUI and TUI codes for text spans in a clinical note. There can be multiple coding schemes with a code for a term, and a CUI may apply to the original text span specifically or be a generalization or abstraction over the meaning of the span. For generating text we want the CUI that most closely matches the original text span. Future work could look at these generalizations to get a better sense of semantic meaning; however, that will require a deep understanding of the UMLS ontology and how to work with it to extract this kind of information.

For each mention:

  1. Collect all the umls concept rows (based on xmi_id) that are in the mention's ontology array
  2. Compute edit distance between the above umls rows' preferred text column and the mention's original text
  3. Sort edit distances in ascending order
  4. Choose the first umls concept row (a lower edit distance means the two texts are more similar)
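A toy illustration of this selection using get_cui and edit_dist from above (the xmi_ids and the second concept here are fabricated for the example; lev_norm is the normalized Levenshtein distance from stringdist):


In [ ]:
toy_umls = pd.DataFrame({
    'xmi_id': [1, 2],
    'cui': ['C0005845', 'C9999999'],                        # second CUI is made up
    'preferred_text': ['Blood urea nitrogen measurement',
                       'Urea nitrogen, unrelated concept']  # made-up preferred text
})
toy_mention = {'ontology_arr': '1 2', 'text': 'BUN'}
get_cui(toy_mention, toy_umls)  # returns the (cui, xmi_id, preferred_text) with the lowest distance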

In [30]:
mentions[['cui', 'umls_xmi_id', 'preferred_text']] = mentions.apply(
    get_cui, args=(umls,), axis=1, result_type='expand')
mentions.head()


Out[30]:
begin doc_id elem_type end id mention_type ontology_arr sent_id xmi_id text sentence_number cui umls_xmi_id preferred_text
0 252 441341 Mention 262 77412b7d-2ffc-42f8-8896-ad218c1acda4 MedicationMention 5460 5480 5450 5470 00351605-d93c-49ef-a9d3-fece550de1a0 5496 Creatinine 10 C0010294 5460 Creatinine
1 275 441341 Mention 278 b6b9f099-e2b1-4625-8371-33a9bac4df14 ProcedureMention 7236 7246 00351605-d93c-49ef-a9d3-fece550de1a0 7260 BUN 10 C0005845 7236 Blood urea nitrogen measurement
2 303 441341 Mention 311 33ab5b9e-0d9f-41dd-adc0-c5025e9c4f40 ProcedureMention 7184 7194 00351605-d93c-49ef-a9d3-fece550de1a0 7208 dialysis 10 C0011946 7184 Dialysis procedure
3 332 441341 Mention 335 31cc3ae1-5693-4467-a85d-744a2f2e02e9 MedicationMention 5330 5350 5320 5340 00351605-d93c-49ef-a9d3-fece550de1a0 5366 PVC 10 C0032624 5330 Polyvinyl Chloride
4 365 441341 Mention 385 1a9bcb72-afe8-49cd-8dbc-8d80b8d88daa SignSymptomMention 6755 00351605-d93c-49ef-a9d3-fece550de1a0 6768 within normal limits 10 C1265570 6755 Morphology within normal limits

Set the template tokens we're going to use

For mentions this is one of: the type of mention, the CUI code, or the two concatenated together (the alternatives are shown as comments below)

For predicates it is the frameset trimmed of everything after the '.'


In [31]:
mentions['template_token'] = mentions['mention_type']
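# Alternatives mentioned above (not used in this run):
#   mentions['template_token'] = mentions['cui']
#   mentions['template_token'] = mentions['mention_type'] + '_' + mentions['cui']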
preds['template_token'] = preds['frameset']

preds_toks = preds.apply(get_template_tokens, axis=1)
mentions_toks = mentions.apply(get_template_tokens, axis=1)

mentions_toks.groupby(['sent_id', 'end']).head()

preds_toks.groupby(['sent_id', 'end']).head()


Out[31]:
doc_id sent_id token begin end
0 458728 cb1777b8-3988-48dc-a6ee-57abb6203744 slide 115 122
1 458728 28899d39-f491-4230-9ebb-f742f1d132f7 suppose 134 142
2 458728 28899d39-f491-4230-9ebb-f742f1d132f7 be 146 148
3 458728 28899d39-f491-4230-9ebb-f742f1d132f7 find 166 171
4 458728 28899d39-f491-4230-9ebb-f742f1d132f7 be 178 180
5 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 be 289 292
6 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 verse 311 317
7 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 verse 376 382
8 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 give 383 388
9 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 verse 414 420
10 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 increase 443 452
11 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 be 525 527
12 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 initiate 528 537
13 458728 438b1d10-3b69-476c-991e-2719ed230a35 give 556 561
14 458728 7fbe30c1-0239-497f-b09a-98ebe05b7463 start 600 607
15 458728 7fbe30c1-0239-497f-b09a-98ebe05b7463 increase 635 644
16 458728 78e9c81c-9b24-4762-874a-36e98f1dabbd verse 696 702
17 458728 78e9c81c-9b24-4762-874a-36e98f1dabbd discontinue 703 715
18 458728 78e9c81c-9b24-4762-874a-36e98f1dabbd be 731 734
19 458728 78e9c81c-9b24-4762-874a-36e98f1dabbd start 735 742
20 458728 bda4697a-31aa-4d06-8409-e0596357fbb4 monitor 747 756
21 458728 0f7bcc60-bf7d-4e82-bf68-0ae233cf9fd5 sats 935 939
22 458728 0f7bcc60-bf7d-4e82-bf68-0ae233cf9fd5 drop 943 950
23 458728 0f7bcc60-bf7d-4e82-bf68-0ae233cf9fd5 see 991 995
24 458728 0f7bcc60-bf7d-4e82-bf68-0ae233cf9fd5 tinge 1028 1034
25 458728 4a558a51-40f8-487e-a4e4-a33fc07dd535 change 1071 1078
26 458728 4a558a51-40f8-487e-a4e4-a33fc07dd535 achieve 1085 1092
27 458728 4a558a51-40f8-487e-a4e4-a33fc07dd535 increase 1128 1137
28 458728 4a558a51-40f8-487e-a4e4-a33fc07dd535 increase 1163 1172
29 458728 19c33efc-96fc-40cb-9ea3-f6ce9caf1310 continue 1232 1241
... ... ... ... ... ...
15 497748 266fb788-c10e-4f75-adc5-2159d8519537 resolve 792 800
16 497748 266fb788-c10e-4f75-adc5-2159d8519537 resolve 831 839
17 497748 c69890a4-d315-4b9f-9b78-b1cb1ab9c1b2 transfer 863 874
18 497748 df314bb5-10c3-4898-ac1d-9a805426ee22 have 1008 1011
19 497748 df314bb5-10c3-4898-ac1d-9a805426ee22 be 1012 1016
20 497748 df314bb5-10c3-4898-ac1d-9a805426ee22 defer 1017 1025
24 497748 7c34ba60-b573-44dc-a9a3-d04f18735712 require 1446 1454
25 497748 7c34ba60-b573-44dc-a9a3-d04f18735712 suction 1455 1465
26 497748 ac6890fb-5d90-47dd-9a15-2126db7a30a2 remain 1631 1637
28 497748 6a325968-9656-4b72-bdce-6fb4efed7a0b follow 1736 1744
29 497748 6a325968-9656-4b72-bdce-6fb4efed7a0b alternate 1771 1782
0 438154 4838eada-a1b4-4ec0-908b-4412c18ede87 monitor 71 80
1 438154 4838eada-a1b4-4ec0-908b-4412c18ede87 continue 148 157
2 438154 4838eada-a1b4-4ec0-908b-4412c18ede87 make 210 214
4 438154 28763a14-3d66-428c-ac61-ac2fa34164ad continue 283 291
5 438154 28763a14-3d66-428c-ac61-ac2fa34164ad monitor 292 302
6 438154 28763a14-3d66-428c-ac61-ac2fa34164ad stabilize 311 321
7 438154 28763a14-3d66-428c-ac61-ac2fa34164ad encourage 323 332
8 438154 a81e1460-2f06-46ed-8a2b-5b44100e2d94 have 372 375
9 438154 a81e1460-2f06-46ed-8a2b-5b44100e2d94 stabilize 376 386
12 438154 d7634bad-e7da-4277-a5ba-b5bf8d349756 have 493 496
13 438154 d7634bad-e7da-4277-a5ba-b5bf8d349756 receive 497 505
17 438154 465019a9-ef94-490e-9bca-c28bd34759ab start 687 694
18 438154 95dd0f6b-01cc-4cab-b422-9fed7fed5ddb have 713 716
19 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c find 777 782
20 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c drain 810 818
21 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c witness 825 834
22 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c say 863 867
23 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c be 871 873
24 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c urinate 874 883

3882 rows × 5 columns

Append the two template tokens dataframes


In [32]:
template_tokens = pd.concat([preds_toks, mentions_toks])  # DataFrame.append is deprecated; concat is equivalent here
temp_tokens = template_tokens.groupby(['sent_id']).apply(lambda x: x.sort_values(['begin']))

In [33]:
temp_tokens.head()


Out[33]:
doc_id sent_id token begin end
sent_id
00351605-d93c-49ef-a9d3-fece550de1a0 0 441341 00351605-d93c-49ef-a9d3-fece550de1a0 MedicationMention 252 262
1 441341 00351605-d93c-49ef-a9d3-fece550de1a0 ProcedureMention 275 278
2 441341 00351605-d93c-49ef-a9d3-fece550de1a0 ProcedureMention 303 311
2 441341 00351605-d93c-49ef-a9d3-fece550de1a0 have 313 319
3 441341 00351605-d93c-49ef-a9d3-fece550de1a0 MedicationMention 332 335

Get the semantic templates

Group the rows of the template tokens dataframe by sentence id and join their tokens into a single string, sorting by begin offset first.


In [34]:
sem_templates = template_tokens.sort_values('begin').groupby('sent_id')['token'].apply(' '.join)
sem_templates.head()


Out[34]:
sent_id
00351605-d93c-49ef-a9d3-fece550de1a0    MedicationMention ProcedureMention ProcedureMe...
00a8bb1c-4186-4499-9a54-ff58e6115817                               DiseaseDisorderMention
01175ae6-57d5-4c77-b51f-fcc5bbd86685          be SignSymptomMention AnatomicalSiteMention
0125390f-61cd-44cf-a940-88174fd05057                                   SignSymptomMention
0155fb93-ce86-4f5f-8436-1a7a431185c2    SignSymptomMention complain SignSymptomMention...
Name: token, dtype: object

In [35]:
temp_tokens.token.unique().shape


Out[35]:
(379,)

In [49]:
sem_df = pd.DataFrame(sem_templates)  # wrap the Series of templates in a DataFrame
sem_df.head()

sem_df.reset_index(level=0, inplace=True)

sem_df = sem_df.rename(columns={'token': 'sem_template'})

sem_df = sem_df.merge(sents[['sent_id', 'sentence_number', 'doc_id', 'begin', 'end']],
                      on='sent_id')

In [50]:
sem_df.head()


Out[50]:
sent_id sem_template sentence_number doc_id begin end
0 00351605-d93c-49ef-a9d3-fece550de1a0 MedicationMention ProcedureMention ProcedureMe... 10 441341 252 386
1 00a8bb1c-4186-4499-9a54-ff58e6115817 DiseaseDisorderMention 74 458728 3972 3977
2 01175ae6-57d5-4c77-b51f-fcc5bbd86685 be SignSymptomMention AnatomicalSiteMention 4 414695 341 390
3 0125390f-61cd-44cf-a940-88174fd05057 SignSymptomMention 52 361823 3452 3463
4 0155fb93-ce86-4f5f-8436-1a7a431185c2 SignSymptomMention complain SignSymptomMention... 0 378613 0 166

Gather corpus statistics

Average sentences per doc


In [36]:
avg_sents_per_doc = sents.groupby('doc_id').size().mean()
print(avg_sents_per_doc)


42.52

Average sentences w/ entities per doc


In [37]:
avg_sents_with_ents_per_doc = sents_with_mentions.groupby('doc_id').size().mean()
print(avg_sents_with_ents_per_doc)


28.306122448979593

Count of unique CUIs (when removing overlapping text spans)


In [38]:
print(mentions['cui'].nunique())


1042

Average # of CUIs per doc


In [39]:
mentions.groupby('doc_id').size().mean()


Out[39]:
62.234693877551024

Average # of CUIs per sentence


In [40]:
mentions.groupby('sent_id').size().mean()


Out[40]:
2.1986301369863015

Average # of words per doc (excluding newline tokens and symbols)


In [41]:
tokens = tokens[(~tokens['sent_id'].isnull()) & (tokens['token_type'] != 'NewlineToken')]

In [42]:
wc_by_doc = tokens.groupby('doc_id')['id'].count().reset_index(name='count')
wc_by_doc['count'].mean()


Out[42]:
475.42

Average # of words per sentence


In [43]:
wc_by_sentence = tokens.groupby('sent_id')['id'].count().reset_index(name='count')
wc_by_sentence['count'].mean()


Out[43]:
11.181091251175918

Get frequency of mentions


In [44]:
mention_counts = mentions.groupby('mention_type').size().reset_index(name='count')
mention_counts


Out[44]:
mention_type count
0 AnatomicalSiteMention 758
1 DiseaseDisorderMention 1258
2 MedicationMention 1206
3 ProcedureMention 1033
4 SignSymptomMention 1844

In [45]:
mention_counts['frequency'] = mention_counts['count'] / mention_counts['count'].sum()
mention_counts


Out[45]:
mention_type count frequency
0 AnatomicalSiteMention 758 0.124283
1 DiseaseDisorderMention 1258 0.206263
2 MedicationMention 1206 0.197737
3 ProcedureMention 1033 0.169372
4 SignSymptomMention 1844 0.302345

Frequency of mentions by sentence position


In [46]:
mentions_by_pos = pd.crosstab(
                        mentions['mention_type'],
                        mentions['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
mentions_by_pos


Out[46]:
sentence_number 0 1 2 3 4 5 6 7 8 9 ... 73 74 75 76 77 78 79 80 81 82
mention_type
AnatomicalSiteMention 0.094463 0.105263 0.117904 0.108108 0.183673 0.116883 0.081081 0.123711 0.090909 0.129310 ... 0.125 0.000000 0.000000 0.25 0.0 0.00 0.166667 0.000000 0.0 0.0
DiseaseDisorderMention 0.485342 0.277193 0.170306 0.099099 0.158163 0.129870 0.087838 0.144330 0.206061 0.129310 ... 0.250 0.333333 0.000000 0.25 0.0 0.50 0.333333 0.000000 1.0 0.0
MedicationMention 0.032573 0.105263 0.174672 0.234234 0.260204 0.305195 0.344595 0.247423 0.206061 0.353448 ... 0.250 0.333333 0.333333 0.00 0.5 0.00 0.166667 0.666667 0.0 0.0
ProcedureMention 0.104235 0.168421 0.183406 0.202703 0.173469 0.149351 0.168919 0.216495 0.187879 0.215517 ... 0.000 0.333333 0.333333 0.25 0.0 0.25 0.333333 0.333333 0.0 0.0
SignSymptomMention 0.283388 0.343860 0.353712 0.355856 0.224490 0.298701 0.317568 0.268041 0.309091 0.172414 ... 0.375 0.000000 0.333333 0.25 0.5 0.25 0.000000 0.000000 0.0 1.0

5 rows × 83 columns

Frequency of CUIs


In [47]:
cui_counts = mentions.groupby('cui').size().reset_index(name='count')
cui_counts = cui_counts.sort_values('count', ascending=False).reset_index(drop=True)
cui_counts.head(10)


Out[47]:
cui count
0 C0270724 193
1 C0030193 69
2 C0015967 69
3 C0024467 65
4 C1145670 56
5 C0587081 52
6 C0015846 48
7 C0010200 47
8 C0042313 47
9 C0278060 47

In [48]:
cui_counts['frequency'] = cui_counts['count'] / cui_counts['count'].sum()
cui_counts.head(10)


Out[48]:
cui count frequency
0 C0270724 193 0.031645
1 C0030193 69 0.011313
2 C0015967 69 0.011313
3 C0024467 65 0.010657
4 C1145670 56 0.009182
5 C0587081 52 0.008526
6 C0015846 48 0.007870
7 C0010200 47 0.007706
8 C0042313 47 0.007706
9 C0278060 47 0.007706

Frequency with preferred text


In [51]:
cui_counts_with_text = cui_counts.merge(mentions[['cui', 'preferred_text']], on='cui') \
                        .drop_duplicates('cui') \
                        .reset_index(drop=True)

cui_counts_with_text.head(10)


Out[51]:
cui count frequency preferred_text
0 C0270724 193 0.031645 Infantile Neuroaxonal Dystrophy
1 C0030193 69 0.011313 Pain
2 C0015967 69 0.011313 Fever
3 C0024467 65 0.010657 Magnesium
4 C1145670 56 0.009182 Respiratory Failure
5 C0587081 52 0.008526 Laboratory test finding
6 C0015846 48 0.007870 Fentanyl
7 C0010200 47 0.007706 Coughing
8 C0042313 47 0.007706 Vancomycin
9 C0278060 47 0.007706 Mental state

Frequency of CUIs by sentence position


In [45]:
cui_by_pos = pd.crosstab(mentions['cui'], mentions['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
cui_by_pos.head()


Out[45]:
sentence_number 0 1 2 3 4 5 6 7 8 9 ... 73 74 75 76 77 78 79 80 81 82
cui
C0000726 0.000000 0.000000 0.0 0.000000 0.005102 0.0 0.0 0.0 0.000000 0.017241 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
C0000731 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
C0000737 0.003257 0.007018 0.0 0.009009 0.000000 0.0 0.0 0.0 0.006061 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
C0000768 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
C0000833 0.006515 0.000000 0.0 0.000000 0.005102 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 83 columns


In [154]:
cui_by_pos.loc[:, 0].sort_values(ascending=False)[:10]


Out[154]:
cui
C1145670    0.032573
C0339897    0.029316
C0262926    0.026059
C0013404    0.022801
C0018802    0.022801
C0024117    0.019544
C0013687    0.019544
C0023890    0.016287
C0278060    0.016287
C0020538    0.016287
Name: 0, dtype: float64

Number of unique templates


In [183]:
sem_df.head()


Out[183]:
sent_id sem_template sentence_number doc_id begin end
0 000142d2-4690-4a56-8a68-89b5831ed2aa ProcedureMention appear 58 408714 3670 3716
1 001ca15d-0e94-4933-b376-5123e22e5b13 MedicationMention SignSymptomMention 17 442499 1515 1542
2 002d5022-70f9-4638-84b1-dcf42a6c0e12 be be follow SignSymptomMention AnatomicalSite... 10 354315 718 801
3 00385844-a95d-4bb7-a773-70f78a3b035a continue titrate MedicationMention SignSymptom... 17 432020 1451 1520
4 0046177c-aa3b-4c8d-9554-3efea4802687 be give MedicationMention MedicationMention Me... 5 464828 548 618

In [160]:
sem_df['sem_template'].nunique()


Out[160]:
1242

Frequency of templates (identified by sentence number)


In [114]:
count_temps = sem_df.groupby('sem_template').size().reset_index(name='count')
count_temps = count_temps.sort_values('count', ascending=False).reset_index(drop=True)
count_temps.head(10)


Out[114]:
sem_template count
0 DiseaseDisorderMention 321
1 SignSymptomMention 141
2 MedicationMention 72
3 ProcedureMention 69
4 DiseaseDisorderMention DiseaseDisorderMention 41
5 AnatomicalSiteMention 40
6 SignSymptomMention SignSymptomMention 26
7 alter SignSymptomMention DiseaseDisorderMention 20
8 MedicationMention give 14
9 SignSymptomMention SignSymptomMention SignSymp... 14

In [115]:
count_temps['frequency'] = count_temps['count'] / count_temps['count'].sum()
count_temps.head(10)


Out[115]:
sem_template count frequency
0 DiseaseDisorderMention 321 0.115717
1 SignSymptomMention 141 0.050829
2 MedicationMention 72 0.025955
3 ProcedureMention 69 0.024874
4 DiseaseDisorderMention DiseaseDisorderMention 41 0.014780
5 AnatomicalSiteMention 40 0.014420
6 SignSymptomMention SignSymptomMention 26 0.009373
7 alter SignSymptomMention DiseaseDisorderMention 20 0.007210
8 MedicationMention give 14 0.005047
9 SignSymptomMention SignSymptomMention SignSymp... 14 0.005047

Frequency of templates by sentence position


In [41]:
sem_df.head()


Out[41]:
sent_id sem_template sentence_number doc_id begin end sentence_number
0 00351605-d93c-49ef-a9d3-fece550de1a0 MedicationMention ProcedureMention ProcedureMe... 10 441341 252 386 10
1 00a8bb1c-4186-4499-9a54-ff58e6115817 DiseaseDisorderMention 74 458728 3972 3977 74
2 01175ae6-57d5-4c77-b51f-fcc5bbd86685 be SignSymptomMention AnatomicalSiteMention 4 414695 341 390 4
3 0125390f-61cd-44cf-a940-88174fd05057 SignSymptomMention 52 361823 3452 3463 52
4 0155fb93-ce86-4f5f-8436-1a7a431185c2 SignSymptomMention complain SignSymptomMention... 0 378613 0 166 0

In [48]:
# The merge above introduced a duplicate sentence_number column (visible in Out[41]),
# so this selection returns a two-column DataFrame rather than a Series.
sem_df['sentence_number'].shape


Out[48]:
(2774, 2)

In [51]:
temp_by_pos = pd.crosstab(sem_df['sem_template'], sem_df['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
temp_by_pos.head()


Out[51]:
sentence_number 0 1 2 3 4 5 6 7 8 9 ... 73 74 75 76 77 78 79 80 81 82
sem_template
AnatomicalSiteMention 0.0 0.0 0.012987 0.0000 0.0 0.014286 0.016129 0.015385 0.014286 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AnatomicalSiteMention AnatomicalSiteMention 0.0 0.0 0.000000 0.0000 0.0 0.000000 0.000000 0.000000 0.000000 0.018519 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AnatomicalSiteMention AnatomicalSiteMention SignSymptomMention 0.0 0.0 0.000000 0.0000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AnatomicalSiteMention AnatomicalSiteMention SignSymptomMention observe drop 0.0 0.0 0.012987 0.0000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AnatomicalSiteMention DiseaseDisorderMention 0.0 0.0 0.012987 0.0125 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 83 columns

Write dataframes to parquet

We write these to parquet files so they can be used by a separate notebook for clustering and note generation. This is just prep work for those processes.


In [98]:
df_dir = 'data/processed_dfs'
# Write sentences, mentions, predicates, and umls concepts to parquet, sem_df

In [103]:
sents_with_mentions.to_parquet(f'{df_dir}/sentences.parquet')

mentions.to_parquet(f'{df_dir}/mentions.parquet')

preds.to_parquet(f'{df_dir}/predicates.parquet')

umls.to_parquet(f'{df_dir}/umls.parquet')

sem_df.to_parquet(f'{df_dir}/templates.parquet')

temp_by_pos.to_parquet(f'{df_dir}/templates_by_pos.parquet')
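For reference, a sketch of how the downstream notebook might read these back (assuming the same df_dir; pandas uses pyarrow or fastparquet under the hood):


In [ ]:
sents_check = pd.read_parquet(f'{df_dir}/sentences.parquet')
templates_check = pd.read_parquet(f'{df_dir}/templates.parquet')
sents_check.shape, templates_check.shape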
