In [5]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from fastparquet import ParquetFile
import os
# from ctakes_xml import CtakesXmlParser
# from sklearn.feature_extraction.text import TfidfVectorizer
from stringdist import levenshtein_norm as lev_norm
# import matplotlib

# %matplotlib inline

Read the parquet file into a pandas dataframe. We use fastparquet here because pyarrow was unable to read a file of this size.


In [7]:
notes_file = 'synthnotes/data/note-events.parquet'
pq_root_path = 'synthnotes/data/xml_extracted'

In [8]:
pf = ParquetFile(notes_file)
df = pf.to_pandas()

Get the list of IDs from the processed XML files so we can select a subset of the MIMIC notes


In [9]:
xml_dir = 'synthnotes/data/xml_files'
xml_files = os.listdir(xml_dir)
ids = [int(f.split('.txt.xmi')[0]) for f in xml_files]

Select the subset of notes for which we have cTAKES XML output. Reset the index and drop some unnecessary columns


In [10]:
notes = df[df.ROW_ID.isin(ids)]
notes = notes.reset_index(drop=True)
notes = notes.drop(['CHARTDATE','CHARTTIME','STORETIME','CGID','ISERROR'],axis=1)


In [6]:
def get_notes_sample(df, n=100, category='Nursing'):
    notes = df[df['CATEGORY'] == category]
    notes = notes[notes['ISERROR'].isnull()]
    notes = notes[notes['DESCRIPTION'] == 'Generic Note']
    notes = notes.sample(n=n)
    notes = notes.reset_index(drop=True)
    return notes

Process the XML files and store the results in Parquet locally

TODO: switch this to a columnar format; this requires changing how we extract the different element types


In [7]:
# parser = CtakesXmlParser()
# schemas = list()
# for file in xml_files:
#     xml_out = parser.parse(f'{xml_dir}/{file}')
#     for k, v in xml_out.items():
#         feature_df = pd.DataFrame(list(v))  
#         if feature_df.shape[0] > 0:
#             table = pa.Table.from_pandas(feature_df)
#             pq.write_to_dataset(table, f'{pq_root_path}/{k}')
#         else:
#             print(f"{k} was empty for {file}")

Creating templates

The plan:
For each sentence in all documents:

1. Get the predicates for the sentence
2. Get the entities for the sentence
3. For each entity, append the CUI code from its UMLS concept to the end
4. Combine predicates and entities and sort them by their begin positions
5. Save to a dataframe (a minimal sketch of steps 4-5 follows)
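A minimal sketch of steps 4-5 on hypothetical data (tokens and offsets are made up for illustration; pandas is imported above):


In [ ]:
# Toy illustration only: combine predicate and entity tokens for one sentence,
# ordered by begin offset, then join them into a template string.
preds_demo = pd.DataFrame({'sent_id': ['s1', 's1'], 'begin': [10, 40],
                           'token': ['give', 'start']})
ents_demo = pd.DataFrame({'sent_id': ['s1', 's1'], 'begin': [0, 25],
                          'token': ['MedicationMention', 'SignSymptomMention']})
combined = pd.concat([preds_demo, ents_demo])
combined.sort_values('begin').groupby('sent_id')['token'].apply(' '.join)
# s1    MedicationMention give SignSymptomMention start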

Some helper functions:


In [29]:
def get_df_from_pq(root, name):
    return pq.read_table(f'{root}/{name}').to_pandas()

def transform_preds(df):
    df['frameset'] = df['frameset'].apply(lambda x: x.split('.')[0])
    return df

def transform_mentions(mentions):
    # Don't want this to fail if these have already been removed
    try:
        mentions = mentions.drop(
            ['conditional', 'history_of', 'generic', 'polarity', 'discovery_technique', 'subject'],
            axis=1)
    except KeyError:
        pass
    
    sorted_df = mentions.groupby(['sent_id', 'begin']) \
                        .apply(lambda x: x.sort_values(['begin', 'end']))
    
    # Drop the mentions that are parts of a larger span.  Only keep the containing span that holds multiple
    # mentions
    deduped = sorted_df.drop_duplicates(['sent_id', 'begin'], keep='last')
    deduped = deduped.drop_duplicates(['sent_id', 'end'], keep='first')
    return deduped.reset_index(drop=True)

def set_template_token(df, column):
    df['template_token'] = df[column]
    return df

def get_template_tokens(row):
    return pd.Series({
        'doc_id': row['doc_id'],
        'sent_id': row['sent_id'],
        'token': row['template_token'],
        'begin': row['begin'],
        'end': row['end']
        })    

# def merge_mentions_umls(mentions, umls):
#     umls['umls_xmi_id'] = umls['xmi_id']
#     mentions = mentions.merge(umls[['umls_xmi_id', 'cui']], on='umls_xmi_id')
#     return mentions

# def umls_dedup(umls):
#     return umls.drop_duplicates(subset=['cui'])

# def set_umls_join_key(umls):
#     umls['umls_xmi_id'] = umls['xmi_id']
#     return umls

def set_sentence_pos(df):
    df = df.groupby(["doc_id"]).apply(lambda x: x.sort_values(["begin"])).reset_index(drop=True)
    df['sentence_number'] = df.groupby("doc_id").cumcount()
    return df

def get_root_verb(row):
    pass

def extract_sent(row):
    begin = row['begin']
    end = row['end']
    row['TEXT'] = row['TEXT'][begin:end]
    return row

def write_notes(row):
    fn = f'raw_notes/{row["ROW_ID"]}'
    with open(fn, 'w') as f:
        f.write(row['TEXT'])
        
def get_text_from_sentence(row, notes):
    doc = notes[notes['ROW_ID'] == row['doc_id']]
    b = row['begin']
    e = row['end']
    return doc['TEXT'].iloc[0][b:e]        

def edit_dist(row, term2):
    term1 = row.loc['preferred_text']
    return lev_norm(term1, term2)
    
def get_cui(mention, umls_df):
    ont_arr = list(map(int, mention['ontology_arr'].split())) or None
    ment_text = mention['text']

    concepts = umls_df[umls_df['xmi_id'].isin(ont_arr)].loc[:, ['cui', 'preferred_text', 'xmi_id']]
    concepts['dist'] = concepts.apply(edit_dist, args=(ment_text,), axis=1)
    sorted_df = concepts.sort_values(by='dist', ascending=True).reset_index(drop=True)
    cui = sorted_df['cui'].iloc[0]
    xmi_id = sorted_df['xmi_id'].iloc[0]
    pref_text = sorted_df['preferred_text'].iloc[0]
    return cui, xmi_id, pref_text

Pull in the dataframes for elements we need for processing


In [12]:
preds = get_df_from_pq(pq_root_path, 'predicates')
mentions = get_df_from_pq(pq_root_path, 'mentions')
umls = get_df_from_pq(pq_root_path, 'umls_concepts')
sents = get_df_from_pq(pq_root_path, 'sentences')
tokens = get_df_from_pq(pq_root_path, 'tokens')

In [13]:
sents = sents.rename({'id': 'sent_id'}, axis=1)

sents.head()


Out[13]:
begin doc_id elem_type end sent_id sentence_number xmi_id
0 0 356714 Sentence 150 3cf2381f-5123-4b4c-bd6f-bea21d265ea5 0 25
1 154 356714 Sentence 279 c9d218b9-fdd3-4758-925d-ddab2e801893 0 31
2 280 356714 Sentence 360 16bf3399-a9b5-4499-875c-c1a2b3b138c3 0 37
3 365 356714 Sentence 418 fd20d6d0-4f8c-4ff1-a061-e4b78fa529a9 0 43
4 422 356714 Sentence 483 c2f28716-60c2-4d3c-b815-35939c1afc49 0 49

Prep sentences DF

Add raw text from notes to sentences


In [14]:
sents = sents.merge(notes[['ROW_ID', 'TEXT']],
                    left_on='doc_id', right_on='ROW_ID').drop('ROW_ID', axis=1)

sents = sents.apply(extract_sent, axis=1)
sents = sents.rename({'TEXT': 'text'}, axis=1)

Add position of sentence in document to sentences df


In [15]:
sents = set_sentence_pos(sents)

Remove sentences without entities


In [16]:
sents_with_mentions = sents[
    sents['sent_id'].isin(
        mentions.drop_duplicates(subset='sent_id')['sent_id']
    )
]

Prep UMLS DF

Remove UMLS concepts that don't have a preferred text field


In [17]:
umls = umls[~umls['preferred_text'].isna()]

Prep Mentions DF

Transform mentions

  1. Drop some unused fields
  2. Only keep the first UMLS code from the ontology array (no longer doing this, as it limits the CUI codes we can choose from in the UMLS concepts table)
  3. Sort by begin and end offsets, and remove mentions that end on the same offset. We only want to keep the full span rather than split entities up, which should give better semantic meaning (toy example below)
  4. Add raw text to mentions
  5. Add UMLS concept information (CUIs) to mentions. There are many possible CUIs for the text span of an entity. Here we use the edit distance between the original span and the UMLS preferred text, and for now simply choose the first UMLS concept with the best (lowest) score
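To make step 3 concrete, a toy example of the de-duplication done in transform_mentions (hypothetical offsets: the span 0-15 contains the sub-spans 0-4 and 11-15):


In [ ]:
toy = pd.DataFrame({'sent_id': ['s1', 's1', 's1'],
                    'begin': [0, 0, 11],
                    'end': [4, 15, 15]})
toy = toy.sort_values(['begin', 'end'])
toy = toy.drop_duplicates(['sent_id', 'begin'], keep='last')   # keep the longest span per begin
toy = toy.drop_duplicates(['sent_id', 'end'], keep='first')    # keep the containing span per end
toy  # only the containing span (0, 15) survives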

In [18]:
mentions = get_df_from_pq(pq_root_path, 'mentions')
mentions = transform_mentions(mentions)
mentions.head()


Out[18]:
begin doc_id elem_type end id mention_type ontology_arr sent_id xmi_id
0 252 441341 Mention 262 77412b7d-2ffc-42f8-8896-ad218c1acda4 MedicationMention 5460 5480 5450 5470 00351605-d93c-49ef-a9d3-fece550de1a0 5496
1 275 441341 Mention 278 b6b9f099-e2b1-4625-8371-33a9bac4df14 ProcedureMention 7236 7246 00351605-d93c-49ef-a9d3-fece550de1a0 7260
2 303 441341 Mention 311 33ab5b9e-0d9f-41dd-adc0-c5025e9c4f40 ProcedureMention 7184 7194 00351605-d93c-49ef-a9d3-fece550de1a0 7208
3 332 441341 Mention 335 31cc3ae1-5693-4467-a85d-744a2f2e02e9 MedicationMention 5330 5350 5320 5340 00351605-d93c-49ef-a9d3-fece550de1a0 5366
4 365 441341 Mention 385 1a9bcb72-afe8-49cd-8dbc-8d80b8d88daa SignSymptomMention 6755 00351605-d93c-49ef-a9d3-fece550de1a0 6768

Add original text to mentions


In [19]:
mentions['text'] = mentions.apply(get_text_from_sentence, args=(notes,), axis=1)
mentions.head()


Out[19]:
begin doc_id elem_type end id mention_type ontology_arr sent_id xmi_id text
0 252 441341 Mention 262 77412b7d-2ffc-42f8-8896-ad218c1acda4 MedicationMention 5460 5480 5450 5470 00351605-d93c-49ef-a9d3-fece550de1a0 5496 Creatinine
1 275 441341 Mention 278 b6b9f099-e2b1-4625-8371-33a9bac4df14 ProcedureMention 7236 7246 00351605-d93c-49ef-a9d3-fece550de1a0 7260 BUN
2 303 441341 Mention 311 33ab5b9e-0d9f-41dd-adc0-c5025e9c4f40 ProcedureMention 7184 7194 00351605-d93c-49ef-a9d3-fece550de1a0 7208 dialysis
3 332 441341 Mention 335 31cc3ae1-5693-4467-a85d-744a2f2e02e9 MedicationMention 5330 5350 5320 5340 00351605-d93c-49ef-a9d3-fece550de1a0 5366 PVC
4 365 441341 Mention 385 1a9bcb72-afe8-49cd-8dbc-8d80b8d88daa SignSymptomMention 6755 00351605-d93c-49ef-a9d3-fece550de1a0 6768 within normal limits

Add sentence position to mentions


In [20]:
mentions = mentions.merge(sents_with_mentions[['sent_id', 'sentence_number']],
              on='sent_id')
mentions.head()


Out[20]:
begin doc_id elem_type end id mention_type ontology_arr sent_id xmi_id text sentence_number
0 252 441341 Mention 262 77412b7d-2ffc-42f8-8896-ad218c1acda4 MedicationMention 5460 5480 5450 5470 00351605-d93c-49ef-a9d3-fece550de1a0 5496 Creatinine 10
1 275 441341 Mention 278 b6b9f099-e2b1-4625-8371-33a9bac4df14 ProcedureMention 7236 7246 00351605-d93c-49ef-a9d3-fece550de1a0 7260 BUN 10
2 303 441341 Mention 311 33ab5b9e-0d9f-41dd-adc0-c5025e9c4f40 ProcedureMention 7184 7194 00351605-d93c-49ef-a9d3-fece550de1a0 7208 dialysis 10
3 332 441341 Mention 335 31cc3ae1-5693-4467-a85d-744a2f2e02e9 MedicationMention 5330 5350 5320 5340 00351605-d93c-49ef-a9d3-fece550de1a0 5366 PVC 10
4 365 441341 Mention 385 1a9bcb72-afe8-49cd-8dbc-8d80b8d88daa SignSymptomMention 6755 00351605-d93c-49ef-a9d3-fece550de1a0 6768 within normal limits 10

Prep Predicates DF

Transform predicates

Simple transformation. Just trim the frameset string of everything after the '.' (e.g. 'give.01' becomes 'give')


In [21]:
preds = transform_preds(preds)

Remove predicates not in sentences with mentions


In [22]:
print(preds.shape)
preds = preds[
    preds['sent_id'].isin( sents_with_mentions['sent_id'] )
]
print(preds.shape)


(4608, 9)
(3882, 9)

Add original text to predicates


In [23]:
preds['text'] = preds.apply(get_text_from_sentence, args=(notes,), axis=1)

Linking CUI codes to entities (mentions)

Assign cui codes to mentions (entities)

cTAKES over-generates CUI and TUI codes for text spans in a clinical note. There can be multiple coding schemes with a code for a term, and a CUI may apply to the original text span specifically or be a generalization or abstraction over the meaning of the span. For generating text we want the CUI that most closely matches the original text span. Future work could look at these generalizations to get a better sense of semantic meaning; however, that will require a deep understanding of the UMLS ontology and how to work with it to extract this kind of information.

For each mention:

  1. Collect all the umls concept rows (based on xmi_id) that are in the mention's ontology array
  2. Compute edit distance between the above umls rows' preferred text column and the mention's original text
  3. Sort edit distances in ascending order
  4. Choose the first umls concept row (a lower edit distance means the two texts are more similar)
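A toy illustration of this selection using get_cui and edit_dist from above (the xmi_ids and the second concept here are fabricated for the example; lev_norm is the normalized Levenshtein distance from stringdist):


In [ ]:
toy_umls = pd.DataFrame({
    'xmi_id': [1, 2],
    'cui': ['C0005845', 'C9999999'],                        # second CUI is made up
    'preferred_text': ['Blood urea nitrogen measurement',
                       'Urea nitrogen, unrelated concept']  # made-up preferred text
})
toy_mention = {'ontology_arr': '1 2', 'text': 'BUN'}
get_cui(toy_mention, toy_umls)  # returns the (cui, xmi_id, preferred_text) with the lowest distance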

In [30]:
mentions[['cui', 'umls_xmi_id', 'preferred_text']] = mentions.apply(
    get_cui, args=(umls,), axis=1, result_type='expand')
mentions.head()


Out[30]:
begin doc_id elem_type end id mention_type ontology_arr sent_id xmi_id text sentence_number cui umls_xmi_id preferred_text
0 252 441341 Mention 262 77412b7d-2ffc-42f8-8896-ad218c1acda4 MedicationMention 5460 5480 5450 5470 00351605-d93c-49ef-a9d3-fece550de1a0 5496 Creatinine 10 C0010294 5460 Creatinine
1 275 441341 Mention 278 b6b9f099-e2b1-4625-8371-33a9bac4df14 ProcedureMention 7236 7246 00351605-d93c-49ef-a9d3-fece550de1a0 7260 BUN 10 C0005845 7236 Blood urea nitrogen measurement
2 303 441341 Mention 311 33ab5b9e-0d9f-41dd-adc0-c5025e9c4f40 ProcedureMention 7184 7194 00351605-d93c-49ef-a9d3-fece550de1a0 7208 dialysis 10 C0011946 7184 Dialysis procedure
3 332 441341 Mention 335 31cc3ae1-5693-4467-a85d-744a2f2e02e9 MedicationMention 5330 5350 5320 5340 00351605-d93c-49ef-a9d3-fece550de1a0 5366 PVC 10 C0032624 5330 Polyvinyl Chloride
4 365 441341 Mention 385 1a9bcb72-afe8-49cd-8dbc-8d80b8d88daa SignSymptomMention 6755 00351605-d93c-49ef-a9d3-fece550de1a0 6768 within normal limits 10 C1265570 6755 Morphology within normal limits

Set the template tokens we're going to use

For mentions this is one of: the type of mention, the CUI code, or the two concatenated together (the alternatives are shown as comments below)

For predicates it is the frameset trimmed of everything after the '.'


In [31]:
mentions['template_token'] = mentions['mention_type']
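# Alternatives mentioned above (not used in this run):
#   mentions['template_token'] = mentions['cui']
#   mentions['template_token'] = mentions['mention_type'] + '_' + mentions['cui']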
preds['template_token'] = preds['frameset']

preds_toks = preds.apply(get_template_tokens, axis=1)
mentions_toks = mentions.apply(get_template_tokens, axis=1)

mentions_toks.groupby(['sent_id', 'end']).head()

preds_toks.groupby(['sent_id', 'end']).head()


Out[31]:
doc_id sent_id token begin end
0 458728 cb1777b8-3988-48dc-a6ee-57abb6203744 slide 115 122
1 458728 28899d39-f491-4230-9ebb-f742f1d132f7 suppose 134 142
2 458728 28899d39-f491-4230-9ebb-f742f1d132f7 be 146 148
3 458728 28899d39-f491-4230-9ebb-f742f1d132f7 find 166 171
4 458728 28899d39-f491-4230-9ebb-f742f1d132f7 be 178 180
5 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 be 289 292
6 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 verse 311 317
7 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 verse 376 382
8 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 give 383 388
9 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 verse 414 420
10 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 increase 443 452
11 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 be 525 527
12 458728 c9c66646-b14d-473f-8a11-ff44cd98d4e4 initiate 528 537
13 458728 438b1d10-3b69-476c-991e-2719ed230a35 give 556 561
14 458728 7fbe30c1-0239-497f-b09a-98ebe05b7463 start 600 607
15 458728 7fbe30c1-0239-497f-b09a-98ebe05b7463 increase 635 644
16 458728 78e9c81c-9b24-4762-874a-36e98f1dabbd verse 696 702
17 458728 78e9c81c-9b24-4762-874a-36e98f1dabbd discontinue 703 715
18 458728 78e9c81c-9b24-4762-874a-36e98f1dabbd be 731 734
19 458728 78e9c81c-9b24-4762-874a-36e98f1dabbd start 735 742
20 458728 bda4697a-31aa-4d06-8409-e0596357fbb4 monitor 747 756
21 458728 0f7bcc60-bf7d-4e82-bf68-0ae233cf9fd5 sats 935 939
22 458728 0f7bcc60-bf7d-4e82-bf68-0ae233cf9fd5 drop 943 950
23 458728 0f7bcc60-bf7d-4e82-bf68-0ae233cf9fd5 see 991 995
24 458728 0f7bcc60-bf7d-4e82-bf68-0ae233cf9fd5 tinge 1028 1034
25 458728 4a558a51-40f8-487e-a4e4-a33fc07dd535 change 1071 1078
26 458728 4a558a51-40f8-487e-a4e4-a33fc07dd535 achieve 1085 1092
27 458728 4a558a51-40f8-487e-a4e4-a33fc07dd535 increase 1128 1137
28 458728 4a558a51-40f8-487e-a4e4-a33fc07dd535 increase 1163 1172
29 458728 19c33efc-96fc-40cb-9ea3-f6ce9caf1310 continue 1232 1241
... ... ... ... ... ...
15 497748 266fb788-c10e-4f75-adc5-2159d8519537 resolve 792 800
16 497748 266fb788-c10e-4f75-adc5-2159d8519537 resolve 831 839
17 497748 c69890a4-d315-4b9f-9b78-b1cb1ab9c1b2 transfer 863 874
18 497748 df314bb5-10c3-4898-ac1d-9a805426ee22 have 1008 1011
19 497748 df314bb5-10c3-4898-ac1d-9a805426ee22 be 1012 1016
20 497748 df314bb5-10c3-4898-ac1d-9a805426ee22 defer 1017 1025
24 497748 7c34ba60-b573-44dc-a9a3-d04f18735712 require 1446 1454
25 497748 7c34ba60-b573-44dc-a9a3-d04f18735712 suction 1455 1465
26 497748 ac6890fb-5d90-47dd-9a15-2126db7a30a2 remain 1631 1637
28 497748 6a325968-9656-4b72-bdce-6fb4efed7a0b follow 1736 1744
29 497748 6a325968-9656-4b72-bdce-6fb4efed7a0b alternate 1771 1782
0 438154 4838eada-a1b4-4ec0-908b-4412c18ede87 monitor 71 80
1 438154 4838eada-a1b4-4ec0-908b-4412c18ede87 continue 148 157
2 438154 4838eada-a1b4-4ec0-908b-4412c18ede87 make 210 214
4 438154 28763a14-3d66-428c-ac61-ac2fa34164ad continue 283 291
5 438154 28763a14-3d66-428c-ac61-ac2fa34164ad monitor 292 302
6 438154 28763a14-3d66-428c-ac61-ac2fa34164ad stabilize 311 321
7 438154 28763a14-3d66-428c-ac61-ac2fa34164ad encourage 323 332
8 438154 a81e1460-2f06-46ed-8a2b-5b44100e2d94 have 372 375
9 438154 a81e1460-2f06-46ed-8a2b-5b44100e2d94 stabilize 376 386
12 438154 d7634bad-e7da-4277-a5ba-b5bf8d349756 have 493 496
13 438154 d7634bad-e7da-4277-a5ba-b5bf8d349756 receive 497 505
17 438154 465019a9-ef94-490e-9bca-c28bd34759ab start 687 694
18 438154 95dd0f6b-01cc-4cab-b422-9fed7fed5ddb have 713 716
19 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c find 777 782
20 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c drain 810 818
21 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c witness 825 834
22 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c say 863 867
23 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c be 871 873
24 438154 e1c23cab-a534-4169-a30a-3cc4acbbc76c urinate 874 883

3882 rows × 5 columns

Append the two template tokens dataframes


In [32]:
template_tokens = pd.concat([preds_toks, mentions_toks])  # DataFrame.append is deprecated; concat is equivalent here
temp_tokens = template_tokens.groupby(['sent_id']).apply(lambda x: x.sort_values(['begin']))

In [33]:
temp_tokens.head()


Out[33]:
doc_id sent_id token begin end
sent_id
00351605-d93c-49ef-a9d3-fece550de1a0 0 441341 00351605-d93c-49ef-a9d3-fece550de1a0 MedicationMention 252 262
1 441341 00351605-d93c-49ef-a9d3-fece550de1a0 ProcedureMention 275 278
2 441341 00351605-d93c-49ef-a9d3-fece550de1a0 ProcedureMention 303 311
2 441341 00351605-d93c-49ef-a9d3-fece550de1a0 have 313 319
3 441341 00351605-d93c-49ef-a9d3-fece550de1a0 MedicationMention 332 335

Get the semantic templates

Group the rows of the template tokens dataframe by sentence id and join their tokens into a single string, sorting by begin offset first.


In [34]:
sem_templates = template_tokens.sort_values('begin').groupby('sent_id')['token'].apply(' '.join)
sem_templates.head()


Out[34]:
sent_id
00351605-d93c-49ef-a9d3-fece550de1a0    MedicationMention ProcedureMention ProcedureMe...
00a8bb1c-4186-4499-9a54-ff58e6115817                               DiseaseDisorderMention
01175ae6-57d5-4c77-b51f-fcc5bbd86685          be SignSymptomMention AnatomicalSiteMention
0125390f-61cd-44cf-a940-88174fd05057                                   SignSymptomMention
0155fb93-ce86-4f5f-8436-1a7a431185c2    SignSymptomMention complain SignSymptomMention...
Name: token, dtype: object

In [35]:
temp_tokens.token.unique().shape


Out[35]:
(379,)

In [49]:
sem_df = pd.DataFrame(sem_templates)  # wrap the Series of templates in a DataFrame
sem_df.head()

sem_df.reset_index(level=0, inplace=True)

sem_df = sem_df.rename(columns={'token': 'sem_template'})

sem_df = sem_df.merge(sents[['sent_id', 'sentence_number', 'doc_id', 'begin', 'end']],
                      on='sent_id')

In [50]:
sem_df.head()


Out[50]:
sent_id sem_template sentence_number doc_id begin end
0 00351605-d93c-49ef-a9d3-fece550de1a0 MedicationMention ProcedureMention ProcedureMe... 10 441341 252 386
1 00a8bb1c-4186-4499-9a54-ff58e6115817 DiseaseDisorderMention 74 458728 3972 3977
2 01175ae6-57d5-4c77-b51f-fcc5bbd86685 be SignSymptomMention AnatomicalSiteMention 4 414695 341 390
3 0125390f-61cd-44cf-a940-88174fd05057 SignSymptomMention 52 361823 3452 3463
4 0155fb93-ce86-4f5f-8436-1a7a431185c2 SignSymptomMention complain SignSymptomMention... 0 378613 0 166

Gather corpus statistics

Average sentences per doc


In [36]:
avg_sents_per_doc = sents.groupby('doc_id').size().mean()
print(avg_sents_per_doc)


42.52

Average sentences w/ entities per doc


In [37]:
avg_sents_with_ents_per_doc = sents_with_mentions.groupby('doc_id').size().mean()
print(avg_sents_with_ents_per_doc)


28.306122448979593

Count of unique CUIs (when removing overlapping text spans)


In [38]:
print(mentions['cui'].nunique())


1042

Average # of CUIs per doc


In [39]:
mentions.groupby('doc_id').size().mean()


Out[39]:
62.234693877551024

Average # of CUIs per sentence


In [40]:
mentions.groupby('sent_id').size().mean()


Out[40]:
2.1986301369863015

Average # of words per doc (excluding newline tokens and symbols)


In [41]:
tokens = tokens[(~tokens['sent_id'].isnull()) & (tokens['token_type'] != 'NewlineToken')]

In [42]:
wc_by_doc = tokens.groupby('doc_id')['id'].count().reset_index(name='count')
wc_by_doc['count'].mean()


Out[42]:
475.42

Average # of words per sentence


In [43]:
wc_by_sentence = tokens.groupby('sent_id')['id'].count().reset_index(name='count')
wc_by_sentence['count'].mean()


Out[43]:
11.181091251175918

Get frequency of mentions


In [44]:
mention_counts = mentions.groupby('mention_type').size().reset_index(name='count')
mention_counts


Out[44]:
mention_type count
0 AnatomicalSiteMention 758
1 DiseaseDisorderMention 1258
2 MedicationMention 1206
3 ProcedureMention 1033
4 SignSymptomMention 1844

In [45]:
mention_counts['frequency'] = mention_counts['count'] / mention_counts['count'].sum()
mention_counts


Out[45]:
mention_type count frequency
0 AnatomicalSiteMention 758 0.124283
1 DiseaseDisorderMention 1258 0.206263
2 MedicationMention 1206 0.197737
3 ProcedureMention 1033 0.169372
4 SignSymptomMention 1844 0.302345

Frequency of mentions by sentence position


In [46]:
mentions_by_pos = pd.crosstab(
                        mentions['mention_type'],
                        mentions['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
mentions_by_pos


Out[46]:
sentence_number 0 1 2 3 4 5 6 7 8 9 ... 73 74 75 76 77 78 79 80 81 82
mention_type
AnatomicalSiteMention 0.094463 0.105263 0.117904 0.108108 0.183673 0.116883 0.081081 0.123711 0.090909 0.129310 ... 0.125 0.000000 0.000000 0.25 0.0 0.00 0.166667 0.000000 0.0 0.0
DiseaseDisorderMention 0.485342 0.277193 0.170306 0.099099 0.158163 0.129870 0.087838 0.144330 0.206061 0.129310 ... 0.250 0.333333 0.000000 0.25 0.0 0.50 0.333333 0.000000 1.0 0.0
MedicationMention 0.032573 0.105263 0.174672 0.234234 0.260204 0.305195 0.344595 0.247423 0.206061 0.353448 ... 0.250 0.333333 0.333333 0.00 0.5 0.00 0.166667 0.666667 0.0 0.0
ProcedureMention 0.104235 0.168421 0.183406 0.202703 0.173469 0.149351 0.168919 0.216495 0.187879 0.215517 ... 0.000 0.333333 0.333333 0.25 0.0 0.25 0.333333 0.333333 0.0 0.0
SignSymptomMention 0.283388 0.343860 0.353712 0.355856 0.224490 0.298701 0.317568 0.268041 0.309091 0.172414 ... 0.375 0.000000 0.333333 0.25 0.5 0.25 0.000000 0.000000 0.0 1.0

5 rows × 83 columns

Frequency of CUIs


In [47]:
cui_counts = mentions.groupby('cui').size().reset_index(name='count')
cui_counts = cui_counts.sort_values('count', ascending=False).reset_index(drop=True)
cui_counts.head(10)


Out[47]:
cui count
0 C0270724 193
1 C0030193 69
2 C0015967 69
3 C0024467 65
4 C1145670 56
5 C0587081 52
6 C0015846 48
7 C0010200 47
8 C0042313 47
9 C0278060 47

In [48]:
cui_counts['frequency'] = cui_counts['count'] / cui_counts['count'].sum()
cui_counts.head(10)


Out[48]:
cui count frequency
0 C0270724 193 0.031645
1 C0030193 69 0.011313
2 C0015967 69 0.011313
3 C0024467 65 0.010657
4 C1145670 56 0.009182
5 C0587081 52 0.008526
6 C0015846 48 0.007870
7 C0010200 47 0.007706
8 C0042313 47 0.007706
9 C0278060 47 0.007706

Frequency with preferred text


In [51]:
cui_counts_with_text = cui_counts.merge(mentions[['cui', 'preferred_text']], on='cui') \
                        .drop_duplicates('cui') \
                        .reset_index(drop=True)

cui_counts_with_text.head(10)


Out[51]:
cui count frequency preferred_text
0 C0270724 193 0.031645 Infantile Neuroaxonal Dystrophy
1 C0030193 69 0.011313 Pain
2 C0015967 69 0.011313 Fever
3 C0024467 65 0.010657 Magnesium
4 C1145670 56 0.009182 Respiratory Failure
5 C0587081 52 0.008526 Laboratory test finding
6 C0015846 48 0.007870 Fentanyl
7 C0010200 47 0.007706 Coughing
8 C0042313 47 0.007706 Vancomycin
9 C0278060 47 0.007706 Mental state

Frequency of CUIs by sentence position


In [45]:
cui_by_pos = pd.crosstab(mentions['cui'], mentions['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
cui_by_pos.head()


Out[45]:
sentence_number 0 1 2 3 4 5 6 7 8 9 ... 73 74 75 76 77 78 79 80 81 82
cui
C0000726 0.000000 0.000000 0.0 0.000000 0.005102 0.0 0.0 0.0 0.000000 0.017241 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
C0000731 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
C0000737 0.003257 0.007018 0.0 0.009009 0.000000 0.0 0.0 0.0 0.006061 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
C0000768 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
C0000833 0.006515 0.000000 0.0 0.000000 0.005102 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 83 columns


In [154]:
cui_by_pos.loc[:, 0].sort_values(ascending=False)[:10]


Out[154]:
cui
C1145670    0.032573
C0339897    0.029316
C0262926    0.026059
C0013404    0.022801
C0018802    0.022801
C0024117    0.019544
C0013687    0.019544
C0023890    0.016287
C0278060    0.016287
C0020538    0.016287
Name: 0, dtype: float64

Number of unique templates


In [183]:
sem_df.head()


Out[183]:
sent_id sem_template sentence_number doc_id begin end
0 000142d2-4690-4a56-8a68-89b5831ed2aa ProcedureMention appear 58 408714 3670 3716
1 001ca15d-0e94-4933-b376-5123e22e5b13 MedicationMention SignSymptomMention 17 442499 1515 1542
2 002d5022-70f9-4638-84b1-dcf42a6c0e12 be be follow SignSymptomMention AnatomicalSite... 10 354315 718 801
3 00385844-a95d-4bb7-a773-70f78a3b035a continue titrate MedicationMention SignSymptom... 17 432020 1451 1520
4 0046177c-aa3b-4c8d-9554-3efea4802687 be give MedicationMention MedicationMention Me... 5 464828 548 618

In [160]:
sem_df['sem_template'].nunique()


Out[160]:
1242

Frequency of templates (identified by sentence number)


In [114]:
count_temps = sem_df.groupby('sem_template').size().reset_index(name='count')
count_temps = count_temps.sort_values('count', ascending=False).reset_index(drop=True)
count_temps.head(10)


Out[114]:
sem_template count
0 DiseaseDisorderMention 321
1 SignSymptomMention 141
2 MedicationMention 72
3 ProcedureMention 69
4 DiseaseDisorderMention DiseaseDisorderMention 41
5 AnatomicalSiteMention 40
6 SignSymptomMention SignSymptomMention 26
7 alter SignSymptomMention DiseaseDisorderMention 20
8 MedicationMention give 14
9 SignSymptomMention SignSymptomMention SignSymp... 14

In [115]:
count_temps['frequency'] = count_temps['count'] / count_temps['count'].sum()
count_temps.head(10)


Out[115]:
sem_template count frequency
0 DiseaseDisorderMention 321 0.115717
1 SignSymptomMention 141 0.050829
2 MedicationMention 72 0.025955
3 ProcedureMention 69 0.024874
4 DiseaseDisorderMention DiseaseDisorderMention 41 0.014780
5 AnatomicalSiteMention 40 0.014420
6 SignSymptomMention SignSymptomMention 26 0.009373
7 alter SignSymptomMention DiseaseDisorderMention 20 0.007210
8 MedicationMention give 14 0.005047
9 SignSymptomMention SignSymptomMention SignSymp... 14 0.005047

Frequency of templates by sentence position


In [41]:
sem_df.head()


Out[41]:
sent_id sem_template sentence_number doc_id begin end sentence_number
0 00351605-d93c-49ef-a9d3-fece550de1a0 MedicationMention ProcedureMention ProcedureMe... 10 441341 252 386 10
1 00a8bb1c-4186-4499-9a54-ff58e6115817 DiseaseDisorderMention 74 458728 3972 3977 74
2 01175ae6-57d5-4c77-b51f-fcc5bbd86685 be SignSymptomMention AnatomicalSiteMention 4 414695 341 390 4
3 0125390f-61cd-44cf-a940-88174fd05057 SignSymptomMention 52 361823 3452 3463 52
4 0155fb93-ce86-4f5f-8436-1a7a431185c2 SignSymptomMention complain SignSymptomMention... 0 378613 0 166 0

In [48]:
# The merge above introduced a duplicate sentence_number column (visible in Out[41]),
# so this selection returns a two-column DataFrame rather than a Series.
sem_df['sentence_number'].shape


Out[48]:
(2774, 2)

In [51]:
temp_by_pos = pd.crosstab(sem_df['sem_template'], sem_df['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
temp_by_pos.head()


Out[51]:
sentence_number 0 1 2 3 4 5 6 7 8 9 ... 73 74 75 76 77 78 79 80 81 82
sem_template
AnatomicalSiteMention 0.0 0.0 0.012987 0.0000 0.0 0.014286 0.016129 0.015385 0.014286 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AnatomicalSiteMention AnatomicalSiteMention 0.0 0.0 0.000000 0.0000 0.0 0.000000 0.000000 0.000000 0.000000 0.018519 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AnatomicalSiteMention AnatomicalSiteMention SignSymptomMention 0.0 0.0 0.000000 0.0000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AnatomicalSiteMention AnatomicalSiteMention SignSymptomMention observe drop 0.0 0.0 0.012987 0.0000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
AnatomicalSiteMention DiseaseDisorderMention 0.0 0.0 0.012987 0.0125 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 83 columns

Write dataframes to parquet

We write these to parquet files so they can be used by a separate notebook for clustering and note generation. This is just prep work for those processes.


In [98]:
df_dir = 'data/processed_dfs'
# Write sentences, mentions, predicates, and umls concepts to parquet, sem_df

In [103]:
sents_with_mentions.to_parquet(f'{df_dir}/sentences.parquet')

mentions.to_parquet(f'{df_dir}/mentions.parquet')

preds.to_parquet(f'{df_dir}/predicates.parquet')

umls.to_parquet(f'{df_dir}/umls.parquet')

sem_df.to_parquet(f'{df_dir}/templates.parquet')

temp_by_pos.to_parquet(f'{df_dir}/templates_by_pos.parquet')
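For reference, a sketch of how the downstream notebook might read these back (assuming the same df_dir; pandas uses pyarrow or fastparquet under the hood):


In [ ]:
sents_check = pd.read_parquet(f'{df_dir}/sentences.parquet')
templates_check = pd.read_parquet(f'{df_dir}/templates.parquet')
sents_check.shape, templates_check.shape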
