In [5]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from fastparquet import ParquetFile
import os
# from ctakes_xml import CtakesXmlParser
# from sklearn.feature_extraction.text import TfidfVectorizer
from stringdist import levenshtein_norm as lev_norm
# import matplotlib
# %matplotlib inline
In [7]:
notes_file = 'synthnotes/data/note-events.parquet'
pq_root_path = 'synthnotes/data/xml_extracted'
In [8]:
pf = ParquetFile(notes_file)
df = pf.to_pandas()
In [9]:
xml_dir = 'synthnotes/data/xml_files'
xml_files = os.listdir(xml_dir)
ids = [int(f.split('.txt.xmi')[0]) for f in xml_files]
In [10]:
notes = df[df.ROW_ID.isin(ids)]
notes = notes.reset_index(drop=True)
notes = notes.drop(['CHARTDATE','CHARTTIME','STORETIME','CGID','ISERROR'],axis=1)
In [6]:
def get_notes_sample(df, n=100, category='Nursing'):
    notes = df[df['CATEGORY'] == category]
    notes = notes[notes['ISERROR'].isnull()]
    notes = notes[notes['DESCRIPTION'] == 'Generic Note']
    notes = notes.sample(n=n)
    notes = notes.reset_index(drop=True)
    return notes
In [7]:
# parser = CtakesXmlParser()
# schemas = list()
# for file in xml_files:
# xml_out = parser.parse(f'{xml_dir}/{file}')
# for k, v in xml_out.items():
# feature_df = pd.DataFrame(list(v))
# if feature_df.shape[0] > 0:
# table = pa.Table.from_pandas(feature_df)
# pq.write_to_dataset(table, f'{pq_root_path}/{k}')
# else:
# print(f"{k} was empty for {file}")
The plan:
For each sentence in all documents:
1. Get the predicates for the sentence
2. Get the entities for the sentence
3. For each entity:
   - append the CUI code from its UMLS concept to the end
4. Combine the predicates and entities and sort them by their begin position
5. Save the result to a dataframe (a sketch of this loop follows the list)
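A minimal, self-contained sketch of steps 3-5 on made-up data (the sentence id, offsets, tokens, and CUIs below are invented for illustration; they only mirror the column names used later in this notebook):
import pandas as pd
toy_preds = pd.DataFrame({
    'sent_id': ['s1', 's1'],
    'begin':   [8, 30],
    'token':   ['admit', 'report'],   # frameset-derived predicate tokens
})
toy_entities = pd.DataFrame({
    'sent_id': ['s1', 's1'],
    'begin':   [0, 15],
    'token':   ['DiseaseDisorderMention_C0020538',   # mention type with CUI appended
                'SignSymptomMention_C0008031'],
})
combined = pd.concat([toy_preds, toy_entities])
templates = (combined.sort_values('begin')
             .groupby('sent_id')['token']
             .apply(' '.join))
print(templates['s1'])
# DiseaseDisorderMention_C0020538 admit SignSymptomMention_C0008031 report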
In [29]:
def get_df_from_pq(root, name):
return pq.read_table(f'{root}/{name}').to_pandas()
def transform_preds(df):
df['frameset'] = df['frameset'].apply(lambda x: x.split('.')[0])
return df
def transform_mentions(mentions):
    # errors='ignore' so this doesn't fail if these columns were already removed
    mentions = mentions.drop(
        ['conditional', 'history_of', 'generic', 'polarity', 'discovery_technique', 'subject'],
        axis=1, errors='ignore')
sorted_df = mentions.groupby(['sent_id', 'begin']) \
.apply(lambda x: x.sort_values(['begin', 'end']))
# Drop the mentions that are parts of a larger span. Only keep the containing span that holds multiple
# mentions
deduped = sorted_df.drop_duplicates(['sent_id', 'begin'], keep='last')
deduped = deduped.drop_duplicates(['sent_id', 'end'], keep='first')
return deduped.reset_index(drop=True)
def set_template_token(df, column):
df['template_token'] = df[column]
return df
def get_template_tokens(row):
return pd.Series({
'doc_id': row['doc_id'],
'sent_id': row['sent_id'],
'token': row['template_token'],
'begin': row['begin'],
'end': row['end']
})
# def merge_mentions_umls(mentions, umls):
# umls['umls_xmi_id'] = umls['xmi_id']
# mentions = mentions.merge(umls[['umls_xmi_id', 'cui']], on='umls_xmi_id')
# return mentions
# def umls_dedup(umls):
# return umls.drop_duplicates(subset=['cui'])
# def set_umls_join_key(umls):
# umls['umls_xmi_id'] = umls['xmi_id']
# return umls
def set_sentence_pos(df):
df = df.groupby(["doc_id"]).apply(lambda x: x.sort_values(["begin"])).reset_index(drop=True)
df['sentence_number'] = df.groupby("doc_id").cumcount()
return df
def get_root_verb(row):
    # Placeholder: extracting the root verb of a sentence is not implemented yet
    pass
def extract_sent(row):
begin = row['begin']
end = row['end']
row['TEXT'] = row['TEXT'][begin:end]
return row
def write_notes(row):
fn = f'raw_notes/{row["ROW_ID"]}'
with open(fn, 'w') as f:
f.write(row['TEXT'])
def get_text_from_sentence(row, notes):
doc = notes[notes['ROW_ID'] == row['doc_id']]
b = row['begin']
e = row['end']
return doc['TEXT'].iloc[0][b:e]
def edit_dist(row, term2):
term1 = row.loc['preferred_text']
return lev_norm(term1, term2)
def get_cui(mention, umls_df):
    # xmi ids of the UMLS concepts that cTAKES linked to this mention
    ont_arr = list(map(int, mention['ontology_arr'].split()))
    ment_text = mention['text']
    concepts = umls_df[umls_df['xmi_id'].isin(ont_arr)].loc[:, ['cui', 'preferred_text', 'xmi_id']]
    # Rank the candidate concepts by normalized edit distance between their
    # preferred text and the mention's span text, then keep the closest one
    concepts['dist'] = concepts.apply(edit_dist, args=(ment_text,), axis=1)
    sorted_df = concepts.sort_values(by='dist', ascending=True).reset_index(drop=True)
    cui = sorted_df['cui'].iloc[0]
    xmi_id = sorted_df['xmi_id'].iloc[0]
    pref_text = sorted_df['preferred_text'].iloc[0]
    return cui, xmi_id, pref_text
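To make the span de-duplication in transform_mentions above concrete, here is a simplified sketch on invented spans (one containing mention that covers two smaller ones; only the containing span should survive):
toy_mentions = pd.DataFrame({
    'sent_id': ['s1', 's1', 's1'],
    'begin':   [10, 10, 14],
    'end':     [13, 22, 22],
    'mention_type': ['SignSymptomMention', 'DiseaseDisorderMention', 'AnatomicalSiteMention'],
})
deduped = (toy_mentions.sort_values(['begin', 'end'])
           .drop_duplicates(['sent_id', 'begin'], keep='last')    # keep the widest span per begin
           .drop_duplicates(['sent_id', 'end'], keep='first'))    # keep the earliest span per end
print(deduped)   # only the containing span (begin=10, end=22) remains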
In [12]:
preds = get_df_from_pq(pq_root_path, 'predicates')
mentions = get_df_from_pq(pq_root_path, 'mentions')
umls = get_df_from_pq(pq_root_path, 'umls_concepts')
sents = get_df_from_pq(pq_root_path, 'sentences')
tokens = get_df_from_pq(pq_root_path, 'tokens')
In [13]:
sents = sents.rename({'id': 'sent_id'}, axis=1)
sents.head()
Out[13]:
In [14]:
sents = sents.rename({'id': 'sent_id'}, axis=1)
sents = sents.merge(notes[['ROW_ID', 'TEXT']],
left_on='doc_id', right_on='ROW_ID').drop('ROW_ID', axis=1)
sents = sents.apply(extract_sent, axis=1)
sents = sents.rename({'TEXT': 'text'}, axis=1)
In [15]:
sents = set_sentence_pos(sents)
In [16]:
sents_with_mentions = sents[
sents['sent_id'].isin(
mentions.drop_duplicates(subset='sent_id')['sent_id']
)
]
In [17]:
umls = umls[~umls['preferred_text'].isna()]
In [18]:
mentions = get_df_from_pq(pq_root_path, 'mentions')
mentions = transform_mentions(mentions)
mentions.head()
Out[18]:
In [19]:
mentions['text'] = mentions.apply(get_text_from_sentence, args=(notes,), axis=1)
mentions.head()
Out[19]:
In [20]:
mentions = mentions.merge(sents_with_mentions[['sent_id', 'sentence_number']],
on='sent_id')
mentions.head()
Out[20]:
In [21]:
preds = transform_preds(preds)
In [22]:
print(preds.shape)
preds = preds[
preds['sent_id'].isin( sents_with_mentions['sent_id'] )
]
print(preds.shape)
In [23]:
preds['text'] = preds.apply(get_text_from_sentence, args=(notes,), axis=1)
cTAKES over-generates CUI and TUI codes for the text spans in a clinical note: multiple coding schemes may have a code for the same term, and a given CUI can apply specifically to the original text span or be a generalization or abstraction of its meaning. For generating text we want the CUI that most closely matches the original span. Future work could use these generalizations to get a better sense of the semantic meaning, but that will require a deeper understanding of the UMLS ontology and how to query it for this kind of information.
For each mention: among the UMLS concepts cTAKES attached to it, keep the one whose preferred text has the smallest normalized Levenshtein distance to the mention's span text, and take that concept's CUI (this is what get_cui above does). A small illustration follows.
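A toy illustration of that selection (the mention text, preferred texts, and CUIs below are made up for demonstration; it reuses the pd and lev_norm imports from the top of the notebook):
candidates = pd.DataFrame({
    'cui': ['C0020538', 'C0497247'],
    'preferred_text': ['Hypertensive disease', 'Increased blood pressure'],
})
mention_text = 'hypertension'
# normalized Levenshtein distance between each preferred text and the span text
candidates['dist'] = candidates['preferred_text'].apply(lambda t: lev_norm(t, mention_text))
best = candidates.sort_values('dist').iloc[0]
print(best['cui'], best['preferred_text'])   # the row closest to 'hypertension' wins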
In [30]:
mentions[['cui', 'umls_xmi_id', 'preferred_text']] = mentions. \
apply(get_cui, args=(umls,), axis=1, result_type='expand')
mentions.head()
Out[30]:
In [31]:
mentions['template_token'] = mentions['mention_type']
preds['template_token'] = preds['frameset']
preds_toks = preds.apply(get_template_tokens, axis=1)
mentions_toks = mentions.apply(get_template_tokens, axis=1)
mentions_toks.groupby(['sent_id', 'end']).head()
preds_toks.groupby(['sent_id', 'end']).head()
Out[31]:
In [32]:
template_tokens = pd.concat([preds_toks, mentions_toks])   # combine predicate and mention tokens
temp_tokens = template_tokens.groupby(['sent_id']).apply(lambda x: x.sort_values(['begin']))
In [33]:
temp_tokens.head()
Out[33]:
In [34]:
sem_templates = template_tokens.sort_values('begin').groupby('sent_id')['token'].apply(' '.join)
sem_templates.head()
Out[34]:
In [35]:
temp_tokens.token.unique().shape
Out[35]:
In [49]:
sem_df = pd.DataFrame(sem_templates)   # per-sentence template strings (joined tokens) as a dataframe
sem_df.head()
sem_df.reset_index(level=0, inplace=True)
sem_df = sem_df.rename(columns={'token': 'sem_template'})
sem_df = sem_df.merge(sents[['sent_id', 'sentence_number', 'doc_id', 'begin', 'end']],
left_on='sent_id', right_on='sent_id' )#.drop('id', axis=1)
In [50]:
sem_df.head()
Out[50]:
In [36]:
avg_sents_per_doc = sents.groupby('doc_id').size().mean()
print(avg_sents_per_doc)
In [37]:
avg_sents_with_ents_per_doc = sents_with_mentions.groupby('doc_id').size().mean()
print(avg_sents_with_ents_per_doc)
In [38]:
print(mentions['cui'].nunique())
In [39]:
mentions.groupby('doc_id').size().mean()
Out[39]:
In [40]:
mentions.groupby('sent_id').size().mean()
Out[40]:
In [41]:
tokens = tokens[(~tokens['sent_id'].isnull()) & (tokens['token_type'] != 'NewlineToken')]
In [42]:
wc_by_doc = tokens.groupby('doc_id').count()['id'].reset_index(name='count')
wc_by_doc['count'].mean()
Out[42]:
In [43]:
wc_by_sentence = tokens.groupby('sent_id')['id'].count().reset_index(name='count')
wc_by_sentence['count'].mean()
Out[43]:
In [44]:
mention_counts = mentions.groupby('mention_type').size().reset_index(name='count')
mention_counts
Out[44]:
In [45]:
mention_counts['frequency'] = mention_counts['count'] / mention_counts['count'].sum()
mention_counts
Out[45]:
In [46]:
mentions_by_pos = pd.crosstab(
mentions['mention_type'],
mentions['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
mentions_by_pos
Out[46]:
In [47]:
cui_counts = mentions.groupby('cui').size().reset_index(name='count')
cui_counts = cui_counts.sort_values('count', ascending=False).reset_index(drop=True)
cui_counts.head(10)
Out[47]:
In [48]:
cui_counts['frequency'] = cui_counts['count'] / cui_counts['count'].sum()
cui_counts.head(10)
Out[48]:
In [51]:
cui_counts_with_text = cui_counts.merge(mentions[['cui', 'preferred_text']], on='cui') \
.drop_duplicates('cui') \
.reset_index(drop=True)
cui_counts_with_text.head(10)
Out[51]:
In [45]:
cui_by_pos = pd.crosstab(mentions['cui'], mentions['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
cui_by_pos.head()
Out[45]:
In [154]:
cui_by_pos.loc[:, 0].sort_values(ascending=False)[:10]
Out[154]:
In [183]:
sem_df.head()
Out[183]:
In [160]:
sem_df['sem_template'].nunique()
Out[160]:
In [114]:
count_temps = sem_df.groupby('sem_template').size().reset_index(name='count')
count_temps = count_temps.sort_values('count', ascending=False).reset_index(drop=True)
count_temps.head(10)
Out[114]:
In [115]:
count_temps['frequency'] = count_temps['count'] / count_temps['count'].sum()
count_temps.head(10)
Out[115]:
In [41]:
sem_df.head()
Out[41]:
In [48]:
sem_df['sentence_number'].shape
Out[48]:
In [51]:
temp_by_pos = pd.crosstab(sem_df['sem_template'], sem_df['sentence_number']).apply(lambda x: x / x.sum(), axis=0)
temp_by_pos.head()
Out[51]:
In [98]:
df_dir = 'data/processed_dfs'
# Write the sentences, mentions, predicates, UMLS concepts, and template dataframes to parquet
In [103]:
sents_with_mentions.to_parquet(f'{df_dir}/sentences.parquet')
mentions.to_parquet(f'{df_dir}/mentions.parquet')
preds.to_parquet(f'{df_dir}/predicates.parquet')
umls.to_parquet(f'{df_dir}/umls.parquet')
sem_df.to_parquet(f'{df_dir}/templates.parquet')
temp_by_pos.to_parquet(f'{df_dir}/templates_by_pos.parquet')