In [151]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from fastparquet import ParquetFile
import os
from ctakes_xml import CtakesXmlParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, MiniBatchKMeans
import matplotlib
%matplotlib inline
In [ ]:
In [177]:
# do the reading
templates = pd.read_parquet('data/processed_dfs/templates.parquet')
sentences = pd.read_parquet('data/processed_dfs/sentences.parquet')
mentions = pd.read_parquet('data/processed_dfs/mentions.parquet')
umls = pd.read_parquet('data/processed_dfs/umls.parquet')
In [179]:
sentences.head()
Out[179]:
In [180]:
mentions.head()
Out[180]:
In [181]:
templates.head()
Out[181]:
1. Make an empty data frame with the fields to hold template info
2. For each sentence:
    * Get the predicates for that sentence
    * Trim the frameset after the '.'
    * Get the mentions
    * Get the mention type
    * Append the UMLS CUI to the end of the mention (just take the first one)
    * Order the predicates and mentions by begin offset
    * Combine into a string separated by spaces
    * Write the template and semantic template to the dataframe
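An illustrative sketch of these steps is below. It is not the code that produced templates.parquet; it assumes a hypothetical `predicates` dataframe (with `sent_id`, `begin`, and `frameset` columns) from the cTAKES parse, and that the semantic template joins mention types with their first CUI.
In [ ]:
# Sketch only -- the real preprocessing happened upstream of this notebook.
# `predicates` is a hypothetical dataframe and is not loaded here.
rows = []
for _, sent in sentences.iterrows():
    # predicates for this sentence, with the frameset trimmed after the '.'
    preds = predicates[predicates.sent_id == sent.sent_id].copy()
    preds['token'] = preds.frameset.str.split('.').str[0]

    # mentions for this sentence: mention type with the (first) UMLS CUI appended
    ments = mentions[mentions.sent_id == sent.sent_id].copy()
    ments['token'] = ments.mention_type + '_' + ments.cui

    # order predicates and mentions by begin offset, then join with spaces
    tokens = pd.concat([preds[['begin', 'token']], ments[['begin', 'token']]])
    sem_template = ' '.join(tokens.sort_values('begin').token)
    rows.append({'sent_id': sent.sent_id, 'sem_template': sem_template})

# templates = pd.DataFrame(rows)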
In [186]:
print(len(templates))
# templates = templates.drop_duplicates('sem_template')
# print(len(templates))
In [188]:
def get_vectors(df):
    tf = TfidfVectorizer()
    return tf.fit_transform(df['sem_template'])
# Only use unique templates
vectors = get_vectors(templates)
vecd = vectors.todense()
print(vectors.shape)
In [189]:
cluster_sizes = [70, 80, 90, 100, 110, 120, 125, 130, 140, 150, 200]
for n_cluster in cluster_sizes:
    km = KMeans(init='k-means++', max_iter=100, n_init=1,
                n_clusters=n_cluster, verbose=False)
    km.fit(vectors)
    predictions = km.predict(vectors)
    sil_score = silhouette_score(vectors, predictions, metric='euclidean')
    print(f"Silhouette score for n_clusters={n_cluster}:")
    print(sil_score)

km = KMeans(init='k-means++', max_iter=100, n_init=1,
            n_clusters=120, verbose=False)
km.fit(vectors)
predictions = km.predict(vectors)
sil_score = silhouette_score(vectors, predictions, metric='euclidean')

# print(km.cluster_centers_.shape)
# order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# terms = tf.get_feature_names()
# for i in range(50):
#     print("Cluster %d:" % i, end='')
#     for ind in order_centroids[i, :15]:
#         print(' %s' % terms[ind], end='')
#     print()
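To compare these silhouette scores side by side, here is a small optional sketch (assuming `vectors` and `cluster_sizes` from the cells above) that refits each model, collects the scores, and plots them:
In [ ]:
import matplotlib.pyplot as plt

# Collect the silhouette score for each candidate cluster count and plot the curve
sil_scores = []
for n_cluster in cluster_sizes:
    km_tmp = KMeans(init='k-means++', max_iter=100, n_init=1, n_clusters=n_cluster)
    labels = km_tmp.fit_predict(vectors)
    sil_scores.append(silhouette_score(vectors, labels, metric='euclidean'))

plt.plot(cluster_sizes, sil_scores, marker='o')
plt.xlabel('number of clusters')
plt.ylabel('silhouette score')
plt.show()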
In [190]:
predictions = km.predict(vectors)
In [191]:
silhouette_score(vectors, predictions, metric='euclidean')
Out[191]:
In [192]:
templates['cluster'] = predictions
In [193]:
templates.head()
Out[193]:
In [194]:
sentences.shape
Out[194]:
In [195]:
sentences = sentences.merge(templates[['sent_id', 'cluster']], on='sent_id')
mentions = mentions.merge(templates[['sent_id', 'cluster']], on='sent_id')
In [196]:
sentences.head()
Out[196]:
In [197]:
mentions.head()
Out[197]:
In [198]:
pdf = pd.DataFrame(predictions, columns=['cluster'])
In [199]:
cluster_counts = pdf.groupby('cluster').size().reset_index(name='count')
cluster_counts['count'].plot(kind='bar')
Out[199]:
In [200]:
cluster_counts['frequency'] = cluster_counts['count'] / cluster_counts['count'].sum()
cluster_counts.head()
Out[200]:
In [202]:
cui_clust_freq = mentions.groupby(['cui', 'cluster']).size().reset_index(name='cluster_count')
cui_clust_freq.sort_values('cluster_count', ascending=False).head(10)
Out[202]:
In [205]:
num_clusters_per_cui = cui_clust_freq.groupby('cui').size().reset_index(name='num_clusters')
# avg_num_clusters = .agg({'num_clusters': 'mean'})
num_clusters_per_cui.sort_values('num_clusters', ascending=False).head(10)
Out[205]:
In [206]:
print("Max number of clusters that a cui appears in")
print(num_clusters_per_cui.agg({'num_clusters': 'max'}))
print('Average number of clusters that cuis appear in:')
print(num_clusters_per_cui.agg({'num_clusters': 'mean'}))
In [213]:
max_clusters = num_clusters_per_cui[num_clusters_per_cui['num_clusters'] == 23]
max_clusters
Out[213]:
In [214]:
mentions[mentions['cui'].isin(max_clusters['cui'])]['preferred_text'].unique()
Out[214]:
In [216]:
num_cuis_in_cluster_freq = cui_clust_freq[['cui', 'cluster']] \
    .groupby('cluster') \
    .size() \
    .reset_index(name="num_cuis_in_cluster")
num_cuis_in_cluster_freq.sort_values('num_cuis_in_cluster', ascending=False)
Out[216]:
In [217]:
num_cuis_in_cluster_freq.agg({'num_cuis_in_cluster': 'mean'})
Out[217]:
In [218]:
cluster_label_by_sentence_pos = pd.crosstab(templates['cluster'],
                                            templates['sentence_number']
                                            ).apply(lambda x: x / x.sum(), axis=0)
cluster_label_by_sentence_pos
Out[218]:
In [219]:
mentions[mentions['cluster'] == 1]
Out[219]:
In [223]:
umls[umls['xmi_id'].isin([17309, 11768, 11337, 4456, 15539, 16616, 10061, 13422]) ]
Out[223]:
In [228]:
sentences[sentences['sent_id'] == 'f918cc4a-2f8b-4c5e-a904-3de84efe714b']
Out[228]:
In [229]:
notes = pd.read_parquet('data/note-events.parquet', engine='fastparquet')
In [235]:
notes[notes['ROW_ID'] == 333908]['TEXT'].iloc[0][1368:1372]
Out[235]:
In [503]:
doc_ids = templates['doc_id'].unique()
notes = notes[notes['ROW_ID'].isin(doc_ids)]
notes = notes.reset_index(drop=True)
# notes = notes.drop(['CHARTDATE','CHARTTIME','STORETIME','CGID','ISERROR'],axis=1)
In [504]:
doc = notes.sample(n=1)
doc_id = doc['ROW_ID'].iloc[0]
doc_id
Out[504]:
In [505]:
ents_in_doc = mentions[mentions['doc_id'] == doc['ROW_ID'].iloc[0]]
ments_in_doc = ents_in_doc.mention_type.unique()
# print(ments_in_doc)
ents_in_doc.head()
# get mentions whose mention_type is among the document's entity types
print(len(mentions))
doc_ments = mentions[mentions.cui.isin(ents_in_doc.cui.unique())]
# print(len(doc_ments))
doc_ments.head()
Out[505]:
In [507]:
# get templates that have the corresponding sentence ids from doc_ments
template_candidates = templates[templates.sent_id.isin(doc_ments.sent_id)]
template_candidates.head()
Out[507]:
In [508]:
candidate_cluster_labels = template_candidates.cluster.sort_values().unique()
candidate_clusters = cluster_label_by_sentence_pos.iloc[candidate_cluster_labels]
In [509]:
sent_pos = 0
# remove cluster labels not present in template candidates
selected_cluster = candidate_clusters.sample(
    n=1,
    weights=candidate_clusters.loc[:, sent_pos]
).iloc[0].name
selected_cluster
# templates_in_cluster = template_candidates[template_candidates['cluster'] == selected_cluster.iloc[0].index]
Out[509]:
In [510]:
cluster_templates = template_candidates[template_candidates.cluster == selected_cluster]
cluster_templates.head()
Out[510]:
In [511]:
# templates_at_pos = cluster_templates[cluster_templates.sentence_number == sent_pos]
template = cluster_templates.sample(n=1)
template
Out[511]:
In [512]:
# sentences[sentences.sent_id == 'deef8a81-b222-4d1f-aa3f-7dfc160cb428'].iloc[0].text
In [ ]:
In [513]:
# get mentions in this template
template_id = template.iloc[0]['sent_id']
ments_in_temp = mentions[mentions.sent_id == template_id]
ments_in_temp
# Get the sentence for that template
raw_sentence = sentences[sentences.sent_id == template_id]
raw_sentence.iloc[0].text
# Select entities from entities in the document that match that entity type
#
Out[513]:
In [514]:
ments_in_temp
Out[514]:
In [515]:
# ments_in_temp.drop(ments_in_temp.loc[482].name, axis=0)
In [516]:
concepts = umls[umls.cui == ments_in_temp.iloc[0].cui]
concepts.head()
Out[516]:
In [517]:
# ents_in_doc
In [ ]:
In [518]:
# txt_counts.sample(n=1, weights=txt_counts.cnt).iloc[0].text
In [644]:
def template_filler(template, sentences, entities, all_mentions):
    # print(template.sem_template)
    num_start = len(entities)
    template_id = template.iloc[0]['sent_id']
    ments_in_temp = all_mentions[all_mentions.sent_id == template_id]
    raw_sentence = sentences[sentences.sent_id == template_id]
    # print(f'raw sent df size: {len(raw_sentence)}')
    # print(template_id)
    sent_begin = raw_sentence.iloc[0].begin
    sent_end = raw_sentence.iloc[0].end
    raw_text = raw_sentence.iloc[0].text

    # For each mention slot in the template, pick a document entity of the same
    # mention type and a surface form for its CUI
    replacements = []
    # rows_to_drop = []
    # print('Mention types in template')
    # print(ments_in_temp.mention_type.unique())
    # print('types in entities')
    # print(entities.mention_type.unique())
    for i, row in ments_in_temp.iterrows():
        ents_subset = entities[entities.mention_type == row.mention_type]
        if len(ents_subset) == 0:
            print('Empty list of doc entities')
            print(entities.mention_type)
            print(row.mention_type)
            break
        rand_ent = ents_subset.sample(n=1)
        # Remove the chosen entity so it isn't reused in later sentences
        entities = entities[entities['id'] != rand_ent.iloc[0]['id']]
        # rows_to_drop.append(rand_ent.iloc[0].name)
        ent_cui = rand_ent.iloc[0].cui
        # print(ent_cui)
        span_text = get_text_for_mention(ent_cui, all_mentions)
        replacements.append({
            'text': span_text,
            'begin': row.begin - sent_begin,
            'end': row.end - sent_begin,
        })

    # Splice the replacement spans into the raw sentence text
    new_sentence = ''
    for i, r in enumerate(replacements):
        if i == 0:
            new_sentence += raw_text[0:r['begin']]
        else:
            new_sentence += raw_text[replacements[i-1]['end']:r['begin']]
        new_sentence += r['text']
    if len(replacements) > 0:
        new_sentence += raw_text[replacements[-1]['end']:]

    # clean up
    num_end = len(entities)
    # print(f"Dropped {num_start - num_end} rows")
    return new_sentence, entities


# Find all the text spans associated with the CUI of the mention in the template
# and choose one weighted by frequency
def get_text_for_mention(cui, mentions):
    txt_counts = mentions[mentions.cui == cui].groupby('text').size().reset_index(name='cnt')
    return txt_counts.sample(n=1, weights=txt_counts.cnt).iloc[0].text
In [657]:
# Select document to write note for
# doc = notes.sample(n=1)
# doc_id = doc['ROW_ID'].iloc[0]
doc_id = 374185

# Get all the entities in the chosen document
ents_in_doc = mentions[mentions['doc_id'] == doc_id]

new_doc_sentences = []
sent_pos = 0
while len(ents_in_doc) > 0:
    # print(f"Sentence position: {sent_pos}")
    # print(f"Length of remaining entities: {len(ents_in_doc)}")

    # Get list of possible mentions based on CUIs found in the document
    mentions_pool = mentions[(mentions.cui.isin(ents_in_doc.cui.unique()))
                             & (mentions.mention_type.isin(ents_in_doc.mention_type.unique()))]

    # Get template pool based on mentions pool
    # TODO: Need to only choose templates where all the mentions are in `ents_in_doc`
    template_candidates = templates[templates.sent_id.isin(mentions_pool.sent_id)]
    # ts = len(template_candidates.sent_id.unique())
    # ms = len(mentions_pool.sent_id.unique())
    # print(ts, ms)

    # Keep only templates with at least one mention type fillable from the document
    def all_ents_present(row, doc_ents, ments_pool):
        # Get mentions in this template
        all_temp_ments = ments_pool[ments_pool['sent_id'] == row['sent_id']]
        available_mentions = all_temp_ments[all_temp_ments['mention_type'].isin(doc_ents['mention_type'])]
        return len(available_mentions) > 0

    mask = template_candidates.apply(all_ents_present,
                                     args=(ents_in_doc, mentions_pool),
                                     axis=1)
    template_candidates = template_candidates[mask]
    # print(f'num templates: {len(template_candidates)}')

    # If there are no more possible templates, then break
    if len(template_candidates) == 0:
        break

    # Get candidate clusters based on template pool
    # Remove the cluster labels that aren't present in the template bank
    candidate_cluster_labels = template_candidates.cluster.sort_values().unique()
    candidate_clusters = cluster_label_by_sentence_pos.iloc[candidate_cluster_labels]
    # print(f"Num clusters: {len(candidate_clusters)}")

    # Select cluster based on frequency at this sentence position
    selected_cluster = None
    try:
        selected_cluster = candidate_clusters.sample(
            n=1,
            weights=candidate_clusters.loc[:, sent_pos]
        ).iloc[0].name
    except Exception:
        # It's possible the clusters we chose don't appear at that position,
        # so choose one at random
        # print('choosing random cluster')
        selected_cluster = candidate_clusters.sample(n=1).iloc[0].name
    # print('selected cluster:')
    # print(selected_cluster)

    cluster_templates = template_candidates[template_candidates.cluster == selected_cluster]

    # Choose template from cluster at random
    template = cluster_templates.sample(n=1)
    template_id = template.iloc[0]['sent_id']

    # Get mentions in the template
    ments_in_temp = mentions[mentions.sent_id == template_id]

    # Write the sentence and update the entities remaining in the document
    t, ents_in_doc = template_filler(template, sentences, ents_in_doc, mentions_pool)
    new_doc_sentences.append(t)
    sent_pos += 1
In [658]:
'\n'.join(new_doc_sentences)
Out[658]:
In [659]:
notes[notes.ROW_ID == 374185].iloc[0].TEXT
Out[659]:
In [638]:
mentions.groupby('doc_id').size().reset_index(name='cnt').sort_values('cnt').head(10)
Out[638]:
In [617]:
mentions[mentions.doc_id == 476781]
Out[617]:
In [ ]: