In [151]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from fastparquet import ParquetFile
import os
from ctakes_xml import CtakesXmlParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, MiniBatchKMeans
import matplotlib

%matplotlib inline

Read in parquet files from pre-processing


In [ ]:


In [177]:
# do the reading
templates = pd.read_parquet('data/processed_dfs/templates.parquet' )

sentences = pd.read_parquet('data/processed_dfs/sentences.parquet')
mentions = pd.read_parquet('data/processed_dfs/mentions.parquet')
umls = pd.read_parquet('data/processed_dfs/umls.parquet')

In [179]:
sentences.head()


Out[179]:
begin doc_id end sent_id sentence_number xmi_id text
1 10 333701 304 2d8125eb-99e7-414e-95f1-9bfb20664d0d 1 31 Chief Complaint: 82 yo M with h/o dementia, se...
2 308 333701 378 2f9f207c-9cde-47d6-9e6d-cda0c864fae2 2 37 Airway, Inability to Protect (Risk for Aspirat...
3 382 333701 422 d95f5daf-b9e2-400b-95e3-7bbf36b4ed01 3 43 Clearance, Cough) , Pneumonia/Aspiration
5 441 333701 620 034d7746-eeed-4240-b6a4-42a7d6115937 5 55 Impaired gag, weak cough, unable to clear sec...
7 635 333701 902 37e5d910-fe36-4124-a414-304b4d98d3de 7 67 HOB >30 degrees, alb/atr treatment frequency ...

In [180]:
mentions.head()


Out[180]:
begin doc_id end id mention_type ontology_arr sent_id xmi_id text sentence_number cui umls_xmi_id preferred_text template_token
0 3680 408714 3683 9a53e800-6b2b-451d-94a6-fb355c76e7a0 ProcedureMention [15031, 15021] 000142d2-4690-4a56-8a68-89b5831ed2aa 15045 ABG 58 C0150411 15031 Analysis of arterial blood gases and pH ProcedureMention
1 1515 442499 1523 1ae0dc2f-b533-4d3a-ac89-d89bfe6fa2b9 MedicationMention [6130, 6100, 6150, 6120, 6140, 6110] 001ca15d-0e94-4933-b376-5123e22e5b13 6168 Morphine 17 C0026549 6130 Morphine MedicationMention
2 1532 442499 1542 742efc2f-1005-4455-b647-6b671bb61e0a SignSymptomMention [8825] 001ca15d-0e94-4933-b376-5123e22e5b13 8838 air hunger 17 C0231848 8825 Air hunger SignSymptomMention
3 759 354315 767 e9651709-c7ad-48cc-87b3-e2ae527a8ed8 SignSymptomMention [19885] 002d5022-70f9-4638-84b1-dcf42a6c0e12 19898 services 10 C0557854 19885 Services SignSymptomMention
4 769 354315 774 80e6aaab-9f60-4ba2-ab89-8a6c97bff1cc AnatomicalSiteMention [23649] 002d5022-70f9-4638-84b1-dcf42a6c0e12 23662 Renal 10 C0022646 23649 Kidney AnatomicalSiteMention

In [181]:
templates.head()


Out[181]:
sent_id sem_template sentence_number doc_id begin end
0 000142d2-4690-4a56-8a68-89b5831ed2aa ProcedureMention appear 58 408714 3670 3716
1 001ca15d-0e94-4933-b376-5123e22e5b13 MedicationMention SignSymptomMention 17 442499 1515 1542
2 002d5022-70f9-4638-84b1-dcf42a6c0e12 be be follow SignSymptomMention AnatomicalSite... 10 354315 718 801
3 00385844-a95d-4bb7-a773-70f78a3b035a continue titrate MedicationMention SignSymptom... 17 432020 1451 1520
4 0046177c-aa3b-4c8d-9554-3efea4802687 be give MedicationMention MedicationMention Me... 5 464828 548 618

To make templates:

1 Make an empty data frame with the fields to hold template info 2 For each sentence:

*  Get the predicates for that sentence
    *  trim the frameset after the '.'
*  Get the mentions
    * Get mention type
    * Append umls cui to end of mention (just take the first one)
* Order the predicates and mentions by begin offset
* Combine into a string separated by spaces
* Write the template and semantic template to the dataframe

In [186]:
print(len(templates))
# templates = templates.drop_duplicates('sem_template')
# print(len(templates))


2774

In [188]:
def get_vectors(df):
    tf = TfidfVectorizer()
    return tf.fit_transform(df['sem_template'])
    
# Only use unique templates
vectors = get_vectors(templates)

vecd = vectors.todense()
print(vectors.shape)


(2774, 374)

In [189]:
cluster_sizes = [70, 80, 90, 100, 110, 120, 125, 130, 140, 150, 200]

for n_cluster in cluster_sizes:
    
    km = KMeans( init='k-means++', max_iter=100, n_init=1,
                 n_clusters=n_cluster, verbose=False)

    km.fit(vectors)
    predictions = km.predict(vectors)
    sil_score = silhouette_score(vectors, predictions, metric='euclidean')
    print(f"Silhouette score for n_clusters={n_cluster}:")
    print(sil_score)
    
km = KMeans( init='k-means++', max_iter=100, n_init=1,
                 n_clusters=120, verbose=False)

km.fit(vectors)
predictions = km.predict(vectors)
sil_score = silhouette_score(vectors, predictions, metric='euclidean')
# print(km.cluster_centers_.shape)

# order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# terms = tf.get_feature_names()
# for i in range(50):
#     print("Cluster %d:" % i, end='')
#     for ind in order_centroids[i, :15]:
#         print(' %s' % terms[ind], end='')
#     print()


Silhouette score for n_clusters=70:
0.3589132719557676
Silhouette score for n_clusters=80:
0.3831557012648031
Silhouette score for n_clusters=90:
0.37868213831132375
Silhouette score for n_clusters=100:
0.3993674287322699
Silhouette score for n_clusters=110:
0.4012596541630207
Silhouette score for n_clusters=120:
0.4172058130840672
Silhouette score for n_clusters=125:
0.40448816578497976
Silhouette score for n_clusters=130:
0.4114589927686263
Silhouette score for n_clusters=140:
0.41636762685641776
Silhouette score for n_clusters=150:
0.4308854561074002
Silhouette score for n_clusters=200:
0.4581207686510411

In [190]:
predictions = km.predict(vectors)

In [191]:
silhouette_score(vectors, predictions, metric='euclidean')


Out[191]:
0.40552092055144384

In [192]:
templates['cluster'] = predictions

In [193]:
templates.head()


Out[193]:
sent_id sem_template sentence_number doc_id begin end cluster
0 000142d2-4690-4a56-8a68-89b5831ed2aa ProcedureMention appear 58 408714 3670 3716 18
1 001ca15d-0e94-4933-b376-5123e22e5b13 MedicationMention SignSymptomMention 17 442499 1515 1542 42
2 002d5022-70f9-4638-84b1-dcf42a6c0e12 be be follow SignSymptomMention AnatomicalSite... 10 354315 718 801 47
3 00385844-a95d-4bb7-a773-70f78a3b035a continue titrate MedicationMention SignSymptom... 17 432020 1451 1520 32
4 0046177c-aa3b-4c8d-9554-3efea4802687 be give MedicationMention MedicationMention Me... 5 464828 548 618 12

In [194]:
sentences.shape


Out[194]:
(2774, 7)

Add cluster labels to sentences and mentions (entities)


In [195]:
sentences = sentences.merge(templates[['sent_id', 'cluster']], on='sent_id')
mentions = mentions.merge(templates[['sent_id', 'cluster']], on='sent_id')

In [196]:
sentences.head()


Out[196]:
begin doc_id end sent_id sentence_number xmi_id text cluster
0 10 333701 304 2d8125eb-99e7-414e-95f1-9bfb20664d0d 1 31 Chief Complaint: 82 yo M with h/o dementia, se... 71
1 308 333701 378 2f9f207c-9cde-47d6-9e6d-cda0c864fae2 2 37 Airway, Inability to Protect (Risk for Aspirat... 82
2 382 333701 422 d95f5daf-b9e2-400b-95e3-7bbf36b4ed01 3 43 Clearance, Cough) , Pneumonia/Aspiration 71
3 441 333701 620 034d7746-eeed-4240-b6a4-42a7d6115937 5 55 Impaired gag, weak cough, unable to clear sec... 42
4 635 333701 902 37e5d910-fe36-4124-a414-304b4d98d3de 7 67 HOB >30 degrees, alb/atr treatment frequency ... 16

In [197]:
mentions.head()


Out[197]:
begin doc_id end id mention_type ontology_arr sent_id xmi_id text sentence_number cui umls_xmi_id preferred_text template_token cluster
0 3680 408714 3683 9a53e800-6b2b-451d-94a6-fb355c76e7a0 ProcedureMention [15031, 15021] 000142d2-4690-4a56-8a68-89b5831ed2aa 15045 ABG 58 C0150411 15031 Analysis of arterial blood gases and pH ProcedureMention 18
1 1515 442499 1523 1ae0dc2f-b533-4d3a-ac89-d89bfe6fa2b9 MedicationMention [6130, 6100, 6150, 6120, 6140, 6110] 001ca15d-0e94-4933-b376-5123e22e5b13 6168 Morphine 17 C0026549 6130 Morphine MedicationMention 42
2 1532 442499 1542 742efc2f-1005-4455-b647-6b671bb61e0a SignSymptomMention [8825] 001ca15d-0e94-4933-b376-5123e22e5b13 8838 air hunger 17 C0231848 8825 Air hunger SignSymptomMention 42
3 759 354315 767 e9651709-c7ad-48cc-87b3-e2ae527a8ed8 SignSymptomMention [19885] 002d5022-70f9-4638-84b1-dcf42a6c0e12 19898 services 10 C0557854 19885 Services SignSymptomMention 47
4 769 354315 774 80e6aaab-9f60-4ba2-ab89-8a6c97bff1cc AnatomicalSiteMention [23649] 002d5022-70f9-4638-84b1-dcf42a6c0e12 23662 Renal 10 C0022646 23649 Kidney AnatomicalSiteMention 47

Get the size of each cluster


In [198]:
pdf = pd.DataFrame(predictions, columns=['cluster'])

In [199]:
cluster_counts = pdf.groupby('cluster').size().reset_index(name='count')
cluster_counts['count'].plot(kind='bar')


Out[199]:
<matplotlib.axes._subplots.AxesSubplot at 0x7faee398a320>

In [200]:
cluster_counts['frequency'] = cluster_counts['count'] / cluster_counts['count'].sum()
cluster_counts.head()


Out[200]:
cluster count frequency
0 0 14 0.005047
1 1 416 0.149964
2 2 198 0.071377
3 3 5 0.001802
4 4 2 0.000721
5 5 97 0.034968
6 6 26 0.009373
7 7 13 0.004686
8 8 95 0.034247
9 9 32 0.011536
10 10 1 0.000360
11 11 9 0.003244
12 12 34 0.012257
13 13 28 0.010094
14 14 68 0.024513
15 15 16 0.005768
16 16 14 0.005047
17 17 36 0.012978
18 18 14 0.005047
19 19 22 0.007931
20 20 19 0.006849
21 21 7 0.002523
22 22 23 0.008291
23 23 17 0.006128
24 24 47 0.016943
25 25 13 0.004686
26 26 27 0.009733
27 27 20 0.007210
28 28 35 0.012617
29 29 25 0.009012
... ... ... ...
90 90 6 0.002163
91 91 8 0.002884
92 92 18 0.006489
93 93 94 0.033886
94 94 32 0.011536
95 95 5 0.001802
96 96 17 0.006128
97 97 5 0.001802
98 98 6 0.002163
99 99 6 0.002163
100 100 20 0.007210
101 101 5 0.001802
102 102 5 0.001802
103 103 28 0.010094
104 104 7 0.002523
105 105 8 0.002884
106 106 8 0.002884
107 107 15 0.005407
108 108 3 0.001081
109 109 7 0.002523
110 110 13 0.004686
111 111 12 0.004326
112 112 2 0.000721
113 113 10 0.003605
114 114 12 0.004326
115 115 9 0.003244
116 116 18 0.006489
117 117 3 0.001081
118 118 2 0.000721
119 119 9 0.003244

120 rows × 3 columns

Get the distribution of CUIs in each cluster

How many clusters on average does a CUI appear in


In [202]:
cui_clust_freq = mentions.groupby(['cui', 'cluster']).size().reset_index(name='cluster_count')
cui_clust_freq.sort_values('cluster_count', ascending=False).head(10)


Out[202]:
cui cluster cluster_count
2095 C0270724 1 163
3004 C1145670 1 40
1319 C0035222 1 34
632 C0015967 2 31
3332 C4048181 2 29
910 C0022660 1 28
914 C0022661 1 26
854 C0020649 2 21
1021 C0024467 12 20
2134 C0278060 27 20

In [205]:
num_clusters_per_cui = cui_clust_freq.groupby('cui').size().reset_index(name='num_clusters')
# avg_num_clusters = .agg({'num_clusters': 'mean'})
num_clusters_per_cui.sort_values('num_clusters', ascending=False).head(10)


Out[205]:
cui num_clusters
323 C0030193 32
683 C0310367 28
843 C0587081 25
112 C0010200 24
902 C0751781 23
221 C0019134 23
279 C0024467 23
412 C0039985 22
179 C0015967 20
859 C0699142 19

Max and average number of clusters that CUIs appear in


In [206]:
print("Max number of clusters that a cui appears in")
print(num_clusters_per_cui.agg({'num_clusters': 'max'}))

print('Average number of clusters that cuis appear in:')
print(num_clusters_per_cui.agg({'num_clusters': 'mean'}))


Max number of clusters that a cui appears in
num_clusters    32
dtype: int64
Average number of clusters that cuis appear in:
num_clusters    3.199616
dtype: float64

In [213]:
max_clusters = num_clusters_per_cui[num_clusters_per_cui['num_clusters'] == 23]
max_clusters


Out[213]:
cui num_clusters
221 C0019134 23
279 C0024467 23
902 C0751781 23

The preferred text of cuis that occur in the most number of clusters


In [214]:
mentions[mentions['cui'].isin(max_clusters['cui'])]['preferred_text'].unique()


Out[214]:
array(['Magnesium', 'Dentatorubral-Pallidoluysian Atrophy', 'Heparin'],
      dtype=object)

Average number of unique CUIs in a cluster


In [216]:
num_cuis_in_cluster_freq = cui_clust_freq[['cui', 'cluster']] \
                                    .groupby('cluster') \
                                    .size() \
                                    .reset_index(name="num_cuis_in_cluster")
num_cuis_in_cluster_freq.sort_values('num_cuis_in_cluster', ascending=False)


Out[216]:
cluster num_cuis_in_cluster
1 1 179
2 2 117
63 63 111
93 93 103
36 36 94
58 58 87
8 8 85
39 39 78
6 6 78
14 14 71
44 44 65
5 5 65
26 26 63
71 71 63
89 89 59
24 24 57
43 43 54
12 12 54
32 32 53
42 42 49
22 22 46
38 38 44
28 28 43
100 100 43
19 19 43
17 17 41
47 47 40
30 30 38
103 103 38
16 16 36
... ... ...
68 68 8
97 97 8
105 105 7
101 101 7
69 69 7
46 46 7
50 50 7
67 67 7
90 90 7
102 102 6
82 82 6
21 21 6
99 99 6
108 108 5
109 109 5
70 70 4
98 98 4
80 80 4
59 59 4
112 112 3
74 74 3
57 57 3
4 4 3
84 84 3
27 27 2
48 48 2
117 117 2
10 10 1
118 118 1
52 52 1

120 rows × 2 columns


In [217]:
num_cuis_in_cluster_freq.agg({'num_cuis_in_cluster': 'mean'})


Out[217]:
num_cuis_in_cluster    27.783333
dtype: float64

Get the cluster label frequency by sentence position


In [218]:
cluster_label_by_sentence_pos = pd.crosstab(templates['cluster']
                                            ,templates['sentence_number']
                                           ).apply(lambda x: x / x.sum(), axis=0)
cluster_label_by_sentence_pos


Out[218]:
sentence_number 0 1 2 3 4 5 6 7 8 9 ... 73 74 75 76 77 78 79 80 81 82
cluster
0 0.000000 0.000000 0.000000 0.0250 0.028571 0.042857 0.032258 0.030769 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
1 0.253731 0.092105 0.025974 0.0125 0.057143 0.042857 0.016129 0.046154 0.071429 0.092593 ... 0.2 0.333333 0.0 0.333333 0.000000 0.333333 0.0 0.0 1.0 0.0
2 0.074627 0.013158 0.012987 0.0625 0.014286 0.042857 0.016129 0.046154 0.114286 0.055556 ... 0.2 0.000000 0.0 0.000000 0.666667 0.000000 0.0 0.0 0.0 0.0
3 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.018519 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
4 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
5 0.014925 0.013158 0.090909 0.0625 0.000000 0.000000 0.016129 0.061538 0.028571 0.092593 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.5 0.0 0.0 0.0
6 0.000000 0.000000 0.000000 0.0000 0.014286 0.014286 0.032258 0.000000 0.014286 0.037037 ... 0.0 0.000000 0.5 0.000000 0.000000 0.000000 0.0 1.0 0.0 0.0
7 0.000000 0.000000 0.000000 0.0000 0.028571 0.000000 0.000000 0.000000 0.014286 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
8 0.014925 0.000000 0.012987 0.0250 0.028571 0.028571 0.000000 0.015385 0.042857 0.074074 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
9 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.014286 0.000000 ... 0.0 0.000000 0.0 0.333333 0.000000 0.000000 0.0 0.0 0.0 1.0
10 0.000000 0.000000 0.000000 0.0000 0.014286 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
11 0.000000 0.000000 0.012987 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
12 0.000000 0.026316 0.025974 0.0125 0.014286 0.014286 0.064516 0.000000 0.014286 0.018519 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
13 0.000000 0.000000 0.012987 0.0125 0.000000 0.000000 0.000000 0.000000 0.014286 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.5 0.0 0.0 0.0
14 0.029851 0.013158 0.012987 0.0000 0.000000 0.014286 0.016129 0.015385 0.014286 0.037037 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
15 0.000000 0.000000 0.000000 0.0000 0.014286 0.014286 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
16 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.061538 0.014286 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
17 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.046154 0.057143 0.000000 ... 0.0 0.000000 0.5 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
18 0.000000 0.000000 0.012987 0.0125 0.000000 0.000000 0.016129 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
19 0.000000 0.065789 0.051948 0.0000 0.000000 0.028571 0.000000 0.000000 0.014286 0.018519 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
20 0.014925 0.039474 0.051948 0.0125 0.028571 0.028571 0.000000 0.030769 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
21 0.000000 0.000000 0.000000 0.0125 0.000000 0.000000 0.032258 0.000000 0.000000 0.018519 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
22 0.059701 0.065789 0.012987 0.0000 0.028571 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
23 0.000000 0.000000 0.000000 0.0250 0.000000 0.000000 0.000000 0.000000 0.000000 0.018519 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
24 0.000000 0.013158 0.000000 0.0250 0.042857 0.071429 0.032258 0.076923 0.014286 0.018519 ... 0.2 0.333333 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
25 0.000000 0.000000 0.000000 0.0000 0.014286 0.000000 0.000000 0.000000 0.014286 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
26 0.059701 0.000000 0.025974 0.0250 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
27 0.000000 0.026316 0.012987 0.0125 0.000000 0.000000 0.000000 0.000000 0.014286 0.055556 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
28 0.000000 0.000000 0.025974 0.0000 0.014286 0.000000 0.032258 0.000000 0.014286 0.018519 ... 0.0 0.333333 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
29 0.000000 0.000000 0.012987 0.0000 0.000000 0.042857 0.048387 0.000000 0.014286 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
90 0.000000 0.000000 0.000000 0.0000 0.014286 0.014286 0.032258 0.000000 0.000000 0.037037 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
91 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.016129 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
92 0.000000 0.000000 0.000000 0.0375 0.000000 0.000000 0.000000 0.015385 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
93 0.000000 0.052632 0.038961 0.0500 0.028571 0.071429 0.016129 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
94 0.000000 0.026316 0.025974 0.0125 0.042857 0.000000 0.016129 0.000000 0.014286 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.333333 0.0 0.0 0.0 0.0
95 0.000000 0.000000 0.000000 0.0000 0.000000 0.014286 0.000000 0.015385 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.333333 0.000000 0.0 0.0 0.0 0.0
96 0.000000 0.000000 0.025974 0.0000 0.028571 0.000000 0.000000 0.000000 0.000000 0.037037 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
97 0.014925 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.028571 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
98 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
99 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
100 0.000000 0.013158 0.000000 0.0000 0.014286 0.014286 0.000000 0.000000 0.000000 0.000000 ... 0.2 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
101 0.000000 0.013158 0.000000 0.0000 0.000000 0.014286 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
102 0.000000 0.000000 0.012987 0.0250 0.000000 0.014286 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
103 0.000000 0.013158 0.012987 0.0125 0.028571 0.000000 0.000000 0.015385 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
104 0.000000 0.000000 0.000000 0.0000 0.028571 0.000000 0.016129 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
105 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.014286 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
106 0.000000 0.000000 0.012987 0.0000 0.028571 0.028571 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
107 0.000000 0.000000 0.012987 0.0250 0.000000 0.000000 0.000000 0.015385 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
108 0.000000 0.026316 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
109 0.000000 0.000000 0.000000 0.0125 0.000000 0.000000 0.000000 0.015385 0.000000 0.018519 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
110 0.059701 0.026316 0.025974 0.0000 0.014286 0.028571 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
111 0.014925 0.000000 0.000000 0.0000 0.000000 0.028571 0.016129 0.000000 0.071429 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
112 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
113 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
114 0.000000 0.000000 0.000000 0.0000 0.014286 0.000000 0.000000 0.015385 0.028571 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
115 0.000000 0.000000 0.012987 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
116 0.000000 0.000000 0.000000 0.0000 0.000000 0.028571 0.000000 0.061538 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
117 0.000000 0.000000 0.038961 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
118 0.029851 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
119 0.000000 0.000000 0.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0

120 rows × 83 columns

Get the number of documents in each cluster


In [219]:
mentions[mentions['cluster'] == 1]


Out[219]:
begin doc_id end id mention_type ontology_arr sent_id xmi_id text sentence_number cui umls_xmi_id preferred_text template_token cluster
14 1527 443072 1531 e50bfffb-721e-4dcc-90d8-dca1d6e45ea5 DiseaseDisorderMention [6376] 009a71c7-cdf5-407d-9595-970d338038ed 6389 Plan 27 C0270724 6376 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
16 653 441342 656 f8fbe689-662d-4b40-94bc-ee9300ceb500 DiseaseDisorderMention [8954] 00ee58f6-cac3-495d-9f91-70fb8e79764f 8967 HTN 9 C0020538 8954 Hypertensive disease DiseaseDisorderMention 1
17 658 441342 661 365196de-2e9a-4253-b168-29f73e055d21 DiseaseDisorderMention [9339] 00ee58f6-cac3-495d-9f91-70fb8e79764f 9352 ARF 9 C0022660 9339 Kidney Failure, Acute DiseaseDisorderMention 1
18 674 441342 692 b9bb5e80-b1ba-428d-b388-a56daff051df SignSymptomMention [10533] 00ee58f6-cac3-495d-9f91-70fb8e79764f 10546 infectious process 9 C0006277 10533 Bronchitis SignSymptomMention 1
19 694 441342 714 1d906a9d-5cc2-4c34-aa8b-4e0fd18de01b DiseaseDisorderMention [9207] 00ee58f6-cac3-495d-9f91-70fb8e79764f 9220 gastric peptic ulcer 9 C0577559 9207 Mass of body structure DiseaseDisorderMention 1
20 715 441342 722 3f0026fc-8d41-4519-ab8b-d447c7b70495 DiseaseDisorderMention [9251] 00ee58f6-cac3-495d-9f91-70fb8e79764f 9264 disease 9 C0012634 9251 Disease DiseaseDisorderMention 1
27 2361 458728 2365 9181f72f-ae82-41aa-8e93-b6a1175e1c75 DiseaseDisorderMention [17309] 01a1e37e-acab-4245-811a-cbdf6d196c3d 17322 Plan 33 C0270724 17309 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
50 2254 378647 2258 ef0cad34-8f69-47ad-8838-1e48d5c17dfd DiseaseDisorderMention [11768] 02d1ff23-b8a8-4a61-8528-68c9b9c81683 11781 Plan 31 C0270724 11768 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
52 868 458691 887 655dda8e-9902-49ba-8d8a-188a001a5688 DiseaseDisorderMention [12913] 02e10911-c5e5-47e5-8de1-c940acb642f6 12926 Respiratory failure 8 C1145670 12913 Respiratory Failure DiseaseDisorderMention 1
53 900 458691 904 14d04147-09d9-4954-854f-978e645fc3d0 DiseaseDisorderMention [12264] 02e10911-c5e5-47e5-8de1-c940acb642f6 12277 ARDS 8 C0035222 12264 Respiratory Distress Syndrome, Adult DiseaseDisorderMention 1
54 297 350891 300 621ac8c8-7fb4-4b37-8d2e-a2c055b5c25d DiseaseDisorderMention [7175] 02e3ac64-04bc-43f4-b44c-2dfd20a03747 7188 Pan 5 C0031036 7175 Polyarteritis Nodosa DiseaseDisorderMention 1
62 2527 355788 2531 2c39699f-54c4-4918-ae2e-22eee2044bfa DiseaseDisorderMention [11337] 03121359-ada0-4d3b-a85d-8766b55c94ad 11350 Plan 44 C0428886 11337 Mean blood pressure DiseaseDisorderMention 1
75 3376 408714 3383 e4ef6ed4-30b7-482a-83de-d5d7f14b1bb4 DiseaseDisorderMention [12858] 038dbac0-69ca-4751-961e-15949aae92c7 12871 Anxiety 50 C0003467 12858 Anxiety DiseaseDisorderMention 1
96 623 438154 627 feb692dd-17dc-4364-9c9d-2f545393ef67 DiseaseDisorderMention [4456] 04c20bf7-b6f5-4c50-98b9-6ad2d436a46a 4469 Plan 18 C0270724 4456 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
124 3643 442309 3647 d7ec3deb-0387-4d1b-9131-9f7ab6760698 DiseaseDisorderMention [15539] 05f13f07-32ca-4334-ba79-3760071805bb 15552 Plan 59 C0270724 15539 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
133 2471 458728 2478 3a15f527-56ed-4ddb-ac85-8f89b599806b DiseaseDisorderMention [17452] 064bfeac-f01d-4152-a919-fbed9b73cb86 17465 Anxiety 37 C0003467 17452 Anxiety DiseaseDisorderMention 1
138 4089 355793 4093 6a3dcad8-389a-4ab4-9648-27c2c49f65ef DiseaseDisorderMention [16616] 06fe4ce6-f145-489e-881b-652b870c7a54 16629 Plan 76 C0270724 16616 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
168 1780 350384 1784 9192a6a8-0bef-4be4-97bd-19217cf82f68 DiseaseDisorderMention [10061] 07b64045-c4ee-4970-8e29-d88f1f191bea 10074 Plan 41 C0032952 10061 Prednisone DiseaseDisorderMention 1
169 83 355538 91 e13708a1-d200-4677-9b0d-e37352ce818f MedicationMention [9382, 9372] 07c07c34-a842-4ee5-8f4d-d8baac197f14 9396 Antibody 2 C0003241 9382 Antibodies MedicationMention 1
170 92 355538 100 70ddb4e8-fee7-46b3-8919-5e7a0caf8fe9 DiseaseDisorderMention [10961] 07c07c34-a842-4ee5-8f4d-d8baac197f14 10974 Syndrome 2 C0039082 10961 Syndrome DiseaseDisorderMention 1
171 113 355538 146 6c6ae979-4d5d-43fd-bca2-4c9d85ac900d DiseaseDisorderMention [9982] 07c07c34-a842-4ee5-8f4d-d8baac197f14 9995 Microangiopathic Hemolytic Anemia 2 C0221021 9982 Microangiopathic hemolytic anemia DiseaseDisorderMention 1
172 147 355538 150 b06b2f75-4f8b-4ecc-85bb-a239d4e20b4b DiseaseDisorderMention [9938] 07c07c34-a842-4ee5-8f4d-d8baac197f14 9951 TTP 2 C0034155 9938 Purpura, Thrombotic Thrombocytopenic DiseaseDisorderMention 1
173 169 355538 173 23e299ce-06ad-4f99-902e-ba9ae24d5888 DiseaseDisorderMention [10785, 10795, 10805] 07c07c34-a842-4ee5-8f4d-d8baac197f14 10820 ESRD 2 C0022661 10785 Kidney Failure, Chronic DiseaseDisorderMention 1
174 178 355538 204 27da854e-af62-47ca-b1e3-f17cf9304850 ProcedureMention [12171] 07c07c34-a842-4ee5-8f4d-d8baac197f14 12184 cadaveric renal transplant 2 C0401176 12171 Cadaveric renal transplant ProcedureMention 1
179 1010 339201 1028 16eb969e-519d-439f-92f9-804fa6f27756 DiseaseDisorderMention [13510] 07de3131-edbd-4310-8184-e42cdc646218 13523 Alcohol withdrawal 29 C0236663 13510 Alcohol withdrawal syndrome DiseaseDisorderMention 1
180 1040 339201 1056 e1b7e133-46d4-47be-887c-667f9214c9c3 DiseaseDisorderMention [13081] 07de3131-edbd-4310-8184-e42cdc646218 13094 delirium tremens 29 C0023901 13081 Liver Function Tests DiseaseDisorderMention 1
181 1058 339201 1061 0fe2b584-e6b5-4096-8c92-396773d1c79d DiseaseDisorderMention [13906] 07de3131-edbd-4310-8184-e42cdc646218 13919 DTs 29 C0001957 13906 Alcohol Withdrawal Delirium DiseaseDisorderMention 1
182 1063 339201 1071 172f9ff8-121e-435a-b461-31252e5a2af4 SignSymptomMention [15512] 07de3131-edbd-4310-8184-e42cdc646218 15525 seizures 29 C0036572 15512 Seizures SignSymptomMention 1
198 760 438166 764 8f72a361-5636-420c-8812-1254765262a7 DiseaseDisorderMention [5019] 086b67da-3425-4475-8433-e3e842c2a61f 5032 Plan 21 C0000726 5019 Abdomen DiseaseDisorderMention 1
216 3535 464827 3544 c108c585-e0fd-43fd-bb9b-46f54c5c7632 DiseaseDisorderMention [14483, 14473, 14463] 0926a6ac-3fee-4484-8afc-2fd8130216a0 14498 infection 58 C1260298 14483 tigecycline DiseaseDisorderMention 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5805 1373 333703 1377 771bbaa7-eb06-4ba1-97af-d8640d04292d DiseaseDisorderMention [8838] f267ebd3-bb58-4d17-a1c2-3667e23bdd52 8851 Plan 19 C0270724 8838 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
5827 2779 354151 2798 a9a532af-adc4-417e-a87e-06b1af4e2786 DiseaseDisorderMention [15079] f335e36b-99aa-4088-a9d0-baef831c1a45 15092 Respiratory failure 49 C1145670 15079 Respiratory Failure DiseaseDisorderMention 1
5828 2811 354151 2815 0ae7c32f-9a83-46f7-8cb4-010b7078c4c0 DiseaseDisorderMention [15739] f335e36b-99aa-4088-a9d0-baef831c1a45 15752 ARDS 49 C0035222 15739 Respiratory Distress Syndrome, Adult DiseaseDisorderMention 1
5844 1284 443072 1303 48c40579-4e8a-403b-8a09-7ff03a0d823a DiseaseDisorderMention [6112] f3c06428-9576-454f-b460-b7b2983104f0 6125 Respiratory failure 21 C1145670 6112 Respiratory Failure DiseaseDisorderMention 1
5845 1316 443072 1320 f1023fa7-04f9-4c64-b494-6c5cb08cc9c0 DiseaseDisorderMention [6156] f3c06428-9576-454f-b460-b7b2983104f0 6169 ARDS 21 C0035222 6156 Respiratory Distress Syndrome, Adult DiseaseDisorderMention 1
5851 1544 422294 1548 24477922-4973-43e0-8aa8-61b92495096d DiseaseDisorderMention [7495] f400dbe2-d547-4182-bf2b-306196e45120 7508 Plan 36 C0270724 7495 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
5861 24 354315 27 0c2cf874-0596-4c89-b698-c377248a46ec SignSymptomMention [20186] f4ba23d4-50d3-48cb-bfdc-8fc384a0824d 20199 PMH 0 C0262926 20186 Medical History SignSymptomMention 1
5862 44 354315 48 00cf6719-a0a6-4d55-ba1f-88558eb136b1 DiseaseDisorderMention [19315, 19325, 19335] f4ba23d4-50d3-48cb-bfdc-8fc384a0824d 19350 ESRD 0 C0022661 19315 Kidney Failure, Chronic DiseaseDisorderMention 1
5863 59 354315 63 ccec44db-7213-446d-a754-17aae4f4317b DiseaseDisorderMention [18633] f4ba23d4-50d3-48cb-bfdc-8fc384a0824d 18646 COPD 0 C0024117 18633 Chronic Obstructive Airway Disease DiseaseDisorderMention 1
5864 65 354315 91 d310ef02-4a80-44ed-9320-a6a77cae4c6a DiseaseDisorderMention [19590] f4ba23d4-50d3-48cb-bfdc-8fc384a0824d 19603 ischemic\n cardiomyopathy 0 C0349782 19590 Ischemic cardiomyopathy DiseaseDisorderMention 1
5865 93 354315 97 f42dd455-99dd-489e-a414-cee592a00a12 DiseaseDisorderMention [18039] f4ba23d4-50d3-48cb-bfdc-8fc384a0824d 18052 MRSA 0 C0343401 18039 MRSA - Methicillin resistant Staphylococcus au... DiseaseDisorderMention 1
5866 110 354315 113 18bd8a6c-edfd-4c7b-9cb0-76a9c15fedc7 DiseaseDisorderMention [18908] f4ba23d4-50d3-48cb-bfdc-8fc384a0824d 18921 OSA 0 C0520679 18908 Sleep Apnea, Obstructive DiseaseDisorderMention 1
5867 142 354315 145 8a465df4-b213-4269-b802-6b14b5d744f7 DiseaseDisorderMention [19480] f4ba23d4-50d3-48cb-bfdc-8fc384a0824d 19493 CHF 0 C0018802 19480 Congestive heart failure DiseaseDisorderMention 1
5881 2333 361934 2337 5ef12247-fde5-4b51-8486-fde99354372b DiseaseDisorderMention [9451] f4ef4b9b-a581-44e9-8c14-549186a3ef50 9464 Plan 39 C0270724 9451 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
5951 1368 333908 1372 be108ccc-ddb4-47c0-b273-10af4f99fe33 DiseaseDisorderMention [9630] f918cc4a-2f8b-4c5e-a904-3de84efe714b 9643 Plan 18 C0270724 9630 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
5954 1548 339201 1552 4e60214f-2712-48e2-9cc1-09b621dd8c8b DiseaseDisorderMention [13422] f9483446-c5b4-47bf-82e1-42cafc8c319c 13435 Plan 39 C0270724 13422 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
5958 1941 497748 1945 28a92539-426d-41e3-b9b0-244604f427b0 DiseaseDisorderMention [7178] f98eff4f-c4bc-40cf-aa4d-e60607336a58 7191 plan 36 C0017973 7178 Glycosaminoglycans DiseaseDisorderMention 1
5963 3114 438389 3118 96331d13-cf7b-4825-adf4-5b698a502f59 DiseaseDisorderMention [15326] f9ce978c-9bc0-4cb7-b6c8-1f1d83912729 15339 Plan 46 C0270724 15326 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
5967 410 341364 414 d3e6e998-fc45-4d83-ab5a-6da7ec707f02 DiseaseDisorderMention [2309] fa0ad4ac-d860-4241-bafc-28f5fff766ff 2322 Plan 9 C0270724 2309 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
6011 2533 336621 2537 f1e5b1d2-c242-4710-a23f-d149dc20ea1e DiseaseDisorderMention [11684] fb8ce413-3499-495c-bda3-15537d919296 11697 Plan 44 C0270724 11684 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
6021 1634 354151 1658 96af2f8e-b3d9-43f4-a5f4-fdcea44bd8ab DiseaseDisorderMention [15365, 15375] fc48db0a-ff04-457a-adaa-53b608b51540 15389 Renal failure, End stage 25 C0022661 15365 Kidney Failure, Chronic DiseaseDisorderMention 1
6022 1649 354151 1652 a483ed81-247d-4ed3-aecc-f3691648dae4 MedicationMention [12276, 12266] fc48db0a-ff04-457a-adaa-53b608b51540 12290 End 25 C0082420 12276 Endoglin, human MedicationMention 1
6023 1660 354151 1683 522e1559-f0b7-4cf5-9f15-8073777f51d0 DiseaseDisorderMention [15496, 15486] fc48db0a-ff04-457a-adaa-53b608b51540 15510 End stage renal disease 25 C0022661 15496 Kidney Failure, Chronic DiseaseDisorderMention 1
6024 1685 354151 1689 e6b3e836-9a73-478e-b816-5b34a7ed9598 DiseaseDisorderMention [15177, 15167, 15187] fc48db0a-ff04-457a-adaa-53b608b51540 15202 ESRD 25 C0022661 15177 Kidney Failure, Chronic DiseaseDisorderMention 1
6027 3750 337300 3754 65c279ea-f692-4157-b614-d3b8c2f67112 DiseaseDisorderMention [18532] fc851549-121a-4c61-a625-4bad13c066dc 18545 Plan 65 C0270724 18532 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
6034 626 463241 630 3975816b-3ed0-4809-a469-437e4fe95e94 DiseaseDisorderMention [2788] fd105c98-7556-458e-be31-27b47d7f1024 2801 Plan 7 C0270724 2788 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
6037 846 336937 850 35f625d6-5722-49d1-9dca-a9c744754928 DiseaseDisorderMention [6893] fd6d0327-2549-4332-a24d-252e5338d621 6906 Plan 13 C0270724 6893 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
6067 1808 366321 1812 97c9e6ed-08b7-4644-a3d2-a5e98523b735 DiseaseDisorderMention [11048] fe6e1c34-fda0-4c51-aa69-13d7d82f05b8 11061 Plan 31 C2830004 11048 Somnolence DiseaseDisorderMention 1
6088 3317 464827 3333 ae72b021-8e64-46b6-a879-e811b5d1ff2f DiseaseDisorderMention [13153, 13143] ffa19818-d69b-4b37-b7fa-730985657e09 13167 Wound dehiscence 53 C0259768 13153 Wound dehiscence DiseaseDisorderMention 1
6098 1660 355538 1664 168e3d33-161a-4eca-b23a-6dae71b0a092 DiseaseDisorderMention [10565] ffe8b11f-9e92-496b-a1f1-e70f225bd171 10578 Plan 31 C0270724 10565 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1

642 rows × 15 columns


In [223]:
umls[umls['xmi_id'].isin([17309, 11768, 11337, 4456, 15539, 16616, 10061, 13422]) ]


Out[223]:
code coding_scheme cui disambiguated doc_id id preferred_text tui xmi_id
148 52713000 SNOMEDCT_US C0270724 False 458728 4d9e058d-26bb-4c4a-bc40-a78068b10eae Infantile Neuroaxonal Dystrophy T047 17309
67 52713000 SNOMEDCT_US C0270724 False 355788 7c97f733-5d46-4ea1-b21d-9852b9adf99d Infantile Neuroaxonal Dystrophy T047 11337
205 52713000 SNOMEDCT_US C0270724 False 339201 6a9fd3cf-0848-4570-814e-2b14282b0fbd Infantile Neuroaxonal Dystrophy T047 13422
152 439569004 SNOMEDCT_US C0035273 False 438365 12c464c8-e829-491d-95e0-ab1e8923c56b Resuscitation procedure T061 17309
77 52713000 SNOMEDCT_US C0270724 False 350384 f8e98cf6-0afe-46a7-af7d-1cbfcddf852e Infantile Neuroaxonal Dystrophy T047 10061
59 312250003 SNOMEDCT_US C0024485 False 338922 c6ec27c9-7897-45a6-a5cf-899909b0144e Magnetic Resonance Imaging T060 4456
96 52713000 SNOMEDCT_US C0270724 False 442309 007e67f7-69c8-428d-b8ef-d3589e06f2d6 Infantile Neuroaxonal Dystrophy T047 15539
22 10312003 SNOMEDCT_US C0032952 False 424894 48223bcb-e16c-46f0-bfdd-1c502c9d388c Prednisone T121 10061
99 52713000 SNOMEDCT_US C0270724 False 355793 b4b962f9-ea4d-4ae3-95f3-d9447e743dc6 Infantile Neuroaxonal Dystrophy T047 16616
152 439569004 SNOMEDCT_US C0035273 False 438389 ca5721fd-e5d9-4a5b-ace4-fb68acb89fcc Resuscitation procedure T061 17309
97 6797001 SNOMEDCT_US C0428886 False 361934 cfbf85e6-c90f-4328-b205-3ffa92cf9494 Mean blood pressure T033 11337
71 271808008 SNOMEDCT_US C1850534 False 443260 6f912b79-3395-4a2a-b1c8-05442bc6d976 Edema, generalized T033 10061
162 52713000 SNOMEDCT_US C0270724 False 378647 528115c7-6d1f-47d9-a8d4-61d9584c3bf4 Infantile Neuroaxonal Dystrophy T047 11768
141 423827005 SNOMEDCT_US C0014245 False 343682 e18c6053-bea2-409a-8e28-70255a0d6624 Endoscopy (procedure) T060 13422
51 52713000 SNOMEDCT_US C0270724 False 438154 797dc390-6484-4b34-94cb-14d1704fe84e Infantile Neuroaxonal Dystrophy T047 4456

In [228]:
sentences[sentences['sent_id'] == 'f918cc4a-2f8b-4c5e-a904-3de84efe714b']


Out[228]:
begin doc_id end sent_id sentence_number xmi_id text cluster
62 1368 333908 1373 f918cc4a-2f8b-4c5e-a904-3de84efe714b 18 133 Plan: 1

In [229]:
notes = pd.read_parquet('data/note-events.parquet', engine='fastparquet')

In [235]:
notes[notes['ROW_ID'] == 333908]['TEXT'].iloc[0][1368:1372]


Out[235]:
'Plan'

Generating Notes

Get all the entities for the document


In [503]:
doc_ids = templates['doc_id'].unique()
notes = notes[notes['ROW_ID'].isin(doc_ids)]
notes = notes.reset_index(drop=True)
# notes = notes.drop(['CHARTDATE','CHARTTIME','STORETIME','CGID','ISERROR'],axis=1)

In [504]:
doc = notes.sample(n=1)
doc_id = doc['ROW_ID'].iloc[0]
doc_id


Out[504]:
361930

Drop templates that contain entities not in the document


In [505]:
ents_in_doc = mentions[mentions['doc_id'] == doc['ROW_ID'].iloc[0]]
ments_in_doc = ents_in_doc.mention_type.unique()
# print(ments_in_doc)
ents_in_doc.head()

# get metions where mention_type is in doc entities types
print(len(mentions))
doc_ments = mentions[mentions.cui.isin(ents_in_doc.cui.unique())]
# print(len(doc_ments))


doc_ments.head()


Out[505]:
begin doc_id end id mention_type ontology_arr sent_id xmi_id text sentence_number cui umls_xmi_id preferred_text template_token cluster
114 876 361930 881 c37ca5d7-74de-47d4-b437-468c1e6b97e6 AnatomicalSiteMention [16821] 0565613d-e39b-4576-b670-e3e88f5cf982 16834 Blood 8 C0005767 16821 Blood AnatomicalSiteMention 33
115 884 361930 898 63a2bbd1-c119-4a6c-ba2c-e016b1c5d4a0 ProcedureMention [15816, 15806] 0565613d-e39b-4576-b670-e3e88f5cf982 15830 urine cultures 8 C0430404 15816 Urine for culture ProcedureMention 33
116 923 361930 934 a42518a0-8b0d-4c47-a566-8bfbf5efc069 MedicationMention [12536] 0565613d-e39b-4576-b670-e3e88f5cf982 12549 Antibiotics 8 C0003232 12536 Antibiotics MedicationMention 33
419 392 361930 405 d1caf679-169f-47c1-8c6a-c1423692fb02 ProcedureMention [16022] 12f3be14-c673-4b85-bebf-faa41151c1b9 16035 stabilization 4 C1293130 16022 Stabilization ProcedureMention 79
420 423 361930 431 aaf7ca5a-7c5e-4d7d-91c1-6760e05a5269 AnatomicalSiteMention [16990] 12f3be14-c673-4b85-bebf-faa41151c1b9 17003 arterial 4 C0003842 16990 Arteries AnatomicalSiteMention 79

In [507]:
# get templates that have the corresponding sentence ids from doc_ments
template_candidates = templates[templates.sent_id.isin(doc_ments.sent_id)]
template_candidates.head()


Out[507]:
sent_id sem_template sentence_number doc_id begin end cluster
0 000142d2-4690-4a56-8a68-89b5831ed2aa ProcedureMention appear 58 408714 3670 3716 18
2 002d5022-70f9-4638-84b1-dcf42a6c0e12 be be follow SignSymptomMention AnatomicalSite... 10 354315 718 801 47
3 00385844-a95d-4bb7-a773-70f78a3b035a continue titrate MedicationMention SignSymptom... 17 432020 1451 1520 32
4 0046177c-aa3b-4c8d-9554-3efea4802687 be give MedicationMention MedicationMention Me... 5 464828 548 618 12
5 0063b06f-e43d-474b-bf27-b67e474b22de report SignSymptomMention have come SignSympto... 30 340820 2199 2277 62

Choose a cluster based on cluster frequency for that sentence position


In [508]:
candidate_cluster_labels = template_candidates.cluster.sort_values().unique()
candidate_clusters = cluster_label_by_sentence_pos.iloc[candidate_cluster_labels]

In [509]:
sent_pos = 0

# remove cluster labels not present in template candidates
selected_cluster = candidate_clusters.sample(
                                        n=1,
                                        weights=candidate_clusters.loc[:,sent_pos]
                        ).iloc[0].name
selected_cluster
# templates_in_cluster = template_candidates[template_candidates['cluster'] == selected_cluster.iloc[0].index]


Out[509]:
26

In [510]:
cluster_templates = template_candidates[template_candidates.cluster == selected_cluster]
cluster_templates.head()


Out[510]:
sent_id sem_template sentence_number doc_id begin end cluster
252 17c47df8-37b5-47cd-a1b4-b056fa6553a9 DiseaseDisorderMention SignSymptomMention 40 333908 1956 1979 26
281 1a3970bd-14fd-49e5-a606-ca7f61fc0eaf SignSymptomMention DiseaseDisorderMention Sign... 0 427806 0 147 26
517 2d836805-7d80-4809-811d-fb3fd0c6c3b1 have DiseaseDisorderMention SignSymptomMention... 14 378629 1135 1220 26
1327 79d5ea31-7213-457a-82af-ed755e165778 give SignSymptomMention SignSymptomMention Dis... 2 442499 255 619 26
1908 b06329da-4fb9-45f6-a384-9e25b425ee7f SignSymptomMention DiseaseDisorderMention Sign... 11 361823 1163 1254 26

Choose a template from the cluster base on frequency for that sentence position


In [511]:
# templates_at_pos = cluster_templates[cluster_templates.sentence_number == sent_pos]
template = cluster_templates.sample(n=1)
template


Out[511]:
sent_id sem_template sentence_number doc_id begin end cluster
252 17c47df8-37b5-47cd-a1b4-b056fa6553a9 DiseaseDisorderMention SignSymptomMention 40 333908 1956 1979 26

In [512]:
# sentences[sentences.sent_id == 'deef8a81-b222-4d1f-aa3f-7dfc160cb428'].iloc[0].text

In [ ]:

Fill template blank

Choosing text

Select text to fill the template blank based on the frequency of strings for the CUI associated with the mention


In [513]:
# get mentions in this template
template_id = template.iloc[0]['sent_id']
ments_in_temp = mentions[mentions.sent_id == template_id]
ments_in_temp
# Get the sentence for that template
raw_sentence = sentences[sentences.sent_id == template_id]
raw_sentence.iloc[0].text

# Select entities from entities in the document that match that entity type
#


Out[513]:
'gtt if hypotension r/t.'

In [514]:
ments_in_temp


Out[514]:
begin doc_id end id mention_type ontology_arr sent_id xmi_id text sentence_number cui umls_xmi_id preferred_text template_token cluster
525 1956 333908 1959 4b0ab019-d02c-4f7b-a363-2a24ead17145 DiseaseDisorderMention [9255, 9245] 17c47df8-37b5-47cd-a1b4-b056fa6553a9 9269 gtt 40 C0042029 9255 Urinary tract infection DiseaseDisorderMention 26
526 1963 333908 1974 62064326-c2c6-4490-ab47-c3117e8bcd79 SignSymptomMention [10289] 17c47df8-37b5-47cd-a1b4-b056fa6553a9 10302 hypotension 40 C0020649 10289 Hypotension SignSymptomMention 26

In [515]:
# ments_in_temp.drop(ments_in_temp.loc[482].name, axis=0)

In [516]:
concepts = umls[umls.cui == ments_in_temp.iloc[0].cui]
concepts.head()


Out[516]:
code coding_scheme cui disambiguated doc_id id preferred_text tui xmi_id
50 68566005 SNOMEDCT_US C0042029 False 453048 ba0d4374-d679-4683-b85a-55e31b20de88 Urinary tract infection T047 9255
154 68566005 SNOMEDCT_US C0042029 False 374110 46b50269-81b1-4055-930f-0cf480fe5868 Urinary tract infection T047 18650

In [517]:
# ents_in_doc

In [ ]:


In [518]:
# txt_counts.sample(n=1, weights=txt_counts.cnt).iloc[0].text

In [644]:
def template_filler(template, sentences, entities, all_mentions):
#     print(template.sem_template)
    num_start = len(entities)
    
    template_id = template.iloc[0]['sent_id']
    
    ments_in_temp = all_mentions[all_mentions.sent_id == template_id]
    
    raw_sentence = sentences[sentences.sent_id == template_id]
#     print(f'raw sent df size: {len(raw_sentence)}')
#     print(template_id)
    sent_begin = raw_sentence.iloc[0].begin
    sent_end = raw_sentence.iloc[0].end

    raw_text = raw_sentence.iloc[0].text
    
    replacements = []
#     rows_to_drop = []

#     print('Mention types in template')
#     print(ments_in_temp.mention_type.unique())
#     print('types in entities')
#     print(entities.mention_type.unique())

    for i, row in ments_in_temp.iterrows():
        ents_subset = entities[entities.mention_type == row.mention_type]

        if len(ents_subset) == 0:
            print('Empty list of doc entities')
            print(entities.mention_type)
            print(row.mention_type)
            break
        rand_ent = ents_subset.sample(n=1)
        entities = entities[entities['id'] != rand_ent.iloc[0]['id']]
#         rows_to_drop.append(rand_ent.iloc[0].name)
        
        ent_cui = rand_ent.iloc[0].cui
#         print(ent_cui)
        span_text = get_text_for_mention(ent_cui, all_mentions)
        replacements.append({
            'text' : span_text,
            'begin' : row.begin - sent_begin,
            'end' : row.end - sent_begin,
        })
        
    new_sentence = ''
    for i, r in enumerate(replacements):
        if i == 0:
            new_sentence += raw_text[0 : r['begin'] ]
        else:
            new_sentence += raw_text[replacements[i-1]['end'] : r['begin']]
        new_sentence += r['text']
    
    if(len(replacements) > 1):
        new_sentence += raw_text[replacements[-1]['end'] : ]
        
    # clean up
    num_end = len(entities)
#     print(f"Dropped {num_start - num_end} rows")
    return new_sentence, entities
    
        
        
# Find all the text associated with the cui of the mention in the template
# choose a text span based on frequency
def get_text_for_mention(cui, mentions):
    txt_counts = mentions[mentions.cui == cui].groupby('text').size().reset_index(name='cnt')
    return txt_counts.sample(n=1, weights=txt_counts.cnt).iloc[0].text

Write a full note


In [657]:
# Select document to write note for
# doc = notes.sample(n=1)
# doc_id = doc['ROW_ID'].iloc[0]
doc_id = 374185
# Get all the entities in the chosen document

ents_in_doc = mentions[mentions['doc_id'] == doc_id]

new_doc_sentences = []
sent_pos = 0

while len(ents_in_doc) > 0:
#     print(f"Sentence position: {sent_pos}")
#     print(f"Length of remaining entities: {len(ents_in_doc)}")
    # Get list of possible mentions based on CUIs found in the document

    mentions_pool = mentions[(mentions.cui.isin(ents_in_doc.cui.unique()))
                            & (mentions.mention_type.isin(ents_in_doc.mention_type.unique()))]

    # Get template pool based on mentions pool
    # TODO: Need to only choose templates where all the mentions are in `ents_in_doc`
    template_candidates = templates[templates.sent_id.isin(mentions_pool.sent_id)]
    
#     ts = len(template_candidates.sent_id.unique())
#     ms = len(mentions_pool.sent_id.unique())
#     print(ts, ms)
    
    def all_ents_present(row, doc_ents, ments_pool):
        # Get mentions in this template
        all_temp_ments = ments_pool[ments_pool['sent_id'] == row['sent_id']]
        available_mentions = all_temp_ments[all_temp_ments['mention_type'].isin(doc_ents['mention_type'])]
        
        return (len(available_mentions) > 0)
        
    mask = template_candidates.apply(all_ents_present,
                                     args=(ents_in_doc, mentions_pool),
                                     axis=1)
    template_candidates = template_candidates[mask]
#     print(f'num templates: {len(template_candidates)}')
    #If there are no more possible templates then break
    if len(template_candidates) == 0:
        break

    # Get candidate clusters based on template pool

    # Remove the cluster labels that aren't present in template bank
    candidate_cluster_labels = template_candidates.cluster.sort_values().unique()
    candidate_clusters = cluster_label_by_sentence_pos.iloc[candidate_cluster_labels]
#     print(f"Num clusters: {len(candidate_clusters)}")
    # Select cluster based on frequency at sentence position
    selected_cluster = None
    try:
        selected_cluster = candidate_clusters.sample(
                                                n=1,
                                                weights=candidate_clusters.loc[:,sent_pos]
                                ).iloc[0].name
    except:
        # It's possible the clusters we chose don't appear at that position
        # so we can choose randomly
#         print('choosing random cluster')
        selected_cluster = candidate_clusters.sample(n=1).iloc[0].name
#     print('selected cluster:')
#     print(selected_cluster)
    cluster_templates = template_candidates[template_candidates.cluster == selected_cluster]

    # Choose template from cluster at random
    template = cluster_templates.sample(n=1)
    template_id = template.iloc[0]['sent_id']
    
    # Get mentions in the template   
    ments_in_temp = mentions[mentions.sent_id == template_id]

    # Write the sentence and update entities found in the document !!!
    t, ents_in_doc = template_filler(template, sentences, ents_in_doc, mentions_pool)
    new_doc_sentences.append(t)
    sent_pos += 1

In [658]:
'\n'.join(new_doc_sentences)


Out[658]:
'Patient transferred in with cooling mattress, temp on adm 100.2\n   axillary\nAssessment:\n    Patient\ns HCT  dropped to 24.4 at 2330 hrs[ aim maintain > 25], NGT\n   in place, initiallt aspirated 20 cc of dark brown blood ,later  got\n   bilious aspirate, HR 90-100bpm b/p stable 100-130 systolic, Started on\n   Clear scopolamine\nPEG\nAssessment:\n    Patient\ns HCT  dropped to 24.4 at 2330 hrs[ aim maintain > 25], NGT\n   in place, initiallt aspirated 20 cc of dark brown blood ,later  got\n   bilious aspirate, HR 90-100bpm b/p stable 100-130 systolic, Started on\n   Clear morphine\nLactulose to keep stools liquid\nBarrier cream to protect maceration,  turn\nEvents :\n          Axillary temp 101.2- Liquid Tylenol\nTITLE:\n   Comfort care (CMO\nAfter the patch has been applied to the back of the  right ear, there\n   has been no secretions & no more Gurgling sounds, patients respirations\n   continue to be shallow but regular.\nO2 changed to\n   aerosol\nPt. given iv T12\nHe was then noted to develop worsening tachypnea\n   and increased oxygen\nIV Morphine given before Turns & positioning'

In [659]:
notes[notes.ROW_ID == 374185].iloc[0].TEXT


Out[659]:
'TITLE:\n   Events :\n          Axillary temp 101.2- Liquid Tylenol given via Peg Tube.\n          Patient very gurgly at the beginning of the shift, has\n   scopolamine patch on him, small thick yellow secretions obtained .\n          IV Morphine given before Turns & positioning.  Gurgling\n   reduced post morphine.\n          Oxygen reduced to 35 % the lowest possible with the aerosol\n   mask as per Resident\ns orders as patient is CMO\n'

Write until all mentions have been used


In [638]:
mentions.groupby('doc_id').size().reset_index(name='cnt').sort_values('cnt').head(10)


Out[638]:
doc_id cnt
77 443064 3
35 366305 5
92 464813 7
46 384296 9
95 476781 10
15 341364 12
87 459534 13
90 464653 13
40 374185 14
64 438151 16

In [617]:
mentions[mentions.doc_id == 476781]


Out[617]:
begin doc_id end id mention_type ontology_arr sent_id xmi_id text sentence_number cui umls_xmi_id preferred_text template_token cluster
1320 111 476781 123 8743a755-7e36-4400-9f8d-14c89d5b0624 DiseaseDisorderMention [1682] 36c214a8-424f-4b54-a3b2-84f6daf1aef1 1695 Bell's palsy 0 C0376175 1682 Bell Palsy DiseaseDisorderMention 43
1321 140 476781 149 50b0dd7b-f6bb-4a15-8b75-4b4a48b72e5a MedicationMention [1569, 1539, 1589, 1559, 1529, 1579, 1549, 1519] 36c214a8-424f-4b54-a3b2-84f6daf1aef1 1609 acyclovir 0 C0001367 1569 Acyclovir MedicationMention 43
3763 299 476781 312 d3679191-ebfc-4a01-8555-34e72d25c81a SignSymptomMention [2007, 1997] 9d69d84d-1184-4bb3-ab7a-bd44c370f375 2021 mental status 2 C0278060 2007 Mental state SignSymptomMention 27
3764 318 476781 326 3ade634c-9672-49a2-99b1-c7151ac1ac2a DiseaseDisorderMention [1780, 1770] 9d69d84d-1184-4bb3-ab7a-bd44c370f375 1794 Delirium 2 C0011206 1780 Delirium DiseaseDisorderMention 27
4731 370 476781 374 7aa86886-5eca-445e-8ee4-765175707413 DiseaseDisorderMention [1726] c63b6ac6-9b5a-4bb7-b5b4-04345e4340bc 1739 Plan 6 C0270724 1726 Infantile Neuroaxonal Dystrophy DiseaseDisorderMention 1
5891 187 476781 194 22b64b29-2690-4c34-bc40-94cb258a6d91 SignSymptomMention [1825] f5898ed8-cf97-4c65-b1f3-4d207548b6de 1838 history 1 C0262926 1825 Medical History SignSymptomMention 38
5892 198 476781 202 fe9fcf13-19f2-4771-b793-553fc9a15f91 AnatomicalSiteMention [2119] f5898ed8-cf97-4c65-b1f3-4d207548b6de 2132 neck 1 C0027530 2119 Neck AnatomicalSiteMention 38
5893 207 476781 220 55dead84-b220-4019-bb86-458c9bb85ec7 SignSymptomMention [1911] f5898ed8-cf97-4c65-b1f3-4d207548b6de 1924 shoulder pain 1 C0037011 1911 Shoulder Pain SignSymptomMention 38
5894 260 476781 273 be575703-158a-4e44-81a7-a538f1bfb8a4 MedicationMention [1639] f5898ed8-cf97-4c65-b1f3-4d207548b6de 1652 oral morphine 1 C0026549 1639 Morphine MedicationMention 38
5895 278 476781 286 2e34af2e-74c5-46d2-8036-944bb889dc6a MedicationMention [1377, 1367] f5898ed8-cf97-4c65-b1f3-4d207548b6de 1391 percocet 1 C0086787 1377 Percocet MedicationMention 38

In [ ]: