In [151]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from fastparquet import ParquetFile
import os
from ctakes_xml import CtakesXmlParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, MiniBatchKMeans
import matplotlib
%matplotlib inline
In [ ]:
In [177]:
# do the reading
templates = pd.read_parquet('data/processed_dfs/templates.parquet')
sentences = pd.read_parquet('data/processed_dfs/sentences.parquet')
mentions = pd.read_parquet('data/processed_dfs/mentions.parquet')
umls = pd.read_parquet('data/processed_dfs/umls.parquet')
In [179]:
sentences.head()
Out[179]:
In [180]:
mentions.head()
Out[180]:
In [181]:
templates.head()
Out[181]:
1. Make an empty data frame with the fields to hold template info
2. For each sentence:
    * Get the predicates for that sentence
    * Trim the frameset after the '.'
    * Get the mentions
    * Get the mention type
    * Append the UMLS CUI to the end of the mention (just take the first one)
    * Order the predicates and mentions by begin offset
    * Combine into a string separated by spaces
    * Write the template and semantic template to the dataframe
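An illustrative sketch of these steps is below. It is not the code that produced templates.parquet; it assumes a hypothetical `predicates` dataframe (with `sent_id`, `begin`, and `frameset` columns) from the cTAKES parse, and that the semantic template joins mention types with their first CUI.
In [ ]:
# Sketch only -- the real preprocessing happened upstream of this notebook.
# `predicates` is a hypothetical dataframe and is not loaded here.
rows = []
for _, sent in sentences.iterrows():
    # predicates for this sentence, with the frameset trimmed after the '.'
    preds = predicates[predicates.sent_id == sent.sent_id].copy()
    preds['token'] = preds.frameset.str.split('.').str[0]

    # mentions for this sentence: mention type with the (first) UMLS CUI appended
    ments = mentions[mentions.sent_id == sent.sent_id].copy()
    ments['token'] = ments.mention_type + '_' + ments.cui

    # order predicates and mentions by begin offset, then join with spaces
    tokens = pd.concat([preds[['begin', 'token']], ments[['begin', 'token']]])
    sem_template = ' '.join(tokens.sort_values('begin').token)
    rows.append({'sent_id': sent.sent_id, 'sem_template': sem_template})

# templates = pd.DataFrame(rows)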
In [186]:
print(len(templates))
# templates = templates.drop_duplicates('sem_template')
# print(len(templates))
In [188]:
def get_vectors(df):
    tf = TfidfVectorizer()
    return tf.fit_transform(df['sem_template'])
# Only use unique templates
vectors = get_vectors(templates)
vecd = vectors.todense()
print(vectors.shape)
In [189]:
cluster_sizes = [70, 80, 90, 100, 110, 120, 125, 130, 140, 150, 200]
for n_cluster in cluster_sizes:
    km = KMeans(init='k-means++', max_iter=100, n_init=1,
                n_clusters=n_cluster, verbose=False)
    km.fit(vectors)
    predictions = km.predict(vectors)
    sil_score = silhouette_score(vectors, predictions, metric='euclidean')
    print(f"Silhouette score for n_clusters={n_cluster}:")
    print(sil_score)

km = KMeans(init='k-means++', max_iter=100, n_init=1,
            n_clusters=120, verbose=False)
km.fit(vectors)
predictions = km.predict(vectors)
sil_score = silhouette_score(vectors, predictions, metric='euclidean')

# print(km.cluster_centers_.shape)
# order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# terms = tf.get_feature_names()
# for i in range(50):
#     print("Cluster %d:" % i, end='')
#     for ind in order_centroids[i, :15]:
#         print(' %s' % terms[ind], end='')
#     print()
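To compare these silhouette scores side by side, here is a small optional sketch (assuming `vectors` and `cluster_sizes` from the cells above) that refits each model, collects the scores, and plots them:
In [ ]:
import matplotlib.pyplot as plt

# Collect the silhouette score for each candidate cluster count and plot the curve
sil_scores = []
for n_cluster in cluster_sizes:
    km_tmp = KMeans(init='k-means++', max_iter=100, n_init=1, n_clusters=n_cluster)
    labels = km_tmp.fit_predict(vectors)
    sil_scores.append(silhouette_score(vectors, labels, metric='euclidean'))

plt.plot(cluster_sizes, sil_scores, marker='o')
plt.xlabel('number of clusters')
plt.ylabel('silhouette score')
plt.show()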
In [190]:
predictions = km.predict(vectors)
In [191]:
silhouette_score(vectors, predictions, metric='euclidean')
Out[191]:
In [192]:
templates['cluster'] = predictions
In [193]:
templates.head()
Out[193]:
In [194]:
sentences.shape
Out[194]:
In [195]:
sentences = sentences.merge(templates[['sent_id', 'cluster']], on='sent_id')
mentions = mentions.merge(templates[['sent_id', 'cluster']], on='sent_id')
In [196]:
sentences.head()
Out[196]:
In [197]:
mentions.head()
Out[197]:
In [198]:
pdf = pd.DataFrame(predictions, columns=['cluster'])
In [199]:
cluster_counts = pdf.groupby('cluster').size().reset_index(name='count')
cluster_counts['count'].plot(kind='bar')
Out[199]:
In [200]:
cluster_counts['frequency'] = cluster_counts['count'] / cluster_counts['count'].sum()
cluster_counts.head()
Out[200]:
In [202]:
cui_clust_freq = mentions.groupby(['cui', 'cluster']).size().reset_index(name='cluster_count')
cui_clust_freq.sort_values('cluster_count', ascending=False).head(10)
Out[202]:
In [205]:
num_clusters_per_cui = cui_clust_freq.groupby('cui').size().reset_index(name='num_clusters')
# avg_num_clusters = .agg({'num_clusters': 'mean'})
num_clusters_per_cui.sort_values('num_clusters', ascending=False).head(10)
Out[205]:
In [206]:
print("Max number of clusters that a cui appears in")
print(num_clusters_per_cui.agg({'num_clusters': 'max'}))
print('Average number of clusters that cuis appear in:')
print(num_clusters_per_cui.agg({'num_clusters': 'mean'}))
In [213]:
max_clusters = num_clusters_per_cui[num_clusters_per_cui['num_clusters'] == 23]
max_clusters
Out[213]:
In [214]:
mentions[mentions['cui'].isin(max_clusters['cui'])]['preferred_text'].unique()
Out[214]:
In [216]:
num_cuis_in_cluster_freq = cui_clust_freq[['cui', 'cluster']] \
    .groupby('cluster') \
    .size() \
    .reset_index(name="num_cuis_in_cluster")
num_cuis_in_cluster_freq.sort_values('num_cuis_in_cluster', ascending=False)
Out[216]:
In [217]:
num_cuis_in_cluster_freq.agg({'num_cuis_in_cluster': 'mean'})
Out[217]:
In [218]:
cluster_label_by_sentence_pos = pd.crosstab(templates['cluster'],
                                            templates['sentence_number']
                                            ).apply(lambda x: x / x.sum(), axis=0)
cluster_label_by_sentence_pos
Out[218]:
In [219]:
mentions[mentions['cluster'] == 1]
Out[219]:
In [223]:
umls[umls['xmi_id'].isin([17309, 11768, 11337, 4456, 15539, 16616, 10061, 13422]) ]
Out[223]:
In [228]:
sentences[sentences['sent_id'] == 'f918cc4a-2f8b-4c5e-a904-3de84efe714b']
Out[228]:
In [229]:
notes = pd.read_parquet('data/note-events.parquet', engine='fastparquet')
In [235]:
notes[notes['ROW_ID'] == 333908]['TEXT'].iloc[0][1368:1372]
Out[235]:
In [503]:
doc_ids = templates['doc_id'].unique()
notes = notes[notes['ROW_ID'].isin(doc_ids)]
notes = notes.reset_index(drop=True)
# notes = notes.drop(['CHARTDATE','CHARTTIME','STORETIME','CGID','ISERROR'],axis=1)
In [504]:
doc = notes.sample(n=1)
doc_id = doc['ROW_ID'].iloc[0]
doc_id
Out[504]:
In [505]:
ents_in_doc = mentions[mentions['doc_id'] == doc['ROW_ID'].iloc[0]]
ments_in_doc = ents_in_doc.mention_type.unique()
# print(ments_in_doc)
ents_in_doc.head()
# get mentions whose mention_type is among the document's entity types
print(len(mentions))
doc_ments = mentions[mentions.cui.isin(ents_in_doc.cui.unique())]
# print(len(doc_ments))
doc_ments.head()
Out[505]:
In [507]:
# get templates that have the corresponding sentence ids from doc_ments
template_candidates = templates[templates.sent_id.isin(doc_ments.sent_id)]
template_candidates.head()
Out[507]:
In [508]:
candidate_cluster_labels = template_candidates.cluster.sort_values().unique()
candidate_clusters = cluster_label_by_sentence_pos.iloc[candidate_cluster_labels]
In [509]:
sent_pos = 0
# remove cluster labels not present in template candidates
selected_cluster = candidate_clusters.sample(
    n=1,
    weights=candidate_clusters.loc[:, sent_pos]
).iloc[0].name
selected_cluster
# templates_in_cluster = template_candidates[template_candidates['cluster'] == selected_cluster.iloc[0].index]
Out[509]:
In [510]:
cluster_templates = template_candidates[template_candidates.cluster == selected_cluster]
cluster_templates.head()
Out[510]:
In [511]:
# templates_at_pos = cluster_templates[cluster_templates.sentence_number == sent_pos]
template = cluster_templates.sample(n=1)
template
Out[511]:
In [512]:
# sentences[sentences.sent_id == 'deef8a81-b222-4d1f-aa3f-7dfc160cb428'].iloc[0].text
In [ ]:
In [513]:
# get mentions in this template
template_id = template.iloc[0]['sent_id']
ments_in_temp = mentions[mentions.sent_id == template_id]
ments_in_temp
# Get the sentence for that template
raw_sentence = sentences[sentences.sent_id == template_id]
raw_sentence.iloc[0].text
# Select entities from entities in the document that match that entity type
#
Out[513]:
In [514]:
ments_in_temp
Out[514]:
In [515]:
# ments_in_temp.drop(ments_in_temp.loc[482].name, axis=0)
In [516]:
concepts = umls[umls.cui == ments_in_temp.iloc[0].cui]
concepts.head()
Out[516]:
In [517]:
# ents_in_doc
In [ ]:
In [518]:
# txt_counts.sample(n=1, weights=txt_counts.cnt).iloc[0].text
In [644]:
def template_filler(template, sentences, entities, all_mentions):
    # print(template.sem_template)
    num_start = len(entities)
    template_id = template.iloc[0]['sent_id']
    ments_in_temp = all_mentions[all_mentions.sent_id == template_id]
    raw_sentence = sentences[sentences.sent_id == template_id]
    # print(f'raw sent df size: {len(raw_sentence)}')
    # print(template_id)
    sent_begin = raw_sentence.iloc[0].begin
    sent_end = raw_sentence.iloc[0].end
    raw_text = raw_sentence.iloc[0].text

    # For each mention slot in the template, pick a document entity of the same
    # mention type and a surface form for its CUI
    replacements = []
    # rows_to_drop = []
    # print('Mention types in template')
    # print(ments_in_temp.mention_type.unique())
    # print('types in entities')
    # print(entities.mention_type.unique())
    for i, row in ments_in_temp.iterrows():
        ents_subset = entities[entities.mention_type == row.mention_type]
        if len(ents_subset) == 0:
            print('Empty list of doc entities')
            print(entities.mention_type)
            print(row.mention_type)
            break
        rand_ent = ents_subset.sample(n=1)
        # Remove the chosen entity so it isn't reused in later sentences
        entities = entities[entities['id'] != rand_ent.iloc[0]['id']]
        # rows_to_drop.append(rand_ent.iloc[0].name)
        ent_cui = rand_ent.iloc[0].cui
        # print(ent_cui)
        span_text = get_text_for_mention(ent_cui, all_mentions)
        replacements.append({
            'text': span_text,
            'begin': row.begin - sent_begin,
            'end': row.end - sent_begin,
        })

    # Splice the replacement spans into the raw sentence text
    new_sentence = ''
    for i, r in enumerate(replacements):
        if i == 0:
            new_sentence += raw_text[0:r['begin']]
        else:
            new_sentence += raw_text[replacements[i-1]['end']:r['begin']]
        new_sentence += r['text']
    if len(replacements) > 0:
        new_sentence += raw_text[replacements[-1]['end']:]

    # clean up
    num_end = len(entities)
    # print(f"Dropped {num_start - num_end} rows")
    return new_sentence, entities


# Find all the text spans associated with the CUI of the mention in the template
# and choose one weighted by frequency
def get_text_for_mention(cui, mentions):
    txt_counts = mentions[mentions.cui == cui].groupby('text').size().reset_index(name='cnt')
    return txt_counts.sample(n=1, weights=txt_counts.cnt).iloc[0].text
In [657]:
# Select document to write note for
# doc = notes.sample(n=1)
# doc_id = doc['ROW_ID'].iloc[0]
doc_id = 374185

# Get all the entities in the chosen document
ents_in_doc = mentions[mentions['doc_id'] == doc_id]

new_doc_sentences = []
sent_pos = 0
while len(ents_in_doc) > 0:
    # print(f"Sentence position: {sent_pos}")
    # print(f"Length of remaining entities: {len(ents_in_doc)}")

    # Get list of possible mentions based on CUIs found in the document
    mentions_pool = mentions[(mentions.cui.isin(ents_in_doc.cui.unique()))
                             & (mentions.mention_type.isin(ents_in_doc.mention_type.unique()))]

    # Get template pool based on mentions pool
    # TODO: Need to only choose templates where all the mentions are in `ents_in_doc`
    template_candidates = templates[templates.sent_id.isin(mentions_pool.sent_id)]
    # ts = len(template_candidates.sent_id.unique())
    # ms = len(mentions_pool.sent_id.unique())
    # print(ts, ms)

    # Keep only templates with at least one mention type fillable from the document
    def all_ents_present(row, doc_ents, ments_pool):
        # Get mentions in this template
        all_temp_ments = ments_pool[ments_pool['sent_id'] == row['sent_id']]
        available_mentions = all_temp_ments[all_temp_ments['mention_type'].isin(doc_ents['mention_type'])]
        return len(available_mentions) > 0

    mask = template_candidates.apply(all_ents_present,
                                     args=(ents_in_doc, mentions_pool),
                                     axis=1)
    template_candidates = template_candidates[mask]
    # print(f'num templates: {len(template_candidates)}')

    # If there are no more possible templates, then break
    if len(template_candidates) == 0:
        break

    # Get candidate clusters based on template pool
    # Remove the cluster labels that aren't present in the template bank
    candidate_cluster_labels = template_candidates.cluster.sort_values().unique()
    candidate_clusters = cluster_label_by_sentence_pos.iloc[candidate_cluster_labels]
    # print(f"Num clusters: {len(candidate_clusters)}")

    # Select cluster based on frequency at this sentence position
    selected_cluster = None
    try:
        selected_cluster = candidate_clusters.sample(
            n=1,
            weights=candidate_clusters.loc[:, sent_pos]
        ).iloc[0].name
    except Exception:
        # It's possible the clusters we chose don't appear at that position,
        # so choose one at random
        # print('choosing random cluster')
        selected_cluster = candidate_clusters.sample(n=1).iloc[0].name
    # print('selected cluster:')
    # print(selected_cluster)

    cluster_templates = template_candidates[template_candidates.cluster == selected_cluster]

    # Choose template from cluster at random
    template = cluster_templates.sample(n=1)
    template_id = template.iloc[0]['sent_id']

    # Get mentions in the template
    ments_in_temp = mentions[mentions.sent_id == template_id]

    # Write the sentence and update the entities remaining in the document
    t, ents_in_doc = template_filler(template, sentences, ents_in_doc, mentions_pool)
    new_doc_sentences.append(t)
    sent_pos += 1
In [658]:
'\n'.join(new_doc_sentences)
Out[658]:
In [659]:
notes[notes.ROW_ID == 374185].iloc[0].TEXT
Out[659]:
In [638]:
mentions.groupby('doc_id').size().reset_index(name='cnt').sort_values('cnt').head(10)
Out[638]:
In [617]:
mentions[mentions.doc_id == 476781]
Out[617]:
In [ ]: