In [34]:
import logging

# gensim docs recommend configuring logging like this, but basicConfig belongs to the
# logging module (not to a logger instance) and it has no visible effect inside Jupyter
##logger = logging.getLogger(__name__)
##logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# found that the following works - though output goes to the console, not the Jupyter cell output
logger = logging.getLogger()
logger.setLevel(logging.INFO)
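
A minimal sketch (my addition, assuming only the standard library logging API, not anything gensim-specific) for getting the INFO messages into the notebook itself: attach a StreamHandler pointed at sys.stdout to the root logger.

import logging
import sys

# write log records to the notebook's stdout instead of the server console
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s : %(levelname)s : %(message)s'))

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(handler)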

In [35]:
from gensim import corpora, models, similarities
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [36]:
import sqlite3
import os
import sys
#sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'ctpa.sqlite')
sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'mimic3.sqlite')
if not (os.path.exists(sqlitedb)):
    print("Specified database does not exist")
    sys.exit()

connection = sqlite3.connect(sqlitedb)
with connection:
    cur = connection.cursor()
#    cur.execute('select * from reports')
    cur.execute("select text from noteevents where category = 'Radiology' and text like '%pulmonary%'")
#    col_names = [cn[0] for cn in cur.description]
    rows = cur.fetchall()
    #print(len(rows[0]))
    #print("%s %s %s %s %s %s" % (col_names[0], col_names[1], col_names[2], col_names[3], col_names[4], col_names[5]))

    documents = []
    for row in rows:
#        d = row[4]
        documents.append(row[0])
    print('Read', len(documents), 'documents.')


Read 139834 documents.
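
Side note (not from the original run): with ~140k rows, fetchall() materializes every row at once; iterating the cursor streams rows lazily and keeps peak memory lower. A sketch against the same query:

documents = []
cur.execute("select text from noteevents where category = 'Radiology' and text like '%pulmonary%'")
for (text,) in cur:   # sqlite3 cursors are iterable row by row
    documents.append(text)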

In [37]:
import nltk.data
from nltk.tokenize import word_tokenize
import re

counter = 0
training_sentences = []
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# really need to parallelize this - silly to process only one document at a time!
# (a rough multiprocessing sketch follows after this cell)
for document in documents:
    # convert runs of underscores into a period so they trigger a sentence boundary in NLTK
    document = re.sub( '___+', '.', document)

    counter += 1
    # Use the punkt tokenizer pre-trained on English text to improve sentence splitting;
    # a custom tokenizer that understands radiology report sections would be better,
    # but I think this may be good enough for now.
    output = sent_tokenizer.tokenize(document)
    
    # NLTK word tokenizer; handles punctuation better than a plain split, but I don't like
    # how "we'll" becomes the two tokens "we" and "'ll"
    output = [word_tokenize(o.lower()) for o in output]    
    # alternative std python split function - this is much faster than the NLTK splitter
    #output = [o.lower().split() for o in output]

    if (counter % 1000 == 0):
        logger.info('Processed ' + str(counter) + ' documents.')

    for o in output:
        training_sentences.append(o)
    
#pp.pprint(training_sentences)
print('Total documents:', counter, '(should agree with previous number of documents.)')
print('Total sentences:', len(training_sentences))


Total documents: 139834 (should agree with previous number of documents.)
Total sentences: 2393790
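
A rough sketch of the parallelization mentioned in the cell above, using multiprocessing.Pool (my addition; the worker count and chunksize are arbitrary, and a function defined in a notebook may need the 'fork' start method or a separate module to be picklable):

import itertools
import re
from multiprocessing import Pool
import nltk.data
from nltk.tokenize import word_tokenize

def split_document(document):
    # each call loads its own punkt tokenizer; caching one per worker would be faster
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    document = re.sub('___+', '.', document)
    return [word_tokenize(s.lower()) for s in tokenizer.tokenize(document)]

with Pool(processes=4) as pool:
    per_doc = pool.map(split_document, documents, chunksize=100)

training_sentences = list(itertools.chain.from_iterable(per_doc))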

In [38]:
from gensim.models import word2vec, Phrases
retrain = True
if retrain:
    # Set values for various parameters, starting point provided by 
    # https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    num_features = 300    # Word vector dimensionality                      
    min_word_count = 5    # Minimum word count, default is 5
    num_workers = 4       # Number of threads to run in parallel
    context = 20          # Context window size - set large because some report sections
                          #    aren't prose but mostly shorthand notation
    # default sample = 1e-3   # Downsample setting for frequent words

    # from the gensim documentation:
    #   Note that there is a gensim.models.phrases module which lets you automatically detect
    #    phrases longer than one word. Using phrases, you can learn a word2vec model where 
    #    “words” are actually multiword expressions, such as new_york_times or financial_crisis:
    bigram_transformer = Phrases(training_sentences)
    trigram_transformer = Phrases(bigram_transformer[training_sentences])
    #model = Word2Vec(bigram_transformer[sentences], size=100, ...)

    model = word2vec.Word2Vec(trigram_transformer[bigram_transformer[training_sentences]],
                              workers=num_workers,
                              size=num_features,
                              min_count=min_word_count,
                              window=context,
                              negative=0, hs=1) # use hierarchical softmax
    model.save(os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'word2vec_mimic3.model'))
else:
    model = word2vec.Word2Vec.load(os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'word2vec_mimic3.model'))

print('Model ready for use.')


Model ready for use.
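
A quick way (my addition, with made-up tokens) to see what the Phrases transformers actually do is to push a single tokenized sentence through them; collocations that clear the scoring threshold come back joined with underscores:

sample = ['no', 'evidence', 'of', 'pulmonary', 'embolism', 'or', 'aortic', 'dissection']
print(trigram_transformer[bigram_transformer[sample]])
# e.g. ['no', 'evidence', 'of', 'pulmonary_embolism', 'or', 'aortic_dissection'],
# depending on the phrase statistics learned from the corpus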

In [42]:
#model.most_similar("pulmonary_embolism", topn=5)
model.most_similar("hypertension", topn=5)


Out[42]:
[('arterial_hypertension', 0.6557725667953491),
 ('diabetes', 0.5016984343528748),
 ('copd', 0.45498859882354736),
 ('end-_stage_renal', 0.4476727247238159),
 ('artery', 0.44748634099960327)]

In [30]:
model.most_similar_cosmul("pulmonary_embolism", topn=5)


Out[30]:
[('pulmonary_embolus', 0.8827810883522034),
 ('recurrence', 0.8298359513282776),
 ('cervical_spine_fracture', 0.8218733668327332),
 ('intraarterial_filling_defects', 0.8035216927528381),
 ('aortic_dissection', 0.7832710146903992)]

In [31]:
model.doesnt_match("embolism fracture pulmonary lung".split())


Out[31]:
'fracture'

In [32]:
model.similarity('heart', 'pulmonary')


Out[32]:
-0.056489989317015918

In [24]:
model['pulmonary']


Out[24]:
array([ 0.12226088, -0.09986808,  0.15960208,  0.15728202, -0.08715954,
       -0.07322888,  0.10454237,  0.02746133, -0.214715  ,  0.25200212,
        0.28033361,  0.07777113,  0.14724384,  0.0305615 ,  0.20448543,
        0.23157008,  0.05041288, -0.06596602,  0.07892799,  0.05021708,
        0.16758342,  0.07257286, -0.01204597, -0.01905949, -0.08105014,
       -0.29252869, -0.39353162,  0.00816245,  0.03349591,  0.13277838,
       -0.08022843, -0.00757302,  0.19938199,  0.08181747,  0.06211767,
        0.07793283, -0.25264481,  0.0972663 ,  0.01344704, -0.12573668,
       -0.20251122,  0.00797517, -0.04851846,  0.02214212, -0.21393654,
       -0.00628168,  0.05975638,  0.1591824 ,  0.18776827,  0.0649076 ,
        0.07142299,  0.01134679,  0.1108847 ,  0.18774967, -0.02545563,
       -0.17305388,  0.26548243, -0.24695414,  0.10063036,  0.14536436,
       -0.42197403, -0.04089982,  0.12709691, -0.00258544,  0.10918406,
        0.07942567, -0.06314755,  0.11329138, -0.10015193,  0.13069156,
        0.31571212,  0.05686886,  0.21054435,  0.22091338,  0.0335255 ,
        0.2536377 ,  0.04367675,  0.32792476,  0.06621389,  0.1492683 ,
       -0.3089363 ,  0.15556958, -0.05699058,  0.00553698,  0.09595138,
        0.12146559,  0.02762809, -0.08461247,  0.15088923, -0.09796299,
       -0.00603593, -0.03895748,  0.09064839, -0.22086756, -0.03820214,
        0.01015452, -0.13967834,  0.16106416, -0.01523545, -0.25771606,
        0.32416099,  0.01479305, -0.07595318,  0.07511736, -0.20908673,
       -0.06032772,  0.26524153, -0.08089854, -0.13385627,  0.14957635,
       -0.14551714,  0.02103581, -0.03247695, -0.04327335, -0.09787235,
        0.14632085,  0.01649678, -0.18571913,  0.09965235,  0.03978478,
        0.02071232,  0.04865055,  0.13037261,  0.31955194,  0.25224614,
       -0.05751751, -0.20179859,  0.05445533,  0.27734971, -0.37645781,
       -0.03637466, -0.07002191,  0.0197606 ,  0.00839588,  0.04548286,
       -0.08266832, -0.01026484, -0.04422405, -0.15506136,  0.10223421,
       -0.07981584, -0.29359683, -0.0858044 ,  0.41271785, -0.24022067,
        0.10252051,  0.1035196 ,  0.07486682, -0.05112133, -0.04633168,
       -0.02969544, -0.37957633, -0.0256737 ,  0.10438021, -0.22620313,
       -0.15971565, -0.22676735, -0.06539095, -0.15188177,  0.08308921,
       -0.12288539,  0.24181829,  0.04772521,  0.18882254,  0.05270986,
       -0.37288368,  0.0732641 , -0.04642523, -0.09939796, -0.06363439,
       -0.36053756, -0.15154901, -0.04289202, -0.08593476, -0.07837287,
        0.24763219,  0.15863261,  0.30836496, -0.15884027,  0.0008045 ,
       -0.15720174, -0.07743108, -0.11350256, -0.1650836 ,  0.17749286,
        0.04892087,  0.02757153, -0.00239981,  0.03075592,  0.13723376,
        0.04719549,  0.18788388, -0.01978051,  0.0328429 , -0.26177388,
        0.12810703, -0.10947601,  0.2282974 , -0.36474645,  0.00808131,
       -0.05483712, -0.05975857, -0.18486916, -0.22394729, -0.07666639,
       -0.15740465,  0.30660141,  0.10442688,  0.11119017, -0.00767022,
       -0.05947855, -0.14273612,  0.04357133,  0.07601267, -0.16304599,
       -0.13430668, -0.02514091, -0.32170734, -0.05674896,  0.09926035,
        0.01452112, -0.10121953,  0.12907296, -0.11565642,  0.08598266,
       -0.01356647, -0.17235674,  0.0032945 ,  0.14380185,  0.25918767,
        0.27545291, -0.10017081, -0.02481437,  0.01211742, -0.10743919,
       -0.00944406, -0.1345672 ,  0.10850019,  0.10586335, -0.23646387,
        0.00601967,  0.20303139,  0.00976667, -0.23724139,  0.27390608,
        0.12438907,  0.27328652, -0.08401756, -0.03968932, -0.09428386,
        0.07584975,  0.1524462 , -0.03698736,  0.12575489,  0.17137735,
        0.18789175,  0.12138759,  0.10129077, -0.04173847, -0.04464551,
        0.08875333,  0.1559519 ,  0.11329752, -0.10297976, -0.18194458,
        0.26576969,  0.08745864, -0.01799619,  0.12606004,  0.16077569,
        0.2296339 , -0.11126439,  0.06221353,  0.02571889, -0.18824732,
       -0.06195777,  0.00063869, -0.10814091,  0.20312093, -0.06103781,
        0.11337466,  0.20742531, -0.11189275, -0.06304377, -0.1174637 ,
       -0.09097233, -0.05173945, -0.10628837, -0.00852184,  0.20125821,
        0.15775685,  0.15578361,  0.02953441, -0.00170083, -0.01776535,
       -0.03023959,  0.05414073,  0.14892603,  0.03209312, -0.06848533], dtype=float32)
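
Sanity check (my addition): the raw vector should have the dimensionality requested at training time.

print(model['pulmonary'].shape)   # expect (300,), matching num_features above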

In [ ]: