In [ ]:
import logging

# the gensim docs recommend logging.basicConfig(...), but in Jupyter it is often a
# no-op because the kernel has usually already attached a handler to the root logger
##logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# raising the root logger's level does work, though the INFO output goes to the
# console running the kernel, not to the notebook cell output
logger = logging.getLogger()
logger.setLevel(logging.INFO)
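
A hedged sketch of one way to get gensim's INFO messages into the notebook output rather than the kernel's console: attach a StreamHandler bound to sys.stdout to the root logger. This is a workaround of mine, not something from the gensim docs.

In [ ]:
import sys

# Attach a stdout handler so INFO messages appear in the cell output,
# not just on the console running the kernel.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s : %(levelname)s : %(message)s'))
logger.addHandler(handler)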

In [ ]:
from gensim import corpora, models, similarities
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [ ]:
import sqlite3
import os
import sys   # needed for sys.exit() below
#sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'ctpa.sqlite')
sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'mimic3.sqlite')
if not (os.path.exists(sqlitedb)):
    print("Specified database does not exist")
    sys.exit()

connection = sqlite3.connect(sqlitedb)
with connection:
    cur = connection.cursor()
#    cur.execute('select * from reports')
    cur.execute("select text from noteevents where category = 'Radiology'")
#    col_names = [cn[0] for cn in cur.description]
    rows = cur.fetchall()
    #print(len(rows[0]))
    #print("%s %s %s %s %s %s" % (col_names[0], col_names[1], col_names[2], col_names[3], col_names[4], col_names[5]))

    documents = []
    for row in rows:
#        d = row[4]
        documents.append(row[0])
    print('Read', len(documents), 'documents.')
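
The fetchall() above materializes every radiology report before any processing starts. Since the MIMIC-III noteevents table is large, a generator that streams rows from the cursor would let the tokenization loop below consume reports one at a time. A hedged sketch (the function name is my own):

In [ ]:
# Hedged sketch: yield report texts one at a time instead of fetchall(),
# so downstream loops can process documents as they stream from SQLite.
def iter_radiology_reports(db_path):
    connection = sqlite3.connect(db_path)
    try:
        cur = connection.cursor()
        cur.execute("select text from noteevents where category = 'Radiology'")
        for (text,) in cur:   # a cursor iterates over result rows
            yield text
    finally:
        connection.close()

# usage: for document in iter_radiology_reports(sqlitedb): ...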

In [ ]:
import nltk.data
from nltk.tokenize import word_tokenize
import re

counter = 0
training_sentences = []
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# really need to parallelize this - silly to only process one document at a time!
# (a multiprocessing sketch follows after this cell)
for document in documents:
    # convert lines of underscores into period so they trigger a sentence boundary with NLTK
    document = re.sub( '___+', '.', document)

    counter += 1
    # The punkt tokenizer (loaded above) is pre-trained on English text and improves
    # sentence splitting; a custom tokenizer that understands radiology report
    # sections would be better, but this is probably good enough for now.
    output = sent_tokenizer.tokenize(document)
    
    # NLTK word tokenizer; handles punctuation better than a plain split(), but
    # contractions like "we'll" become two tokens: "we" and "'ll"
    output = [word_tokenize(o.lower()) for o in output]    
    # alternative std python split function - this is much faster than the NLTK splitter
    #output = [o.lower().split() for o in output]

    if (counter % 10000 == 0):
        logger.info('Processed ' + str(counter) + ' documents.')

    for o in output:
        training_sentences.append(o)
    
#pp.pprint(training_sentences)
print('Total documents:', counter, '(should match the number of documents read above)')
print('Total sentences:', len(training_sentences))
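
The per-document loop above is single-threaded. On a fork-based platform (Linux or macOS) the same work can be farmed out to a multiprocessing pool, with the already-loaded punkt tokenizer inherited by the worker processes. A hedged sketch (tokenize_document, the worker count, and the chunk size are my own choices):

In [ ]:
# Hedged sketch: tokenize documents in parallel with a process pool.
# Assumes re, sent_tokenizer, word_tokenize and documents from the cells above.
from multiprocessing import Pool
import itertools

def tokenize_document(document):
    # same per-document steps as the serial loop above
    document = re.sub('___+', '.', document)
    sentences = sent_tokenizer.tokenize(document)
    return [word_tokenize(s.lower()) for s in sentences]

with Pool(processes=4) as pool:
    per_document = pool.map(tokenize_document, documents, chunksize=100)

training_sentences = list(itertools.chain.from_iterable(per_document))
print('Total sentences:', len(training_sentences))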

In [ ]:
# only want 'words' that have either letters or numbers - exclude items that are only punctuation or other symbols.
real_word = re.compile(r'.*\w+.*')
total_word_count = 0
for s in training_sentences:
    for w in s:
        if real_word.match(w):
            total_word_count += 1
            
print('Total words:', total_word_count)
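
The min_word_count cutoff used for training below is easier to choose with a frequency distribution in hand. A hedged sketch using collections.Counter:

In [ ]:
# Hedged sketch: tally token frequencies to see how many terms a given
# min_count cutoff would exclude from the word2vec vocabulary.
from collections import Counter

word_counts = Counter(w for s in training_sentences for w in s if real_word.match(w))
print('Distinct words:', len(word_counts))
print('Words occurring fewer than 10 times:',
      sum(1 for c in word_counts.values() if c < 10))
print('Most common:', word_counts.most_common(20))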

In [ ]:
import sqlite3
import os
from gensim.models import word2vec, Phrases
retrain = False
if retrain:
    # Set values for various parameters, starting point provided by 
    # https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    size = 300            # Word vector dimensionality, previously this was "num_features"                      
    min_word_count = 10   # Minimum word count, default is 5
    num_workers = 4       # Number of threads to run in parallel
    context = 20          # Context window size - set large because some report sections
                          #    aren't prose but are instead mostly shorthand notation
    # default sample = 1e-3   # Downsample setting for frequent words

    # from the gensim documentation:
    #   Note that there is a gensim.models.phrases module which lets you automatically detect
    #    phrases longer than one word. Using phrases, you can learn a word2vec model where 
    #    “words” are actually multiword expressions, such as new_york_times or financial_crisis:
    bigram_transformer = Phrases(training_sentences)
    trigram_transformer = Phrases(bigram_transformer[training_sentences])
    #model = Word2Vec(bigram_transformer[sentences], size=100, ...)

    model = word2vec.Word2Vec(trigram_transformer[bigram_transformer[training_sentences]],
                              workers=num_workers,
                              size=size,
                              min_count=min_word_count,
                              window=context)
    model.save(os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'word2vec_full_radiology.model'))
else:
    model = word2vec.Word2Vec.load(os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'word2vec_full_radiology.model'))

print('Model ready for use.')
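
Queries such as most_similar('pulmonary_embolism') below only work because the Phrases transformers merged those tokens during training; when retrain is False the transformers are never recreated, so they cannot be applied to new text. A hedged sketch of persisting them alongside the model (the file names are illustrative, not from this notebook):

In [ ]:
# Hedged sketch: save/load the phrase detectors next to the word2vec model so
# the same bigram/trigram merging can later be applied to unseen text.
phrases_dir = os.path.join(os.path.expanduser('~'), 'Box Sync', 'GradSchoolStuff',
                           'MastersProject', 'mimic3')
if retrain:
    bigram_transformer.save(os.path.join(phrases_dir, 'bigram.phrases'))
    trigram_transformer.save(os.path.join(phrases_dir, 'trigram.phrases'))
else:
    bigram_transformer = Phrases.load(os.path.join(phrases_dir, 'bigram.phrases'))
    trigram_transformer = Phrases.load(os.path.join(phrases_dir, 'trigram.phrases'))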

In [ ]:
model.most_similar("pe", topn=20)

In [ ]:
model.most_similar("pulmonary_embolism", topn=20)

In [ ]:
model.most_similar_cosmul("pe", topn=20)

In [ ]:
model.doesnt_match("embolism fracture pulmonary lung".split())

In [ ]:
model.similarity('pe', 'pulmonary_embolism')
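
most_similar raises a KeyError for terms that never made it into the vocabulary (for example anything pruned by min_word_count), so ad-hoc queries are safer behind a guard. A hedged sketch (similar_terms and the example query are my own):

In [ ]:
# Hedged sketch: wrap queries so an out-of-vocabulary term prints a message
# instead of raising KeyError.
def similar_terms(term, topn=20):
    try:
        return model.most_similar(term, topn=topn)
    except KeyError:
        print(term, 'is not in the model vocabulary')
        return []

similar_terms('saddle_embolus')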

In [ ]:
# list every term in the model's vocabulary (a large dump for the full corpus)
for m in model.index2word:
    print(m)

print("Total terms in model:", len(model.index2word))

In [ ]: