In [ ]:
import logging
# gensim docs recommend configuring logging like this, but it fails in Jupyter
# because basicConfig is a module-level function, not a Logger method
##logger = logging.getLogger(__name__)
##logger.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# found that configuring the root logger works - though output goes to the console
# running the kernel, not the Jupyter cell output
logger = logging.getLogger()
logger.setLevel(logging.INFO)
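In [ ]:
# A possible workaround (untested sketch): attaching a StreamHandler bound to
# sys.stdout to the root logger should make log records show up in the notebook
# output area instead of only on the console running the kernel.
import sys
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s : %(levelname)s : %(message)s'))
logger.addHandler(handler)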
In [ ]:
from gensim import corpora, models, similarities
import pprint
pp = pprint.PrettyPrinter(indent=4)
In [ ]:
import sqlite3
import sys
import os
#sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'ctpa.sqlite')
sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'mimic3.sqlite')
if not os.path.exists(sqlitedb):
    print("Specified database does not exist")
    sys.exit()
connection = sqlite3.connect(sqlitedb)
with connection:
    cur = connection.cursor()
    # cur.execute('select * from reports')
    cur.execute("select text from noteevents where category = 'Radiology'")
    # col_names = [cn[0] for cn in cur.description]
    rows = cur.fetchall()
    #print(len(rows[0]))
    #print("%s %s %s %s %s %s" % (col_names[0], col_names[1], col_names[2], col_names[3], col_names[4], col_names[5]))
documents = []
for row in rows:
    # d = row[4]
    documents.append(row[0])
print('Read', len(documents), 'documents.')
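In [ ]:
# Memory-friendlier variant (sketch): fetchall() materializes every radiology note
# at once; sqlite3 cursors are iterable, so the notes can instead be streamed row
# by row. Assumes the same open connection and query as the cell above.
documents = []
with connection:
    cur = connection.cursor()
    cur.execute("select text from noteevents where category = 'Radiology'")
    for row in cur:
        documents.append(row[0])
print('Read', len(documents), 'documents.')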
In [ ]:
import nltk.data
from nltk.tokenize import word_tokenize
import re
counter = 0
training_sentences = []
# Load the punkt sentence tokenizer pre-trained on English text to improve sentence
# splitting; a custom tokenizer that understands radiology report sections would be
# better, but I think this may be good enough for now.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# really need to parallelize this - silly to only process one document at a time!
# (a multiprocessing sketch follows this cell)
for document in documents:
    # convert runs of underscores into a period so they trigger a sentence boundary with NLTK
    document = re.sub('___+', '.', document)
    counter += 1
    output = sent_tokenizer.tokenize(document)
    # NLTK word tokenizer handles punctuation better than a plain split, but I don't
    # like how "we'll" becomes two tokens, "we" and "'ll"
    output = [word_tokenize(o.lower()) for o in output]
    # alternative: the standard Python split() - much faster than the NLTK tokenizer
    #output = [o.lower().split() for o in output]
    if (counter % 10000 == 0):
        logger.info('Processed ' + str(counter) + ' documents.')
    for o in output:
        training_sentences.append(o)
#pp.pprint(training_sentences)
print('Total documents:', counter, '(should match the document count printed above)')
print('Total sentences:', len(training_sentences))
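In [ ]:
# One way to parallelize the tokenization noted above (untested sketch): tokenize each
# document in a worker process with multiprocessing.Pool. The punkt tokenizer is loaded
# once per worker via the initializer so it isn't shipped with every task. May need
# adjusting on platforms that use the 'spawn' start method, since functions defined in
# a notebook don't always pickle cleanly.
import multiprocessing
import re
import nltk.data
from nltk.tokenize import word_tokenize

def _init_worker():
    global _sent_tokenizer
    _sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def _tokenize_document(document):
    # same preprocessing as the serial loop above
    document = re.sub('___+', '.', document)
    sentences = _sent_tokenizer.tokenize(document)
    return [word_tokenize(s.lower()) for s in sentences]

with multiprocessing.Pool(processes=4, initializer=_init_worker) as pool:
    tokenized_docs = pool.map(_tokenize_document, documents)
# flatten per-document sentence lists; should match training_sentences from the serial loop
training_sentences_parallel = [s for doc in tokenized_docs for s in doc]
print('Total sentences (parallel):', len(training_sentences_parallel))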
In [ ]:
# only want 'words' that contain at least one letter or number - exclude tokens
# that are only punctuation or other symbols.
real_word = re.compile(r'.*\w+.*')
total_word_count = 0
for s in training_sentences:
    for w in s:
        if real_word.match(w):
            total_word_count += 1
print('Total words:', total_word_count)
In [ ]:
import os
from gensim.models import word2vec, Phrases
retrain = False
if retrain:
    # Set values for various parameters, starting point provided by
    # https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    size = 300            # Word vector dimensionality, previously this was "num_features"
    min_word_count = 10   # Minimum word count, default is 5
    num_workers = 4       # Number of threads to run in parallel
    context = 20          # Context window size - set large because some report sections
                          # aren't prose but are instead mostly shorthand notation
    # default sample = 1e-3  # Downsample setting for frequent words
    # from the gensim documentation:
    # Note that there is a gensim.models.phrases module which lets you automatically detect
    # phrases longer than one word. Using phrases, you can learn a word2vec model where
    # "words" are actually multiword expressions, such as new_york_times or financial_crisis:
    bigram_transformer = Phrases(training_sentences)
    trigram_transformer = Phrases(bigram_transformer[training_sentences])
    #model = Word2Vec(bigram_transformer[sentences], size=100, ...)
    model = word2vec.Word2Vec(trigram_transformer[bigram_transformer[training_sentences]],
                              workers=num_workers,
                              size=size,
                              min_count=min_word_count,
                              window=context)
    model.save(os.path.join(os.path.expanduser('~'), 'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'word2vec_full_radiology.model'))
else:
    model = word2vec.Word2Vec.load(os.path.join(os.path.expanduser('~'), 'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'word2vec_full_radiology.model'))
print('Model ready for use.')
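In [ ]:
# Optional (sketch): gensim's Phraser is a frozen, lower-memory wrapper around a trained
# Phrases model that applies the detected collocations faster. Only usable here when
# retrain is True, since bigram_transformer/trigram_transformer are built in that branch.
from gensim.models.phrases import Phraser

if retrain:
    bigram_phraser = Phraser(bigram_transformer)
    trigram_phraser = Phraser(trigram_transformer)
    # example: phrase-merge a single tokenized sentence
    print(trigram_phraser[bigram_phraser['no evidence of pulmonary embolism'.split()]])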
In [ ]:
model.most_similar("pe", topn=20)
In [ ]:
model.most_similar("pulmonary_embolism", topn=20)
In [ ]:
model.most_similar_cosmul("pe", topn=20)
In [ ]:
model.doesnt_match("embolism fracture pulmonary lung".split())
In [ ]:
model.similarity('pe', 'pulmonary_embolism')
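In [ ]:
# Note: newer gensim releases move these query methods to the KeyedVectors object at
# model.wv (the direct model.most_similar(...) style calls above were deprecated and
# later removed). Equivalent calls would look roughly like this:
model.wv.most_similar("pe", topn=20)
model.wv.similarity('pe', 'pulmonary_embolism')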
In [ ]:
count = 0
print(type(model.index2word))
for m in model.index2word:
    print(m)
    count += 1
print("Total terms in model: ", count)
In [ ]: