In [192]:
from pyspark import SparkContext
from pyspark.mllib.feature import Word2Vec
from pyspark.sql import HiveContext
# One input line per sentence; tokens are separated by single spaces.
sentences = (
    sc.textFile("practice_fusion/sentences_nlp")
    .map(lambda line: line.split(" "))
)

# 100-dimensional vectors, seed pinned to 0 (the setters return the estimator).
word2vec = Word2Vec().setVectorSize(100).setSeed(0)
model = word2vec.fit(sentences)
In [193]:
def normalize_icd9(icd9):
    """Insert the decimal point into an undotted ICD-9 code string.

    The first three characters are lower-cased and kept as the leading part;
    whatever follows is appended unchanged after a '.'.  When nothing follows
    the first three characters, only the lower-cased leading part is returned.
    """
    category, detail = icd9[:3].lower(), icd9[3:]
    if not detail:
        return category
    return '.'.join((category, detail))
def read_diag_map(path='/root/clinical2vec/CMS32_DESC_LONG_DX.txt'):
    """Load the diagnosis-code description file into a dict.

    Every line is read as fixed width: the first six characters hold the
    undotted code and the rest of the line holds its description.

    Args:
        path: Location of the description file.  Defaults to the path that
            was previously hard-coded, so existing zero-argument calls
            behave as before.

    Returns:
        dict mapping ``normalize_icd9(code)`` to the stripped description.

    Raises:
        IOError/OSError: if the file cannot be opened.
    """
    descriptions = {}
    with open(path) as f:
        # Stream the file line by line instead of loading it with readlines().
        for line in f:
            code = line[0:6].strip()
            if not code:
                # A blank line would otherwise register a meaningless '' key.
                continue
            descriptions[normalize_icd9(code)] = line[6:].strip()
    return descriptions
# Module-level code -> description table, built once; pretty_print reads it.
diag_map = read_diag_map()
def _dx_code_variants(diag):
    """Yield diag, then its zero-padded spellings, in the order they are tried."""
    yield diag
    if '.' not in diag:
        yield diag + '.0'
        yield diag + '.00'
        return
    padded = diag
    # Keep appending zeros until the code ends in '00' (at most two rounds).
    while not padded.endswith('00'):
        padded += '0'
        yield padded


def pretty_print(concept):
    """Return a human-readable label for a vocabulary token.

    Tokens of the form ``dx::<code>`` are looked up in the module-level
    ``diag_map``.  A single trailing '.' on the code is dropped, and if the
    code itself is not a key, zero-padded variants are tried (``440`` ->
    ``440.0`` -> ``440.00``; ``440.1`` -> ``440.10`` -> ...).

    Args:
        concept: Token string such as ``'dx::440.0'`` or ``'rx::...'``.

    Returns:
        ``'dx: <matched code> -- <description>'`` when a variant of the code
        is found in ``diag_map``.  Otherwise the token exactly as it was
        passed in; this covers non-``dx`` tokens, ``dx`` tokens with no code,
        and codes with no matching description.
    """
    tokens = concept.split('::')
    # Guard the malformed shapes ('dx', 'dx::') that used to raise IndexError.
    if tokens[0] != 'dx' or len(tokens) < 2:
        return concept
    diag = tokens[1]
    if diag.endswith('.'):
        diag = diag[:-1]
    if not diag:
        return concept

    for candidate in _dx_code_variants(diag):
        try:
            description = diag_map[candidate]
        except KeyError:
            continue
        return 'dx: {} -- {}'.format(candidate, description)

    # No variant matched: hand back the caller's token untouched rather than
    # one of the padded guesses.
    return concept
def print_synonyms_filt(clinical_concept, model, prefix, limit=11,
                        num_candidates=10000):
    """Print the model's nearest neighbours of a token, filtered by prefix.

    Args:
        clinical_concept: Vocabulary token to query, e.g. ``'dx::440.0'``.
        model: Object exposing ``findSynonyms(word, num)`` that returns
            ``(word, score)`` pairs.
        prefix: Only words starting with this string are printed; ``None``
            disables the filter.
        limit: Maximum number of lines printed.  The default of 11 is what
            the previously hard-coded ``i > 10`` check produced.
        num_candidates: Number of neighbours requested from the model before
            filtering.  Defaults to the previously hard-coded 10000.

    Returns:
        None.  One ``"<score>: <pretty_print(word)>"`` line is printed per
        match.
    """
    printed = 0
    # MLlib documents the second element as cosine similarity (higher means
    # closer), hence the local name.
    for word, similarity in model.findSynonyms(clinical_concept,
                                               num_candidates):
        if printed >= limit:
            break
        if prefix is not None and not word.startswith(prefix):
            continue
        # Single parenthesised argument: identical output under the Python 2
        # print statement and the Python 3 print() function.
        print("{}: {}".format(similarity, pretty_print(word)))
        printed += 1
def print_synonyms(clinical_concept, model):
    """Print the nearest neighbours of clinical_concept with no prefix filter."""
    print_synonyms_filt(clinical_concept, model, prefix=None)
In [200]:
# Nearest neighbours of diagnosis token 440.0, across all concept prefixes.
print_synonyms('dx::440.0', model)
Connections between ulcers and atherosclerosis have long been noted, partially because smokers have a higher-than-average incidence of both peptic ulcers and atherosclerosis. An editorial in the British Medical Journal discussed this as far back as the 1970s.
From an article from the Journal of Atherosclerosis in 2012:
Sensorineural hearing loss seemed to be associated with vascular endothelial dysfunction and an increased cardiovascular risk
These procedures are common among those with osteoarthritis and there has been a solid correlation between osteoarthritis and atherosclerosis in the literature.
In [194]:
# Crohn's disease: nearest neighbours of diagnosis token 555.9, unfiltered.
print_synonyms('dx::555.9', model)
From the Crohn's and Colitis Foundation of America:
Arthritis, or inflammation of the joints, is the most common extraintestinal complication of IBD. It may affect as many as 25% of people with Crohn’s disease or ulcerative colitis. Although arthritis is typically associated with advancing age, in IBD it often strikes the youngest patients.
While not much medical literature specifically links dental abscesses to Crohn's (there are general oral issues noticed here), there are lengthy discussions on the Crohn's forums about abscesses being a common occurrence with Crohn's.
Candidiasis of skin and nails is a form of yeast infection on the skin. From the journal "Critical Review of Microbiology" here.
It is widely accepted that Crohn's disease could result from an inappropriate inflammatory response to intestinal microorganisms in a genetically susceptible host. Most studies to date have concerned the involvement of bacteria in disease progression. In addition to bacteria, there appears to be a possible link between the commensal yeast Candida albicans and disease development.
In [195]:
# Neighbours of diagnosis token 042, keeping only tokens that start with 'rx'.
print_synonyms_filt('dx::042', model, 'rx')
From the list above, we see