In [1]:
import codecs
import datetime
import os
import random
from collections import Counter, defaultdict

import gensim, simserver
import pandas as pd
from bs4 import BeautifulSoup
from gensim import corpora, models, similarities, utils
# Still disabled (not used by the cells below; the nltk lines need nltk data installed):
# import nltk, codecs, string, random, math, cPickle as pickle, re, datetime, pandas as pd, os
# sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
# stopset = set(nltk.corpus.stopwords.words('english'))
In [2]:
import logging
# Configure the root logger: show records at INFO level and above, each
# prefixed with its timestamp and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [3]:
# Map each trial id (field 0 of a pipe-delimited row) to a
# (brief description, detailed description) tuple taken from fields 9 and 10.
trial_desc = {}
# The `with` block closes the file handle, and iterating the handle streams
# rows one at a time instead of materializing the whole file via readlines().
with codecs.open('../data/clinical_study.txt', 'r', 'utf-8') as study_file:
    for row in study_file:
        data = row.split('|')
        brief_desc = data[9].replace('<br />', ' ')
        # Detailed descriptions of 50 characters or fewer are replaced by ''.
        if len(data[10]) > 50:
            detail_desc = data[10].replace('<br />', ' ')
        else:
            detail_desc = ''
        trial_desc[data[0]] = brief_desc, detail_desc
In [4]:
# Build one {'id', 'tokens'} document per trial: the brief and detailed
# descriptions are joined with a space and tokenized by gensim.
corpus = []
for nct_id, text_tup in trial_desc.items():
    joined_text = ' '.join(text_tup)
    corpus.append({'id': nct_id,
                   'tokens': utils.simple_preprocess(joined_text)})
In [2]:
# Create the simserver session object used by the cells below for uploading,
# training, indexing and similarity queries.
# NOTE(review): '../data/docsim_server' is presumably the on-disk location of
# the server's data -- confirm against the simserver documentation.
server = simserver.SessionServer('../data/docsim_server')
In [ ]:
# Hand the corpus to the server via gensim's upload_chunked helper, using a
# chunksize of 100 (presumably 100 documents per call -- confirm).
utils.upload_chunked(server, corpus, chunksize=100)
In [ ]:
# Train the server's model on the corpus using the 'lsi' method.
# NOTE(review): the corpus is passed again here although it was already sent
# with upload_chunked -- this may be redundant; confirm against simserver docs.
server.train(corpus, method='lsi')
In [ ]:
# Index the corpus documents so that server.find_similar can be queried.
# NOTE(review): as with train(), the corpus is passed explicitly after the
# earlier upload -- confirm whether that is required.
server.index(corpus)
In [5]:
# Maps MeSH condition terms as they appear in the condition data to the
# spelling they should be normalized to (mostly capitalization changes and
# renamed terms).
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
               'Beta-Thalassemia': 'beta-Thalassemia',
               'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
               'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
               'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
               # Was written 'Felty''s Syndrome' (SQL-style quote escaping), which
               # Python concatenates into 'Feltys Syndrome' and so never matched.
               "Felty's Syndrome": 'Felty Syndrome',
               'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
               'Retrognathism': 'Retrognathia',
               'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
               'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
               'Von Willebrand Diseases': 'von Willebrand Diseases',
               'Pontine Glioma': 'Brain Stem Neoplasms',
               'Mental Retardation': 'Intellectual Disability',
               'Overdose': 'Drug Overdose',
               'Beta-Mannosidosis': 'beta-Mannosidosis',
               'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
               'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
               'Alpha-Thalassemia': 'alpha-Thalassemia',
               'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
               'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
               'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
               'Alpha-Mannosidosis': 'alpha-Mannosidosis',
               'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
               }
# Index the condition data in both directions.
cond = defaultdict(set)      # MeSH term -> ids of the trials tagged with it
cond_r = defaultdict(set)    # trial id -> MeSH terms assigned to it
# The `with` block closes the file handle; iterating it streams rows instead
# of loading the whole file with readlines().
with codecs.open('../data/condition_browse.txt', 'r', 'utf-8') as cond_file:
    for row in cond_file:
        # Each row must hold exactly three pipe-separated fields.
        row_id, trial_id, mesh_term = row.strip().split('|')
        # Normalize terms listed in `corrections`; others pass through unchanged.
        mesh_term = corrections.get(mesh_term, mesh_term)
        cond[mesh_term].add(trial_id)
        cond_r[trial_id].add(mesh_term)
# Load the MeSH thesaurus in both directions.
mesh_codes = {}                    # MeSH id -> term
mesh_codes_r = defaultdict(set)    # term -> set of MeSH ids carrying that term
# The `with` block closes the file handle; iterating it streams rows instead
# of loading the whole file with readlines().
with codecs.open('../data/mesh_thesaurus.txt', 'r', 'utf-8') as mesh_file:
    for row in mesh_file:
        # Each row must hold exactly three pipe-separated fields.
        row_id, mesh_id, mesh_term = row.strip().split('|')
        mesh_codes[mesh_id] = mesh_term
        mesh_codes_r[mesh_term].add(mesh_id)
# Restrict to conditions attached to at least ten trials, then gather every
# trial that carries one of those conditions.
top_cond = set(filter(lambda term: len(cond[term]) >= 10, cond))
trials = set().union(*(cond[term] for term in top_cond))
In [6]:
# File names of the new-trial XML documents. os.listdir returns entries in
# arbitrary order, so sort them to make the listing deterministic across runs
# and machines.
new_trials = sorted(os.listdir('../data/new_trial_xml/'))
In [9]:
# Accumulators filled by the XML parsing loop:
#   new_corpus: trial id -> token list of its description text
#   new_conds:  trial id -> set of MeSH terms found in the XML
new_corpus, new_conds = dict(), defaultdict(set)
In [10]:
# Parse a random sample of the new-trial XML files, recording each trial's
# tokenized description text and the MeSH terms listed under condition_browse.
# random.sample raises ValueError when asked for more items than exist, so
# cap the sample size at the number of available files.
sample_size = min(1000, len(new_trials))
for fname in random.sample(new_trials, sample_size):
    # Close each file as soon as it has been read instead of leaking handles.
    with codecs.open('../data/new_trial_xml/' + fname, 'r', 'utf-8') as xml_file:
        soup = BeautifulSoup(xml_file.read())
    study = soup.clinical_study
    nct_id = study.id_info.nct_id.contents[0].replace('\r\n','')
    # Collect the brief summary and detailed description (either may be absent).
    parts = []
    for section in (study.brief_summary, study.detailed_description):
        if section:
            section_text = section.textblock.contents[0].replace('\r\n',' ')
            if section_text:
                parts.append(section_text)
    # Join with a space so the last word of the summary cannot fuse with the
    # first word of the detailed description before tokenization.
    text_desc = ' '.join(parts)
    if study.condition_browse:
        for m in study.condition_browse.findAll('mesh_term'):
            new_conds[nct_id].add(m.contents[0])
    # Trials without any description text get no corpus entry.
    if text_desc:
        new_corpus[nct_id] = utils.simple_preprocess(text_desc)
In [11]:
# Report how many sampled trials have description text and how many have
# MeSH terms (the two counts can differ).
print(len(new_corpus))
print(len(new_conds))
In [13]:
# trial id -> {MeSH term: accumulated neighbour score}; filled by the KNN loop.
knn_preds = dict()
In [ ]:
# For each sampled trial, query the similarity server for its nearest
# neighbours and accumulate, per MeSH term, the scores of the neighbours
# tagged with that term.
cnt = 0
for cnt, (nct_id, tokens) in enumerate(new_corpus.items(), 1):
    # Trials already scored on an earlier run of this cell are not re-queried.
    if nct_id not in knn_preds:
        this_guess = defaultdict(float)
        # The third tuple element is unused; it is named explicitly because
        # binding `_` would clobber IPython's last-output variable.
        for sim_id, dist, _unused in server.find_similar({'tokens': tokens}, min_score=0.4, max_results=10):
            # .get() avoids inserting an empty set into the cond_r defaultdict
            # for neighbours that have no MeSH terms.
            for m in cond_r.get(sim_id, ()):
                # Despite its name, `dist` is summed as a score: larger totals
                # are ranked first by the evaluation cells.
                this_guess[m] += dist
        knn_preds[nct_id] = this_guess
    # Progress report every ten trials.
    if cnt % 10 == 0:
        print('%d %s' % (cnt, datetime.datetime.now().time()))
In [17]:
%matplotlib inline
knn_accuracy = {}
for trial_id in new_conds.keys():
# initialize variables for this prediction
accuracy = {'exact': 10000,
'hypernym': {}}
this_pred = knn_preds[trial_id]
val_order = {j[1]: i for i, j in enumerate(sorted(this_pred.items(), key=lambda x: x[1], reverse=True))}
# loop through known MeSH terms to look for greatest overlap
for m in new_conds[trial_id]:
if m in this_pred:
this_rank = val_order[this_pred[m]]
if this_rank < accuracy['exact']:
accuracy['exact'] = this_rank
if accuracy['exact'] == 10000: accuracy['exact'] = None
knn_accuracy[trial_id] = accuracy
c = Counter([knn_accuracy[t]['exact']+1
if knn_accuracy[t]['exact'] is not None
else 'No match'
for t in knn_accuracy.keys()])
print sum(c.values()), len(knn_accuracy)
ax = pd.DataFrame(c.values(), index=c.keys()).plot(kind='bar',
figsize=(8,8),
legend=False)
ax.set_xlabel("Highest rank of nearest neighbor prediction exact match")
ax.set_ylabel("Number of trials")
ax.set_title("Evaluating KNN predictions using manually assigned MeSH terms:\nHighest ranking match of a known term")
Out[17]:
In [30]:
# Trials for which none of the predicted terms is a known MeSH term, paired
# with the number of predicted terms whose accumulated score is at least 1.
# new_conds is read with .get() because indexing the defaultdict would insert
# an empty set for every trial without MeSH terms and change later evaluation
# runs. The loop variables are named so that the Python 2 list comprehension
# does not overwrite the Counter `c` built by the evaluation cell.
no_match = [(trial_id, sum(1 for score in preds.values() if score >= 1))
            for trial_id, preds in knn_preds.items()
            if not (set(preds) & new_conds.get(trial_id, set()))]
In [31]:
# Show the ten unmatched trials with the most high-scoring predicted terms
# (ascending sort on the negated count; ties keep their original order).
sorted(no_match, key=lambda pair: -pair[1])[:10]
Out[31]:
In [35]:
# Inspect one trial: list its predicted terms with an accumulated score of at
# least 1, strongest first.
nct_id = 'NCT02307630'
strong_terms = [(m, p) for m, p in knn_preds[nct_id].items() if p >= 1]
for m, p in sorted(strong_terms, key=lambda x: x[1], reverse=True):
    print('%s %s' % (m, p))
In [12]:
%matplotlib inline
knn_accuracy = {}
for trial_id in new_conds.keys():
# initialize variables for this prediction
accuracy = {'exact': 10000,
'hypernym': {}}
this_pred = knn_preds[trial_id]
val_order = {j[1]: i for i, j in enumerate(sorted(this_pred.items(), key=lambda x: x[1], reverse=True))}
# loop through known MeSH terms to look for greatest overlap
for m in new_conds[trial_id]:
if m in this_pred:
this_rank = val_order[this_pred[m]]
if this_rank < accuracy['exact']:
accuracy['exact'] = this_rank
if accuracy['exact'] == 10000: accuracy['exact'] = None
knn_accuracy[trial_id] = accuracy
c = Counter([knn_accuracy[t]['exact']+1
if knn_accuracy[t]['exact'] is not None
else 'No match'
for t in knn_accuracy.keys()])
print sum(c.values()), len(knn_accuracy)
ax = pd.DataFrame(c.values(), index=c.keys()).plot(kind='bar',
figsize=(8,8),
legend=False)
ax.set_xlabel("Highest rank of nearest neighbor prediction exact match")
ax.set_ylabel("Number of trials")
ax.set_title("Evaluating KNN predictions using manually assigned MeSH terms:\nHighest ranking match of a known term")
Out[12]:
In [ ]:
In [ ]:
In [ ]: