In [1]:
import codecs
import datetime
import os
import random
from collections import Counter, defaultdict

import gensim, simserver
import pandas as pd
from bs4 import BeautifulSoup
from gensim import corpora, models, similarities, utils

# Not used by the cells below; left disabled as in the original notebook:
# import nltk, string, math, cPickle as pickle, re

# sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
# stopset = set(nltk.corpus.stopwords.words('english'))

In [2]:
import logging

# Emit log records at INFO level and above, prefixed with timestamp and level.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

Load existing trial descriptions


In [3]:
# Map each trial id (field 0 of a pipe-delimited row) to a
# (brief description, detailed description) tuple taken from fields 9 and 10.
trial_desc = {}
# `with` closes the file handle once the rows are consumed; the original
# left it open for the lifetime of the kernel.
with codecs.open('../data/clinical_study.txt', 'r', 'utf-8') as study_file:
    for row in study_file.readlines():
        data = row.split('|')
        brief_desc = data[9].replace('<br />', ' ')
        # A detailed description of 50 characters or fewer is stored as ''.
        detail_desc = data[10].replace('<br />', ' ') if len(data[10]) > 50 else ''
        trial_desc[data[0]] = brief_desc, detail_desc

In [4]:
# Build the document list for the similarity server: one dict per trial,
# holding its id and the tokens of its two descriptions joined by a space.
corpus = []
for nct_id, text_tup in trial_desc.items():
    tokens = utils.simple_preprocess(' '.join(text_tup))
    corpus.append({'id': nct_id, 'tokens': tokens})

Set up server (or reload server)


In [2]:
# Path handed to SessionServer (presumably where it keeps its data; see the
# simserver documentation).
SERVER_DIR = '../data/docsim_server'
server = simserver.SessionServer(SERVER_DIR)

Upload data, train model, and index documents


In [ ]:
# Send the corpus to the server in chunks rather than in a single call.
UPLOAD_CHUNK_SIZE = 100  # documents per chunk
utils.upload_chunked(server, corpus, chunksize=UPLOAD_CHUNK_SIZE)

In [ ]:
# Train the server's model on the corpus using the 'lsi' method.
server.train(corpus, method='lsi')

In [ ]:
# Index the corpus documents on the server. The find_similar() queries later
# in this notebook are presumably answered from this index (confirm against
# the simserver documentation).
server.index(corpus)

Test model


In [5]:
# Replacement spellings for MeSH terms: a term found as a key here is swapped
# for the mapped value when the condition file is loaded.
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
               'Beta-Thalassemia': 'beta-Thalassemia',
               'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
               'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
               'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
               # Was 'Felty''s Syndrome': adjacent string literals are
               # concatenated, which produced the key 'Feltys Syndrome' and so
               # never matched the real term.
               "Felty's Syndrome": 'Felty Syndrome',
               'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
               'Retrognathism': 'Retrognathia',
               'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
               'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
               'Von Willebrand Diseases': 'von Willebrand Diseases',
               'Pontine Glioma': 'Brain Stem Neoplasms',
               'Mental Retardation': 'Intellectual Disability',
               'Overdose': 'Drug Overdose',
               'Beta-Mannosidosis': 'beta-Mannosidosis',
               'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
               'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
               'Alpha-Thalassemia': 'alpha-Thalassemia',
               'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
               'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
               'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
               'Alpha-Mannosidosis': 'alpha-Mannosidosis',
               'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
               }

# cond:   MeSH term -> set of trial ids listing that term
# cond_r: trial id  -> set of MeSH terms listed for that trial
cond = defaultdict(set)
cond_r = defaultdict(set)
# `with` closes the file handle after loading; the original never closed it.
with codecs.open('../data/condition_browse.txt', 'r', 'utf-8') as cond_file:
    for row in cond_file.readlines():
        row_id, trial_id, mesh_term = row.strip().split('|')
        # Swap in the corrected spelling when one is registered.
        mesh_term = corrections.get(mesh_term, mesh_term)
        cond[mesh_term].add(trial_id)
        cond_r[trial_id].add(mesh_term)

# mesh_codes:   MeSH id   -> MeSH term
# mesh_codes_r: MeSH term -> set of MeSH ids carrying that term
mesh_codes = {}
mesh_codes_r = defaultdict(set)
# `with` closes the file handle after loading; the original never closed it.
with codecs.open('../data/mesh_thesaurus.txt', 'r', 'utf-8') as mesh_file:
    for row in mesh_file.readlines():
        row_id, mesh_id, mesh_term = row.strip().split('|')
        mesh_codes[mesh_id] = mesh_term
        mesh_codes_r[mesh_term].add(mesh_id)

# Keep only the conditions listed by ten or more trials, then collect every
# trial that lists at least one of those conditions.
top_cond = set(term for term, trial_ids in cond.items() if len(trial_ids) >= 10)
trials = set().union(*(cond[term] for term in top_cond))

In [6]:
# File names of the new-trial XML documents available for evaluation.
NEW_TRIAL_DIR = '../data/new_trial_xml/'
new_trials = os.listdir(NEW_TRIAL_DIR)

In [9]:
# Accumulators filled while the new-trial XML files are parsed.
new_corpus = dict()            # trial id -> preprocessed description tokens
new_conds = defaultdict(set)   # trial id -> set of MeSH terms

In [10]:
# Parse a random sample of the new-trial XML files, recording each trial's
# description tokens in new_corpus and its MeSH terms in new_conds.
# The sample is unseeded and results are added to the existing dicts, so
# re-running this cell extends them with a further sample.
# random.sample raises ValueError when asked for more items than exist.
sample_size = min(1000, len(new_trials))
for fname in random.sample(new_trials, sample_size):
    # `with` closes each file as soon as it is read; the original leaked one
    # open handle per sampled file.
    with codecs.open('../data/new_trial_xml/' + fname, 'r', 'utf-8') as xml_file:
        soup = BeautifulSoup(xml_file.read())
    study = soup.clinical_study
    nct_id = study.id_info.nct_id.contents[0].replace('\r\n','')

    desc_parts = []
    if study.brief_summary:
        desc_parts.append(study.brief_summary.textblock.contents[0].replace('\r\n',' '))
    if study.detailed_description:
        desc_parts.append(study.detailed_description.textblock.contents[0].replace('\r\n',' '))

    if study.condition_browse:
        for m in study.condition_browse.findAll('mesh_term'):
            new_conds[nct_id].add(m.contents[0])

    # Join with a space so the last word of the summary can never fuse with
    # the first word of the detailed description; empty parts are dropped so
    # a trial without any text is still skipped.
    text_desc = ' '.join(part for part in desc_parts if part)
    if text_desc:
        new_corpus[nct_id] = utils.simple_preprocess(text_desc)

In [11]:
# Sizes of the two collections: trials with description tokens, then trials
# with recorded MeSH terms.
print(len(new_corpus))
print(len(new_conds))


997
549

In [13]:
# trial id -> {MeSH term: summed similarity score of the neighbours listing it}
knn_preds = dict()

In [ ]:
# For every new trial not yet scored, ask the similarity server for its
# nearest indexed trials and add each neighbour's similarity score to every
# MeSH term attached to that neighbour. Trials already in knn_preds are
# skipped, so re-running the cell only scores the remaining ones.
cnt = 0  # keeps `cnt` defined even when new_corpus is empty
for cnt, (nct_id, tokens) in enumerate(new_corpus.items(), 1):
    if nct_id not in knn_preds:
        this_guess = defaultdict(float)
        # The third tuple element is unused. It is not named `_`, to avoid
        # shadowing IPython's last-output variable.
        for sim_id, score, _unused in server.find_similar({'tokens': tokens}, min_score=0.4, max_results=10):
            # .get() instead of cond_r[sim_id]: indexing the defaultdict
            # would insert an empty set for every neighbour without terms.
            for m in cond_r.get(sim_id, ()):
                this_guess[m] += score

        knn_preds[nct_id] = this_guess

    # Progress line every ten trials: running count and current time.
    if cnt % 10 == 0: print('%s %s' % (cnt, datetime.datetime.now().time()))

In [17]:
%matplotlib inline
knn_accuracy = {}
for trial_id in new_conds.keys():
    # initialize variables for this prediction
    accuracy = {'exact': 10000,
                'hypernym': {}}
    this_pred = knn_preds[trial_id]
    val_order = {j[1]: i for i, j in enumerate(sorted(this_pred.items(), key=lambda x: x[1], reverse=True))}

    # loop through known MeSH terms to look for greatest overlap
    for m in new_conds[trial_id]:
        if m in this_pred:
            this_rank = val_order[this_pred[m]]
            if this_rank < accuracy['exact']:
                accuracy['exact'] = this_rank

    if accuracy['exact'] == 10000: accuracy['exact'] = None
    
    knn_accuracy[trial_id] = accuracy

c = Counter([knn_accuracy[t]['exact']+1 
             if knn_accuracy[t]['exact'] is not None
             else 'No match'
             for t in knn_accuracy.keys()])
print sum(c.values()), len(knn_accuracy)

ax = pd.DataFrame(c.values(), index=c.keys()).plot(kind='bar',
                                                   figsize=(8,8),
                                                   legend=False)

ax.set_xlabel("Highest rank of nearest neighbor prediction exact match")
ax.set_ylabel("Number of trials")
ax.set_title("Evaluating KNN predictions using manually assigned MeSH terms:\nHighest ranking match of a known term")


549 549
Out[17]:
<matplotlib.text.Text at 0x127bf5ed0>

In [30]:
# Trials whose predicted terms share nothing with their known MeSH terms,
# each paired with the number of predicted terms scoring >= 1. Trials with no
# known terms at all are included too, since their overlap is trivially empty.
no_match = []
for k, d in knn_preds.items():
    # .get() instead of new_conds[k]: indexing the defaultdict would add an
    # empty entry for every trial without known terms, which would then be
    # counted by any later loop over new_conds.
    known_terms = new_conds.get(k, set())
    if not (set(d.keys()) & known_terms):
        no_match.append((k, sum(1 for score in d.values() if score >= 1)))

In [31]:
# Show the ten no-match trials with the largest counts.
no_match_ranked = sorted(no_match, key=lambda pair: pair[1], reverse=True)
no_match_ranked[:10]


Out[31]:
[(u'NCT02277639', 25),
 (u'NCT02312284', 17),
 (u'NCT02376387', 14),
 (u'NCT02350777', 12),
 (u'NCT02317406', 11),
 (u'NCT02360345', 10),
 (u'NCT02315612', 10),
 (u'NCT02307630', 10),
 (u'NCT02276911', 9),
 (u'NCT02274584', 9)]

In [35]:
# List one trial's predicted terms with a summed score of at least 1,
# highest score first.
nct_id = 'NCT02307630'
ranked_terms = sorted(knn_preds[nct_id].items(), key=lambda x: x[1], reverse=True)
for m, p in ranked_terms:
    if p < 1:
        continue
    print('%s %s' % (m, p))


Central Nervous System Neoplasms 2.13993692398
Neuroblastoma 2.13993692398
Sarcoma 2.13993692398
Neoplasms 1.3122420311
Neuroectodermal Tumors, Primitive, Peripheral 1.3062120676
Nervous System Neoplasms 1.3062120676
Meningeal Neoplasms 1.3062120676
Desmoplastic Small Round Cell Tumor 1.29230755568
Melanoma 1.2418192625
Prostatic Neoplasms 1.19214987755

In [12]:
%matplotlib inline
knn_accuracy = {}
for trial_id in new_conds.keys():
    # initialize variables for this prediction
    accuracy = {'exact': 10000,
                'hypernym': {}}
    this_pred = knn_preds[trial_id]
    val_order = {j[1]: i for i, j in enumerate(sorted(this_pred.items(), key=lambda x: x[1], reverse=True))}

    # loop through known MeSH terms to look for greatest overlap
    for m in new_conds[trial_id]:
        if m in this_pred:
            this_rank = val_order[this_pred[m]]
            if this_rank < accuracy['exact']:
                accuracy['exact'] = this_rank

    if accuracy['exact'] == 10000: accuracy['exact'] = None
    
    knn_accuracy[trial_id] = accuracy

c = Counter([knn_accuracy[t]['exact']+1 
             if knn_accuracy[t]['exact'] is not None
             else 'No match'
             for t in knn_accuracy.keys()])
print sum(c.values()), len(knn_accuracy)

ax = pd.DataFrame(c.values(), index=c.keys()).plot(kind='bar',
                                                   figsize=(8,8),
                                                   legend=False)

ax.set_xlabel("Highest rank of nearest neighbor prediction exact match")
ax.set_ylabel("Number of trials")
ax.set_title("Evaluating KNN predictions using manually assigned MeSH terms:\nHighest ranking match of a known term")


5915 5915
Out[12]:
<matplotlib.text.Text at 0x1b3593050>

In [ ]:


In [ ]:


In [ ]: