In [1]:
import codecs, string, random, math, cPickle as pickle, re, pandas as pd, seaborn
from matplotlib.ticker import FuncFormatter
from collections import Counter
from IPython.display import HTML, Javascript, display

from __future__ import division

%matplotlib inline

Load data


In [2]:
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
               'Beta-Thalassemia': 'beta-Thalassemia',
               'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
               'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
               'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
               'Felty''s Syndrome': 'Felty Syndrome',
               'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
               'Retrognathism': 'Retrognathia',
               'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
               'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
               'Von Willebrand Diseases': 'von Willebrand Diseases',
               'Pontine Glioma': 'Brain Stem Neoplasms',
               'Mental Retardation': 'Intellectual Disability',
               'Overdose': 'Drug Overdose',
               'Beta-Mannosidosis': 'beta-Mannosidosis',
               'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
               'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
               'Alpha-Thalassemia': 'alpha-Thalassemia',
               'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
               'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
               'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
               'Alpha-Mannosidosis': 'alpha-Mannosidosis',
               'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
               }
cond = {}
cond_r = {}
for row in codecs.open('../data/condition_browse.txt','r','utf-8').readlines():
    row_id, trial_id, mesh_term = row.strip().split('|')
    if mesh_term in corrections: mesh_term = corrections[mesh_term]
    if mesh_term not in cond: cond[mesh_term] = []
    cond[mesh_term].append(trial_id)
    if trial_id not in cond_r: cond_r[trial_id] = []
    cond_r[trial_id].append(mesh_term)

mesh_codes = {}
mesh_codes_r = {}
for row in codecs.open('../data/mesh_thesaurus.txt','r','utf-8').readlines():
    row_id, mesh_id, mesh_term = row.strip().split('|')
    mesh_codes[mesh_id] = mesh_term
    if mesh_term not in mesh_codes_r: mesh_codes_r[mesh_term] = []
    mesh_codes_r[mesh_term].append(mesh_id)

# limiting to conditions that appear in ten or more trials
top_cond = {c for c in cond if len(cond[c]) >= 10}
trials = {t for c in top_cond for t in cond[c]}

# trial descriptions
trial_desc = pickle.load(open('../data/trial_desc.pkl','rb'))

# holdout sample text counters used in classification
# classify_text = pickle.load(open('../data/classify_text.pkl','rb'))

Load model predictions


In [3]:
multi_preds = pickle.load(open('../data/mesh_guesses_holdout.pkl','rb'))
single_preds = pickle.load(open('../data/mesh_guesses_maxent_holdout.pkl','rb'))
knn_preds = pickle.load(open('../data/mesh_guesses_knn_holdout.pkl','rb'))

In [4]:
print len(multi_preds)
print len(single_preds)
print len(knn_preds)


38075
38075
37527

Evaluating single-prediction model

What is the maximum overlap between the predicted term and the manually assigned terms?

Overlap is defined by the level of the least common hypernym in the MeSH hierarchy. A value of zero indicates that there is no overlap at all between the predicted and manually assigned terms.


In [4]:
single_accuracy = {}
for trial_id in list(set(single_preds.keys()) & set(cond_r.keys()) & trials):
    # initialize variables for this prediction
    accuracy = 0
    this_pred = single_preds[trial_id]
    if this_pred in mesh_codes_r:
        mesh_pred = mesh_codes_r[single_preds[trial_id]]

        # loop through known MeSH terms to look for greatest overlap
        for m in cond_r[trial_id]:
            if m in mesh_codes_r:
                for a in mesh_codes_r[m]:
                    len_a = len(a)
                    for i in range(len_a,accuracy,-4):
                        for pa in mesh_pred:
                            if pa[:i] == a[:i] and i > accuracy: accuracy = i
    
    single_accuracy[trial_id] = int((accuracy + 1) / 4) if accuracy > 0 else 'No match'

In [5]:
c = Counter(single_accuracy.values())
print sum(c.values())
ax = pd.DataFrame(c.values(), index=c.keys()).plot(kind='bar',
                                                   legend=False, 
                                                   figsize=(8,8))
ax.set_xlabel("Hierarchy level")
ax.set_ylabel("Number of trials")
ax.set_title("Evaluating MaxEnt classification using manually assigned MeSH terms:\nMaximum depth of a matching term")


13965
Out[5]:
<matplotlib.text.Text at 0x13c6006d0>

Evaluating multiple prediction models

How well do the (accurate) level-2 hypernym predictions rank?

What is the highest (i.e., lowest number) rank of a manually assigned term's level-2 hypernym?


In [25]:
mult_accuracy = {}
for trial_id in list(set(multi_preds.keys()) & set(cond_r.keys()) & trials):
    # initialize variables for this prediction
    top_rank = 10000
    this_pred = {d[0]: i for i, d in list(enumerate(sorted(multi_preds[trial_id].items(),
                                                           key=lambda x: x[1],
                                                           reverse=True)))
                }

    # loop through known MeSH terms to look for greatest overlap
    for m in cond_r[trial_id]:
        if m in mesh_codes_r:
            for a in mesh_codes_r[m]:
                if a[:7] in this_pred and this_pred[a[:7]] < top_rank: top_rank = this_pred[a[:7]]
    
    if top_rank < 10000:
        mult_accuracy[trial_id] = top_rank

In [26]:
low_rank = max(mult_accuracy.values())
c = Counter(mult_accuracy.values())
print sum(c.values())

ticks = [k for k in range(low_rank) if k%10 == 0]

ax = pd.DataFrame([c[i] for i in range(low_rank+1)], 
                  index=range(low_rank+1)).plot(kind='bar',
                                                legend=False, 
                                                xticks=ticks,
                                                figsize=(14,8))
ax.set_xticklabels(ticks)
ax.set_xlabel("Highest rank of matching hypernym")
ax.set_ylabel("Number of trials")
ax.set_title("Evaluating hypernym classifiers using manually assigned MeSH terms:\nHighest ranking hypernym of a known term")


13745
Out[26]:
<matplotlib.text.Text at 0x13f3a0fd0>

In [10]:
total = sum(c.values())
ticks = [i/10 for i in range(1,11)]
ax = pd.DataFrame([sum([c[j] for j in c if j <= i]) / total for i in range(low_rank+1)], 
                  index=range(low_rank+1)).plot(legend=False,
                                                xlim=(0,200),
                                                yticks=ticks,
                                                figsize=(8,8))
ax.set_xlabel("Highest ranking matching hypernym")
ax.set_ylabel("Share of trials")
ax.set_yticklabels(['%d%%' % (r*100) for r in ticks])
ax.set_title("Cumulative distribution of top hypernym predictions:\nWhat share of trials have a hypernym at this rank or above?")


Out[10]:
<matplotlib.text.Text at 0x15d975fd0>

Evaluating KNN model

Does an exact match exist among the neighbors? At what rank?

If not, does a hypernym match exist? At what level and rank?


In [6]:
knn_accuracy = {}
for trial_id in list(set(knn_preds.keys()) & set(cond_r.keys()) & trials):
    # initialize variables for this prediction
    accuracy = {'exact': 10000,
                'hypernym': {}}
    this_pred = dict([(k, sum([1 / (10 ** d) for d in v]))
                        for k, v in knn_preds[trial_id].items()
                        ])
    val_order = {j: i for i, j in enumerate(sorted(list(set(this_pred.values())), reverse=True))}

    hyp_pred = {m: val_order[v]
                for p, v in this_pred.items() 
                if p in mesh_codes_r 
                for m in mesh_codes_r[p]}
    
    # loop through known MeSH terms to look for greatest overlap
    for m in cond_r[trial_id]:
        if m in this_pred:
            this_rank = val_order[this_pred[m]]
            if this_rank < accuracy['exact']:
                accuracy['exact'] = this_rank
        elif m in mesh_codes_r:
            for a in mesh_codes_r[m]:
                len_a = len(a)
                for i in range(len_a,0,-4):
                    for pa in hyp_pred:
                        if pa[:i] == a[:i]:
                            if i not in accuracy['hypernym'] or hyp_pred[pa] < accuracy['hypernym'][i]:
                                accuracy['hypernym'][i] = hyp_pred[pa]

    if accuracy['exact'] == 10000: accuracy['exact'] = None
    
    knn_accuracy[trial_id] = accuracy

In [9]:
c = Counter([knn_accuracy[t]['exact']+1 
             if knn_accuracy[t]['exact'] is not None
             else 'No match'
             for t in knn_accuracy.keys()])
print sum(c.values()), len(knn_accuracy)

ax = pd.DataFrame(c.values(), index=c.keys()).plot(kind='bar',
                                                   figsize=(12,8),
                                                   legend=False)

ax.set_xlabel("Highest rank of nearest neighbor prediction exact match")
ax.set_ylabel("Number of trials")
ax.set_title("Evaluating KNN predictions using manually assigned MeSH terms:\nHighest ranking match of a known term")


13959 13959
Out[9]:
<matplotlib.text.Text at 0x14f034c50>

Testing random instances


In [4]:
holdout_sample = list(set(knn_preds.keys()) & set(cond_r.keys()) & trials)
rare_mesh = list(set(knn_preds.keys()) & set(cond_r.keys()) - trials)
unknown_mesh = list(set(knn_preds.keys()) - set(cond_r.keys()))

In [14]:
def mesh_url(this_term, this_code):
    this_term = this_term.replace(' ','+')
    url = 'http://www.nlm.nih.gov/cgi/mesh/2015/MB_cgi?mode=&term=%s&field=entry#Tree%s' % (this_term, this_code)
    return '<a target="_blank" href=%s>%s</a>' % (url, this_code)

def html_output(trial_id):
    
    if trial_id in cond_r:
        assigned = cond_r[trial_id]
        assigned_codes = set()
        for m in assigned:
            if m in mesh_codes_r: assigned_codes |= set(mesh_codes_r[m])
        assigned_hyp = set([c[:7] for c in assigned_codes if len(c) > 3])
    else:
        assigned = []
    
    if trial_id in single_preds:
        top_pred = single_preds[trial_id]
    else:
        top_pred = ''
        
    if trial_id in knn_preds:
        knn_pred = sorted([(k, sum([1 / (10 ** d) for d in v])) 
                           for k, v in knn_preds[trial_id].items()
                           ], key=lambda x: x[1], reverse=True)
        knn_ranks = {}
        rank = 1
        for k,v in knn_pred:
            if len(knn_ranks) == 0: cur_val = v
            if v == cur_val:
                knn_ranks[k] = rank
            else:
                rank += 1
                cur_val = v
                knn_ranks[k] = rank
        rank_count = Counter(knn_ranks.values())
    else:
        knn_pred = []
    
    if trial_id in multi_preds:
        hyp_pred = sorted(multi_preds[trial_id].items(), 
                          key=lambda x: x[1], 
                          reverse=True)
        top_hyps = [c for c, v in hyp_pred[:20]]
    else:
        top_hyps = []
    
    def print_codes(cur_term, paint=False):
        code_html = ''
        if cur_term in mesh_codes_r:
            cur_codes = mesh_codes_r[cur_term][:]
            cur_codes.sort()
            for code in cur_codes:
                painting = ''
                if paint and code[:7] in top_hyps:
                    painting = '&nbsp;<span style="background-color: #FF69BA; font-size: small;">(hypernym in top 20)</span>'
                code_html += '<p style="padding-left: 30px; margin-top: 0px">%s%s</p>' % (mesh_url(cur_term,code),
                                                                                          painting)
        return code_html
    
    page = "<p><strong>Trial ID:</strong> %s</p>\n" % trial_id
    page += "<h3>Brief description</h3><p>%s</p>\n" % trial_desc[trial_id][0]
    page += "<h3>Detailed description</h3><p>%s</p>\n" % (trial_desc[trial_id][1] 
                                                          if trial_desc[trial_id][1]
                                                          else "No detailed description.")
    page += "<h2>Current MeSH assignment (if any)</h2>"
    if assigned:
        for t in assigned:
            additional = ''
            if t == top_pred:
                additional += '''&nbsp;<span style="background-color: greenyellow; 
                                               font-weight: normal; 
                                               font-size: small;
                                               font-style: normal">(same as top single prediction)</span>'''
            if t in knn_ranks and knn_ranks[t] == 1:
                additional += '''&nbsp;<span style="background-color: lightsalmon; 
                                               font-weight: normal; 
                                               font-size: small;
                                               font-style: normal">(top-ranked KNN prediction)</span>'''
            elif t in knn_ranks:
                additional += '''&nbsp;<span style="background-color: #FFD9CA; 
                                               font-weight: normal; 
                                               font-size: small;
                                               font-style: normal">(KNN prediction)</span>'''                
            page += '<h5 style="margin-top: 8px;">%s%s</h5>' % (t, 
                                                                additional)
            page += print_codes(t, paint=True)
    else:
        page += '<p>No MeSH assignment.</p>'
    
    page += '<h2>Top single prediction</h2>'
    top_pred_style = ''
    if top_pred in assigned:
        top_pred_style = ' style="background-color: greenyellow;"'
    page += '<h5 style="margin-top: 8px;"><span%s>%s</span></h5>' % (top_pred_style, 
                                                                     top_pred)
    page += print_codes(top_pred)
    
    page += '<h2>K Nearest Neighbor predictions</h2>'
    for this_term, cur_val in knn_pred:
        knn_style = ''
        if this_term in assigned:
            knn_style = ' style="background-color: %s;"' % ('lightsalmon' if knn_ranks[this_term] == 1 else '#FFD9CA')
        knn_tie = ''
        if rank_count[knn_ranks[this_term]] > 1:
            knn_tie = ' (tie)'
        page += '<h5 style="margin-top: 8px;">%d%s. <span%s>%s</span> </h5>' % (knn_ranks[this_term], 
                                                                                knn_tie, 
                                                                                knn_style, 
                                                                                this_term)
        page += print_codes(this_term)
    
    page += '<h2>Top 20 MeSH hypernym predictions</h2>'
    hyp_rank = 1
    for code in top_hyps:
        hyp_style = ''
        if code in assigned_hyp:
            hyp_style = ' style="background-color: #FF69BA;"'
        page += '<p style="margin-top: 0px">%d. <span style="font-weight: bold;">%s</span>' % (hyp_rank,
                                                                                               mesh_url(mesh_codes[code],code))
        page += '&nbsp;&nbsp;<span%s>%s</span></p>' % (hyp_style,
                                                      mesh_codes[code])
        hyp_rank += 1
    
    return page

In [36]:
display(HTML(html_output(random.choice(holdout_sample))))


Trial ID: NCT01674868

Brief description

This pilot study will recruit 16-25 subjects to assess the feasibility of replicating the FLAME study (Chollet, et al. Lancet 2011), an randomized controlled trial (RCT) that assessed the effect of fluoxetine vs placebo on motor recovery after ischemic stroke, in an American sample of post-acute stroke patients. This trial will in addition examine the effect of treatment with fluoxetine versus placebo on concurrent deficits in language and hemispatial attention, and will evaluate the durability of observed effects in all 3 domains. The results of this pilot trial will be used to develop power estimates for a larger trial and to evaluate recruitment and intervention completion rates for subjects in an American post-acute environment.

Detailed description

No detailed description.

Current MeSH assignment (if any)

Stroke (top-ranked KNN prediction)

C10.228.140.300.775 (hypernym in top 20)

C14.907.253.855

Cerebral Infarction (same as top single prediction) (top-ranked KNN prediction)

C10.228.140.300.150.477.200 (hypernym in top 20)

C10.228.140.300.775.200.200 (hypernym in top 20)

C14.907.253.092.477.200

C14.907.253.855.200.200

Top single prediction

Cerebral Infarction

C10.228.140.300.150.477.200

C10.228.140.300.775.200.200

C14.907.253.092.477.200

C14.907.253.855.200.200

K Nearest Neighbor predictions

1 (tie). Cerebral Infarction

C10.228.140.300.150.477.200

C10.228.140.300.775.200.200

C14.907.253.092.477.200

C14.907.253.855.200.200

1 (tie). Stroke

C10.228.140.300.775

C14.907.253.855

2 (tie). Depressive Disorder

F03.600.300

2 (tie). Depression

F01.145.126.350

3. Alcoholism

C25.775.100.250

F03.900.100.350

4. Infarction

C23.550.513.355

C23.550.717.489

5. Autistic Disorder

F03.550.325.125

6 (tie). Cerebral Hemorrhage

C10.228.140.300.535.200

C14.907.253.573.200

C23.550.414.913.100

6 (tie). Hemorrhage

C23.550.414

7. Perceptual Disorders

C10.597.606.762

C23.888.592.604.764

F01.700.750

8 (tie). Mental Disorders

F03

8 (tie). Psychotic Disorders

F03.700.675

9 (tie). Obsessive-Compulsive Disorder

F03.080.600

9 (tie). Compulsive Personality Disorder

F03.675.150

Top 20 MeSH hypernym predictions

1. C05.116  Bone Diseases

2. C23.300  Pathological Conditions, Anatomical

3. C23.888  Signs and Symptoms

4. C16.131  Congenital Abnormalities

5. C16.614  Infant, Newborn, Diseases

6. C01.539  Infection

7. C18.654  Nutrition Disorders

8. C10.597  Neurologic Manifestations

9. C23.550  Pathologic Processes

10. C14.240  Cardiovascular Abnormalities

11. C26.404  Fractures, Bone

12. C05.660  Musculoskeletal Abnormalities

13. G07.100  Body Constitution

14. C10.228  Central Nervous System Diseases

15. E01.370  Diagnostic Techniques and Procedures

16. C13.703  Pregnancy Complications

17. F01.700  Neurobehavioral Manifestations

18. C16.320  Genetic Diseases, Inborn

19. C10.900  Trauma, Nervous System

20. C10.500  Nervous System Malformations


In [ ]: