Load modules


In [1]:
# the future import must come first so that / is true division under Python 2
from __future__ import division

# basic NLP
import nltk, codecs, string, random, math, cPickle as pickle, re, datetime
from collections import Counter

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stopset = set(nltk.corpus.stopwords.words('english'))

Load data


In [2]:
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
               'Beta-Thalassemia': 'beta-Thalassemia',
               'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
               'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
               'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
               "Felty's Syndrome": 'Felty Syndrome',
               'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
               'Retrognathism': 'Retrognathia',
               'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
               'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
               'Von Willebrand Diseases': 'von Willebrand Diseases',
               'Pontine Glioma': 'Brain Stem Neoplasms',
               'Mental Retardation': 'Intellectual Disability',
               'Overdose': 'Drug Overdose',
               'Beta-Mannosidosis': 'beta-Mannosidosis',
               'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
               'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
               'Alpha-Thalassemia': 'alpha-Thalassemia',
               'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
               'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
               'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
               'Alpha-Mannosidosis': 'alpha-Mannosidosis',
               'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
               }
cond = {}
cond_r = {}
for row in codecs.open('../data/condition_browse.txt','r','utf-8').readlines():
    row_id, trial_id, mesh_term = row.strip().split('|')
    if mesh_term in corrections: mesh_term = corrections[mesh_term]
    if mesh_term not in cond: cond[mesh_term] = []
    cond[mesh_term].append(trial_id)
    if trial_id not in cond_r: cond_r[trial_id] = []
    cond_r[trial_id].append(mesh_term)

mesh_codes = {}
mesh_codes_r = {}
for row in codecs.open('../data/mesh_thesaurus.txt','r','utf-8').readlines():
    row_id, mesh_id, mesh_term = row.strip().split('|')
    mesh_codes[mesh_id] = mesh_term
    if mesh_term not in mesh_codes_r: mesh_codes_r[mesh_term] = []
    mesh_codes_r[mesh_term].append(mesh_id)

# limiting to conditions that appear in ten or more trials
top_cond = {c for c in cond if len(cond[c]) >= 10}
trials = {t for c in top_cond for t in cond[c]}
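
A quick look at the filtered sets (a sanity-check sketch; the counts depend on the data files):


In [ ]:
print 'Conditions with >= 10 trials: %d' % len(top_cond)
print 'Trials covered: %d' % len(trials)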

In [ ]:
trial_desc = {}
for row in codecs.open('../data/clinical_study.txt','r','utf-8').readlines():
    data = row.split('|')
    # column 9 holds the brief description, column 10 the detailed one;
    # very short detailed descriptions are treated as empty
    brief_desc, detail_desc = (data[9],
                               data[10] if len(data[10]) > 50 else '')
    trial_desc[data[0]] = brief_desc, detail_desc

to_classify = [t for t in trial_desc if t not in trials]

In [ ]:
pickle.dump(trial_desc,open('../data/trial_desc.pkl','wb'))

In [3]:
trial_desc = pickle.load(open('../data/trial_desc.pkl','rb'))
to_classify = [t for t in trial_desc if t not in trials]

Analyze data


In [ ]:
print 'Total MeSH terms: %d' % len(cond)
print 'Total MeSH terms (level 1): %d' % len(set(mr[:3] for c in cond if c in mesh_codes_r for mr in mesh_codes_r[c]))
print 'Total MeSH terms (level 2): %d' % len(set(mr[:7] for c in cond if c in mesh_codes_r for mr in mesh_codes_r[c]))

Create a trial lookup for MeSH term hypernyms at the second level of the hierarchy


In [3]:
cond_l2 = {}
for m in cond.keys():
    if m in mesh_codes_r:
        # truncate each MeSH tree number to its first seven characters (the level-2 code)
        m_l2 = set([mr[:7] for mr in mesh_codes_r[m]])
        for l2 in m_l2:
            if l2 not in cond_l2: cond_l2[l2] = set()
            cond_l2[l2] |= set(cond[m])
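
A quick sanity check of the level-2 lookup (a sketch; the sample code shown is arbitrary):


In [ ]:
sample_l2 = cond_l2.keys()[0]
print 'Level-2 codes with trials: %d' % len(cond_l2)
print 'Example: %s (%s), %d trials' % (sample_l2, mesh_codes.get(sample_l2, '?'), len(cond_l2[sample_l2]))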

Process text


In [34]:
def process_text(text):
    # lowercase every token, drop stopwords, and drop tokens that are all punctuation
    return [word.lower()
            for sent in sent_tokenizer.tokenize(text)
            for word in nltk.word_tokenize(sent)
            if word.lower() not in stopset and
            sum(1 for char in word if char not in string.punctuation) > 0]
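
A quick check of the tokenizer on an illustrative sentence (not from the corpus):


In [ ]:
print process_text('A Phase II study of an experimental drug in patients with metastatic melanoma.')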

In [20]:
cond_text = {l2: Counter([word
                          for trial_id in cond_l2[l2]
                          for desc in trial_desc[trial_id]
                          if len(desc) > 0
                          for word in process_text(desc)])
             for l2 in cond_l2.keys()}

In [21]:
# merge all level-2 counters into a single corpus-wide counter
total_text = sum(cond_text.values(), Counter())

In [22]:
pickle.dump(cond_text,open('../data/mesh_level2_textcount.pkl','wb'))
pickle.dump(total_text,open('../data/mesh_level2_alltextcount.pkl','wb'))

In [4]:
cond_text = pickle.load(open('../data/mesh_level2_textcount.pkl','rb'))
total_text = pickle.load(open('../data/mesh_level2_alltextcount.pkl','rb'))

Build a series of individual level-2 MeSH classifiers


In [19]:
# initializing values
mesh_models = {}

# keep only tokens longer than two characters that contain at least one non-digit
total_text_keys, total_text_values = zip(*[(k, v)
                                           for k, v in total_text.items()
                                           if len(k) > 2 and any(char not in '1234567890'
                                                                 for char in k)])

other_text_len = sum(total_text_values)

In [ ]:
i = len(mesh_models) + 1

for c in cond_text.keys():
    # level-2 codes are seven characters long; skip the shorter top-level codes
    if c not in mesh_models and len(c) > 3:
        # total number of (filtered) words for this term and for everything else
        cond_text_len = sum([v
                             for k, v in cond_text[c].items()
                             if len(k) > 2 and any(char not in '1234567890' for char in k)])
        cur_other_text_len = other_text_len - cond_text_len

        # list of tuples: (term % of target MeSH descriptor text, term % of other MeSH descriptor text)
        vecs = [(cond_text[c][t] / cond_text_len, (total_text[t] - cond_text[c][t]) / cur_other_text_len)
                for t in total_text.keys()
                if len(t) > 2 and any(char not in '1234567890' for char in t)]

        # fit a logistic model on two samples: the target profile (1) vs. the complement profile (0)
        model = LogisticRegression()
        mesh_models[c] = model.fit(zip(*vecs), [1, 0])

        print '%-3d %s (%s)' % (i, c, mesh_codes[c])
        i += 1
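
Each model is fit on just two samples, the target term's word-frequency profile (label 1) and the complement profile (label 0), so predict_proba on a new frequency vector effectively scores similarity to the target profile. A minimal inspection sketch, assuming at least one model has been trained:


In [ ]:
sample_code = mesh_models.keys()[0]
print sample_code, mesh_codes[sample_code]
# one coefficient per vocabulary term; positive weights pull toward the target class
print mesh_models[sample_code].coef_.shape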

In [29]:
pickle.dump(mesh_models,open('../data/mesh_models_series.pkl','wb'))

In [ ]:
mesh_models = pickle.load(open('../data/mesh_models_series.pkl','rb'))

Apply models to each unclassified trial


In [35]:
classify_text = {trial_id: Counter([word
                                    for desc in trial_desc[trial_id]
                                    if len(desc) > 0
                                    for word in process_text(desc)])
                 for trial_id in to_classify}

In [72]:
guesses = {}
total_text_keys, total_text_values = zip(*[(k, v)
                                           for k, v in total_text.items() 
                                           if len(k) > 2 and sum([1 
                                                                  for char in k 
                                                                  if char not in '1234567890']) > 0])

other_text_len = sum(total_text_values)

In [ ]:
i = len(guesses) + 1

for c in classify_text.keys():
    if c not in guesses:
        text_len = sum([v
                        for k, v in classify_text[c].items()
                        if len(k) > 2 and any(char not in '1234567890' for char in k)])

        if text_len > 0:
            # feature vector: each term's share of this trial's description text,
            # over the same vocabulary the models were trained on
            vecs = [classify_text[c][t] / text_len
                    for t in total_text.keys()
                    if len(t) > 2 and any(char not in '1234567890' for char in t)]

            # probability of the positive (target) class under each level-2 model
            predictions = {}
            for term, model in mesh_models.items():
                predictions[term] = model.predict_proba([vecs])[0][1]

            guesses[c] = predictions

        i += 1
        if i % 10 == 0: print i
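
To turn the per-term probabilities into suggestions, the top-scoring level-2 codes for a trial can be ranked (a sketch, assuming guesses is non-empty):


In [ ]:
sample_trial = guesses.keys()[0]
for term, p in sorted(guesses[sample_trial].items(), key=lambda x: -x[1])[:5]:
    print '%.3f %s (%s)' % (p, term, mesh_codes[term])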

In [128]:
pickle.dump(guesses,open('../data/mesh_guesses.pkl','wb'))

Single-prediction maxent classifier


In [129]:
# note: this rebinds cond_text to the concatenated description text per condition
cond_text = {c: ' '.join(' '.join(trial_desc[t]) for t in cond[c])
             for c in top_cond}

In [130]:
tfidf = TfidfVectorizer(stop_words=stopset)
train_mat = tfidf.fit_transform(cond_text.values())
apply_mat = tfidf.transform(' '.join(trial_desc[t]) for t in to_classify)

In [131]:
model = LogisticRegression()
# cond_text.keys() lines up with the .values() used to build train_mat
# because the dict is not modified between the two calls
model.fit(train_mat, cond_text.keys())
single_preds = dict(zip(to_classify, model.predict(apply_mat)))
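
A spot check of a few single-label predictions (illustrative; the IDs are whichever come first in to_classify):


In [ ]:
for t in to_classify[:5]:
    print t, single_preds[t]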

In [132]:
pickle.dump(single_preds,open('../data/mesh_guesses_maxent.pkl','wb'))

K-nearest-neighbors suggestions


In [4]:
trial_text = {t: ' '.join(trial_desc[t])
              for t in trials 
              if len(trial_desc[t][0] + trial_desc[t][1]) > 50}
trial_text_other = {t: ' '.join(trial_desc[t]) 
                    for t in to_classify
                    if len(trial_desc[t][0] + trial_desc[t][1]) > 50}

In [5]:
tfidf = TfidfVectorizer(stop_words=stopset)
train_mat = tfidf.fit_transform(trial_text.values())
apply_mat = tfidf.transform(trial_text_other.values())

In [6]:
from sklearn.neighbors import NearestNeighbors
# radius only affects radius_neighbors queries; kneighbors uses n_neighbors
neigh = NearestNeighbors(n_neighbors=10, radius=5)
neigh.fit(train_mat)


Out[6]:
NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         n_neighbors=10, radius=5)

In [7]:
knn_guesses = {}

In [ ]:
# capture the key lists once; their order must match the row order of the tf-idf matrices
train_ids = trial_text.keys()
apply_ids = trial_text_other.keys()

for i in range(len(apply_ids)):
    trial_id = apply_ids[i]
    if trial_id not in knn_guesses:
        dist, idx = (arr.flatten() for arr in neigh.kneighbors(apply_mat[i]))

        # for each MeSH term on a neighboring trial, record that neighbor's distance
        this_guess = {}
        for j in range(len(idx)):
            k_trial_id = train_ids[idx[j]]
            for mterm in cond_r[k_trial_id]:
                if mterm not in this_guess: this_guess[mterm] = []
                this_guess[mterm].append(dist[j])

        knn_guesses[trial_id] = this_guess
    if i % 100 == 0: print i, datetime.datetime.now().time()

In [11]:
pickle.dump(knn_guesses,open('../data/mesh_guesses_knn.pkl','wb'))
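
Each entry in knn_guesses maps a MeSH term to the distances of the neighbors that carried it; ranking by vote count, with mean distance as a tie-breaker, turns this into suggestions. A sketch, assuming knn_guesses is non-empty:


In [ ]:
sample_trial = knn_guesses.keys()[0]
ranked = sorted(knn_guesses[sample_trial].items(),
                key=lambda x: (-len(x[1]), sum(x[1]) / len(x[1])))
for term, dists in ranked[:5]:
    print '%-40s %d votes, mean dist %.3f' % (term, len(dists), sum(dists) / len(dists))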
