In [1]:
# true division everywhere (term frequencies are ratios); must precede other statements
from __future__ import division
# basic NLP
import nltk, codecs, string, random, math, cPickle as pickle, re, datetime
from collections import Counter
# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stopset = set(nltk.corpus.stopwords.words('english'))
In [2]:
# normalize condition names to their canonical MeSH descriptor spellings
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
               'Beta-Thalassemia': 'beta-Thalassemia',
               'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
               'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
               'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
               "Felty's Syndrome": 'Felty Syndrome',
               'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
               'Retrognathism': 'Retrognathia',
               'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
               'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
               'Von Willebrand Diseases': 'von Willebrand Diseases',
               'Pontine Glioma': 'Brain Stem Neoplasms',
               'Mental Retardation': 'Intellectual Disability',
               'Overdose': 'Drug Overdose',
               'Beta-Mannosidosis': 'beta-Mannosidosis',
               'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
               'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
               'Alpha-Thalassemia': 'alpha-Thalassemia',
               'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
               'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
               'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
               'Alpha-Mannosidosis': 'alpha-Mannosidosis',
               'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
               }
# map MeSH term -> trial IDs (cond) and trial ID -> MeSH terms (cond_r)
cond = {}
cond_r = {}
for row in codecs.open('../data/condition_browse.txt','r','utf-8').readlines():
    row_id, trial_id, mesh_term = row.strip().split('|')
    if mesh_term in corrections: mesh_term = corrections[mesh_term]
    if mesh_term not in cond: cond[mesh_term] = []
    cond[mesh_term].append(trial_id)
    if trial_id not in cond_r: cond_r[trial_id] = []
    cond_r[trial_id].append(mesh_term)
# map MeSH tree code -> descriptor name (mesh_codes) and name -> tree codes (mesh_codes_r)
mesh_codes = {}
mesh_codes_r = {}
for row in codecs.open('../data/mesh_thesaurus.txt','r','utf-8').readlines():
    row_id, mesh_id, mesh_term = row.strip().split('|')
    mesh_codes[mesh_id] = mesh_term
    if mesh_term not in mesh_codes_r: mesh_codes_r[mesh_term] = []
    mesh_codes_r[mesh_term].append(mesh_id)
# limiting to conditions that appear in ten or more trials
top_cond = {c for c in cond if len(cond[c]) >= 10}
trials = {t for c in top_cond for t in cond[c]}
In [ ]:
trial_desc = {}
for row in codecs.open('../data/clinical_study.txt','r','utf-8').readlines():
    data = row.split('|')
    # keep the detailed description only when it is substantive (> 50 characters)
    brief_desc, detail_desc = (data[9],
                               data[10] if len(data[10]) > 50 else '')
    trial_desc[data[0]] = brief_desc, detail_desc
to_classify = [t for t in trial_desc if t not in trials]
In [ ]:
pickle.dump(trial_desc,open('../data/trial_desc.pkl','wb'))
In [3]:
trial_desc = pickle.load(open('../data/trial_desc.pkl','rb'))
to_classify = [t for t in trial_desc if t not in trials]
In [ ]:
print 'Total MeSH terms: %d' % len(cond)
l1_codes = set([mr[:3] for c in cond if c in mesh_codes_r for mr in mesh_codes_r[c]])
l2_codes = set([mr[:7] for c in cond if c in mesh_codes_r for mr in mesh_codes_r[c]])
print 'Total MeSH terms (level 1): %d' % len(l1_codes)
print 'Total MeSH terms (level 2): %d' % len(l2_codes)
Create a trial lookup for MeSH term hypernyms at the second level of the hierarchy.
In [3]:
cond_l2 = {}
for m in cond.keys():
    if m in mesh_codes_r:
        # truncate each tree code to its first seven characters (the level-2 prefix)
        m_l2 = set([mr[:7] for mr in mesh_codes_r[m]])
        for l2 in m_l2:
            if l2 not in cond_l2: cond_l2[l2] = set()
            cond_l2[l2] |= set(cond[m])
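MeSH tree numbers are dot-delimited paths, so slicing the code walks up the hierarchy: the first three characters give the level-1 ancestor and the first seven the level-2 ancestor. A quick illustration (hypothetical cell with a sample tree number, not part of the original run):
In [ ]:
code = 'C04.557.450'      # a tree number of the usual shape
print code[:3], code[:7]  # level-1 and level-2 prefixes: C04 and C04.557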
In [34]:
def process_text(text):
    """Lowercase and tokenize, dropping stopwords and punctuation-only tokens."""
    return [word.lower()
            for sent in sent_tokenizer.tokenize(text)
            for word in nltk.word_tokenize(sent)
            if word.lower() not in stopset and
            sum(1 for char in word if char not in string.punctuation) > 0]
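A quick sanity check of the tokenizer (an illustrative cell, not in the original run; exact tokens depend on the NLTK version):
In [ ]:
# stopwords ('a', 'of', 'in') and the trailing period are dropped
# expected: ['randomized', 'trial', 'metformin', 'type', '2', 'diabetes']
print process_text('A randomized trial of metformin in type 2 diabetes.')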
In [20]:
cond_text = {cond: Counter([word
                            for trial_id in cond_l2[cond]
                            for desc in trial_desc[trial_id]
                            if len(desc) > 0
                            for word in process_text(desc)])
             for cond in cond_l2.keys()}
In [21]:
# pool the per-condition counters into one corpus-wide word count
total_text = sum(cond_text.values(),Counter())
In [22]:
pickle.dump(cond_text,open('../data/mesh_level2_textcount.pkl','wb'))
pickle.dump(total_text,open('../data/mesh_level2_alltextcount.pkl','wb'))
In [4]:
cond_text = pickle.load(open('../data/mesh_level2_textcount.pkl','rb'))
total_text = pickle.load(open('../data/mesh_level2_alltextcount.pkl','rb'))
In [19]:
# initializing values
mesh_models = {}
# keep only tokens longer than two characters that are not purely digits
total_text_keys, total_text_values = zip(*[(k, v)
                                           for k, v in total_text.items()
                                           if len(k) > 2 and sum([1
                                                                  for char in k
                                                                  if char not in '1234567890']) > 0])
other_text_len = sum(total_text_values)
In [ ]:
i = len(mesh_models) + 1
for c in cond_text.keys():
    # resumable: skip terms already fit; len(c) > 3 skips level-1 prefixes,
    # since level-2 tree codes are seven characters
    if c not in mesh_models and len(c) > 3:
        # get total number of words for that term and for everything else that isn't that term
        cond_text_len = sum([v
                             for k, v in cond_text[c].items()
                             if len(k) > 2 and sum([1
                                                    for char in k
                                                    if char not in '1234567890']) > 0])
        cur_other_text_len = other_text_len - cond_text_len
        # create set of tuples (term % of target MeSH descriptor text, term % of other MeSH descriptor text)
        vecs = [(cond_text[c][t] / cond_text_len, (total_text[t] - cond_text[c][t]) / cur_other_text_len)
                for t in total_text.keys()
                if len(t) > 2 and sum([1
                                       for char in t
                                       if char not in '1234567890']) > 0]
        # fit a two-sample logistic model: the term's aggregate text vs. all other text
        model = LogisticRegression()
        mesh_models[c] = model.fit(zip(*vecs),[1,0])
        print '%-3d %s (%s)' % (i, c, mesh_codes[c])
        i += 1
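Each model is fit on exactly two samples: zip(*vecs) transposes the list of (target %, other %) pairs into two rows, one per class, with one feature per vocabulary word. A minimal sketch of that shape, using the already-imported LogisticRegression with made-up numbers and a three-word vocabulary (illustrative only):
In [ ]:
toy_vecs = [(0.5, 0.1), (0.3, 0.2), (0.2, 0.7)]  # (target %, other %) per word
X = zip(*toy_vecs)  # two rows: [(0.5, 0.3, 0.2), (0.1, 0.2, 0.7)]
LogisticRegression().fit(X, [1, 0])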
In [29]:
pickle.dump(mesh_models,open('../data/mesh_models_series.pkl','wb'))
In [ ]:
mesh_models = pickle.load(open('../data/mesh_models_series.pkl','rb'))
Applying the models to each unclassified trial.
In [35]:
classify_text = {trial_id: Counter([word
                                    for desc in trial_desc[trial_id]
                                    if len(desc) > 0
                                    for word in process_text(desc)])
                 for trial_id in to_classify}
In [72]:
guesses = {}
# same token filter as above: words longer than two characters that are not purely digits
total_text_keys, total_text_values = zip(*[(k, v)
                                           for k, v in total_text.items()
                                           if len(k) > 2 and sum([1
                                                                  for char in k
                                                                  if char not in '1234567890']) > 0])
other_text_len = sum(total_text_values)
In [ ]:
i = len(guesses) + 1
for c in classify_text.keys():
    if c not in guesses:
        text_len = sum([v
                        for k, v in classify_text[c].items()
                        if len(k) > 2 and sum([1
                                               for char in k
                                               if char not in '1234567890']) > 0])
        if text_len > 0:
            # create vector of term frequencies (term % of this trial's text)
            vecs = [classify_text[c][t] / text_len
                    for t in total_text.keys()
                    if len(t) > 2 and sum([1
                                           for char in t
                                           if char not in '1234567890']) > 0]
            # score the trial against every fitted model; classes_ is [0, 1],
            # so [0][1] is the probability of the target MeSH term
            predictions = {}
            for term, model in mesh_models.items():
                predictions[term] = model.predict_proba(vecs)[0][1]
            guesses[c] = predictions
        i += 1
        if i % 10 == 0: print i
In [128]:
pickle.dump(guesses,open('../data/mesh_guesses.pkl','wb'))
In [129]:
# concatenate the descriptions of every trial tagged with each term (reusing the name cond_text)
cond_text = {c: ' '.join(' '.join(trial_desc[t]) for t in cond[c])
             for c in top_cond}
In [130]:
tfidf = TfidfVectorizer(stop_words=stopset)
# fit the vocabulary and idf weights on the per-condition documents,
# then project the unclassified trials into the same space
train_mat = tfidf.fit_transform(cond_text.values())
apply_mat = tfidf.transform(' '.join(trial_desc[t]) for t in to_classify)
In [131]:
# a single multiclass maxent model over the pooled per-condition documents;
# predict() yields the one most likely MeSH term per trial
model = LogisticRegression()
model.fit(train_mat,cond_text.keys())
single_preds = dict(zip(to_classify,model.predict(apply_mat)))
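Passing cond_text.values() to the vectorizer and cond_text.keys() to fit relies on a Python 2 guarantee: a dict that is not modified in between iterates its keys and values in corresponding order. A minimal check (illustrative cell):
In [ ]:
d = {'a': 1, 'b': 2, 'c': 3}
assert zip(d.keys(), d.values()) == d.items()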
In [132]:
pickle.dump(single_preds,open('../data/mesh_guesses_maxent.pkl','wb'))
In [4]:
trial_text = {t: ' '.join(trial_desc[t])
              for t in trials
              if len(trial_desc[t][0] + trial_desc[t][1]) > 50}
trial_text_other = {t: ' '.join(trial_desc[t])
                    for t in to_classify
                    if len(trial_desc[t][0] + trial_desc[t][1]) > 50}
In [5]:
tfidf = TfidfVectorizer(stop_words=stopset)
train_mat = tfidf.fit_transform(trial_text.values())
apply_mat = tfidf.transform(trial_text_other.values())
In [6]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=10,radius=5)
neigh.fit(train_mat)
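Note that TfidfVectorizer L2-normalizes each row by default, so the Euclidean distances kneighbors returns are a monotone transform of cosine similarity (for unit vectors a and b, ||a - b||^2 = 2(1 - a.b)); the radius argument is only used by radius_neighbors and is inert here.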
In [7]:
knn_guesses = {}
In [ ]:
# each unclassified trial inherits candidate MeSH terms from its ten nearest
# neighbors; this_guess maps each term to the distances of the neighbors carrying it
for i in range(len(trial_text_other.keys())):
    # dict key order matches the row order of apply_mat (both built from trial_text_other)
    trial_id = trial_text_other.keys()[i]
    if trial_id not in knn_guesses:
        dist, idx = (arr.flatten() for arr in neigh.kneighbors(apply_mat[i]))
        this_guess = {}
        for j in range(len(idx)):
            k_trial_id = trial_text.keys()[idx[j]]
            for mterm in cond_r[k_trial_id]:
                if mterm not in this_guess: this_guess[mterm] = []
                this_guess[mterm].append(dist[j])
        knn_guesses[trial_id] = this_guess
        if i % 100 == 0: print i, datetime.datetime.now().time()
In [11]:
pickle.dump(knn_guesses,open('../data/mesh_guesses_knn.pkl','wb'))