In [2]:
import nltk
import cPickle as pickle
from collections import Counter
from random import shuffle
In [3]:
# Load the POS-tagged criteria text: dict of trial id -> list of sub-documents,
# each sub-document a list of tagged sentences [(word, tag), ...].
# NOTE(review): pickle.load on an untrusted file can execute arbitrary code --
# only load pickles produced by this project.
data = pickle.load(open('../data/criteria_text_chunk_3.pkl', 'rb'))
In [219]:
# Seed vocabularies: surface strings whose presence in a sentence marks the
# corresponding concept (matching is case-insensitive downstream).
smoker_list = ['Non-smoker', 'smoker', 'Current smoker', 'smoking']
pregnancy_list = ['Pregnancy']
birth_control_list = ['Birth control', 'contraception']
drug_list = ['Illicit drugs', 'Alcohol abuse', 'illegal', 'illicit', 'drug abuse']
heart_failure_list = ['Congestive Heart Failure', 'heart failure']
hiv_list = ['HIV', 'aids', 'human immunodeficiency virus']
allergy_list = ['Allergies', 'allergy', 'hypersensitivity']
In [220]:
# Seed predictor phrases: context words that tend to accompany each concept's
# terms in eligibility sentences (used to rank sentences in active learning).
smoker_pred_list = ['current']
pregnancy_pred_list = ['potential', 'negative']
birth_control_pred_list = ['effective', 'Fertile patients', 'must use effective',
                           'must use', 'use effective', 'Fertile patients must use',
                           'fertile']
drug_pred_list = ['use', 'abuse']
heart_failure_pred_list = []
hiv_pred_list = []
allergy_pred_list = ['known', 'history', 'suspected', 'known suspected',
                     'clinically significant']
In [221]:
#dictionaries keeping track of predictors said no to
# (candidate phrase -> ''; only membership is checked, so rejected candidates
# are never shown again in later active-learning rounds)
smoker_pred_dict = {}
pregnancy_pred_dict = {}
birth_control_pred_dict = {}
drug_pred_dict = {}
heart_failure_pred_dict = {}
hiv_pred_dict = {}
allergy_pred_dict = {}
#dictionaries to keep track of terms said no to
smoker_term_dict = {}
pregnancy_term_dict = {}
birth_control_term_dict = {}
drug_term_dict = {}
heart_failure_term_dict = {}
hiv_term_dict = {}
allergy_term_dict = {}
In [222]:
# Parallel lists, one entry per concept. The index order (smoker, pregnancy,
# birth control, drugs, heart failure, HIV, allergy) must stay the same in
# all four lists -- concept index is used to align them everywhere below.
pred_list = [smoker_pred_list, pregnancy_pred_list, birth_control_pred_list, drug_pred_list,
             heart_failure_pred_list, hiv_pred_list, allergy_pred_list]
term_list = [smoker_list, pregnancy_list, birth_control_list, drug_list, heart_failure_list,
             hiv_list, allergy_list]
pred_dicts = [smoker_pred_dict, pregnancy_pred_dict, birth_control_pred_dict, drug_pred_dict,
              heart_failure_pred_dict, hiv_pred_dict, allergy_pred_dict]
term_dicts = [smoker_term_dict, pregnancy_term_dict, birth_control_term_dict, drug_term_dict,
              heart_failure_term_dict, hiv_term_dict, allergy_term_dict]
In [223]:
def active_learn_predictors(data, term_list, pred_list, pred_dicts):
#look for more predictors for each concept by finding sentnces that have
#concept terms in them and looking for predictors in those sentences
def get_pred(text_dict, term_list, pred_dicts, pred_list):
pred_options_dict = Counter()
for doc in text_dict.values():
for subdoc in doc:
for sent in subdoc:
#if the sentance has less than 2 words skip it
if len(sent) <= 1:
continue
#crate a sentence rank for judging weight of terms found
sent_rank = 0
for term in term_list:
if term.lower() in ' '.join(zip(*sent)[0]).lower():
sent_rank += 1
result = chunker(sent)
preds = [' '.join(x) for x in [[x[0] for x in term] for term in result]]
preds.append(' '.join([sent[0][0], sent[1][0]]))
#lower case all preds
preds = [x.lower() for x in preds]
preds = preds * sent_rank
pred_options_dict.update(preds)
#get top 20 predictors that have not been seen before
sorted_preds = sorted(pred_options_dict.items(), key=lambda x: x[1], reverse=True)
counter = 0
top_preds = []
for pred in sorted_preds:
if pred[0] not in pred_list and pred[0] not in pred_dicts:
top_preds.append(pred)
counter += 1
if counter == 15 or counter == len(sorted_preds):
return top_preds
#if there are no preds return empty list
return top_preds
#get chunks for preds
def chunker(sent):
chunk_reg1 = r"""
CHUNK: {<NN.*><IN>}
"""
chunk_reg2 = r"""
CHUNK: {<VB.*><DT>}
"""
chunk_reg3 = r"""
CHUNK: {<NN.*><VB.*>}
"""
results = []
for chunk_reg in [chunk_reg1, chunk_reg2, chunk_reg3]:
cp = nltk.RegexpParser(chunk_reg)
tree = cp.parse(sent)
for subtree in tree.subtrees():
if subtree.label() == 'CHUNK':
results.append(subtree[:])
return results
def human_checker(term, pred_list, top_preds, pred_dict):
'''This function loops through the possible predictors and
lets human input decide if they actually are or not'''
print 'Are the following predictors of these %r?' % (term)
if len(top_preds) > 1:
for pred in top_preds:
print 'Predictor: \x1b[35m %s \x1b[0m Count: \x1b[36m %d \x1b[0m' % (pred[0], pred[1])
answer_switch = True
while answer_switch:
add_pred = raw_input('Is this a predictor of %s? (Y, N, exit): ' % (term[0]))
if add_pred.lower() == 'y':
pred_list.append(pred[0])
answer_switch = False
elif add_pred.lower() == 'exit':
#pass switch to exit program
exit_switch = True
return pred_list, pred_dict, exit_switch
elif add_pred.lower() == 'n':
pred_dict[pred[0]] = ''
answer_switch = False
else:
pass
exit_switch = False
return pred_list, pred_dict, exit_switch
for idx, term in enumerate(term_list):
top_preds = get_pred(data, term, pred_dicts[idx], pred_list[idx])
print '\n**NEW Concept**\n'
pred_list[idx], pred_dicts[idx], exit_switch = human_checker(term, pred_list[idx], top_preds, pred_dicts[idx])
#save list and dict
#make sure it is not null before saving
if pred_list[idx]:
pickle.dump(pred_list, open('data/predictor_list.pkl', 'wb'))
pickle.dump(pred_dicts, open('data/not_predictor_dict.pkl', 'wb'))
else:
print 'pred list Null'
#if exit, exit program
if exit_switch:
break
print 'Active Learning Complete'
return pred_list, pred_dicts
In [224]:
def active_learn_terms(data, term_list, pred_list, term_dicts):
#look for more terms for each concept by finding sentnces that have
#predictors in them and looking for terms in those sentences
def get_pred(text_dict, term_list, term_dicts, pred_list):
term_options_dict = Counter()
for doc in text_dict.values():
for subdoc in doc:
for sent in subdoc:
#skip sentence if it contains less than one word
if len(sent) <= 1:
continue
#crate a sentence rank for judging weight of terms found
sent_rank = 0
for pred in pred_list:
if pred[0].lower() in ' '.join(zip(*sent)[0]).lower():
sent_rank += pred[1]
result = chunker(sent)
terms = [' '.join(x) for x in [[x[0] for x in term] for term in result]]
terms.append(' '.join([sent[0][0], sent[1][0]]))
#lower case all preds
terms = [x.lower() for x in terms]
#add weights to terms by multiplying by sent_rank
terms = terms * sent_rank
term_options_dict.update(terms)
#get top 20 predictors that have not been seen before
sorted_terms = sorted(term_options_dict.items(), key=lambda x: x[1], reverse=True)
counter = 0
top_terms = []
for term in sorted_terms:
if term[0] not in term_list and term[0] not in term_dicts:
top_terms.append(term)
counter += 1
if counter == 15 or counter == len(sorted_terms):
return top_terms
#if there are no preds return empty list
return top_terms
#get chunks for preds
def chunker(sent):
chunk_reg1 = r"""
CHUNK: {(<NN.*><POS>)?<RB>?<JJ.*>*<NN.*>+}
"""
results = []
for chunk_reg in [chunk_reg1]:
cp = nltk.RegexpParser(chunk_reg)
tree = cp.parse(sent)
for subtree in tree.subtrees():
if subtree.label() == 'CHUNK':
results.append(subtree[:])
return results
def human_checker(term_list, top_terms, term_dict):
'''This function loops through the possible terms and
lets human input decide if they actually are or not'''
print 'Are the following terms part of this list: %r?' % (term_list)
if len(top_terms) > 1:
for term in top_terms:
print 'Term: \x1b[35m %s \x1b[0m Count: \x1b[36m %d \x1b[0m' % (term[0], (term[1]/7.))
answer_switch = True
while answer_switch:
add_term = raw_input('Is this similar to %s? (Y, N, exit): ' % (term_list[0]))
if add_term.lower() == 'y':
term_list.append(term[0])
answer_switch = False
elif add_term.lower() == 'exit':
#pass switch to exit program
exit_switch = True
return term_list, term_dict, exit_switch
elif add_term.lower() == 'n':
term_dict[term[0]] = ''
answer_switch = False
else:
pass
exit_switch = False
return term_list, term_dict, exit_switch
#making a pred weight list because of scoping problems in iPyhton notebooks
smoker_pred_weight_list = []
pregnancy_pred_weight_list = []
birth_control_pred_weight_list = []
drug_pred_weight_list = []
heart_failure_pred_weight_list = []
hiv_pred_weight_list = []
allergy_pred_weight_list = []
pred_weight_list = [smoker_pred_weight_list, pregnancy_pred_weight_list,
birth_control_pred_weight_list, drug_pred_weight_list,
heart_failure_pred_weight_list, hiv_pred_weight_list, allergy_pred_weight_list]
#create a combined list of all preds, create Counter dict
tot_pred_list = []
for p in pred_list:
tot_pred_list += p
count_pred = Counter(tot_pred_list)
#add weights to pred terms and create new pred weight lists
for n in xrange(len(pred_list)):
for idx in range(len(pred_list[n])):
weight = 7 - (count_pred[pred_list[n][idx]]-1)
pred_weight_list[n].append((pred_list[n][idx], weight))
for idx, term in enumerate(term_list):
top_terms = get_pred(data, term, term_dicts[idx], pred_weight_list[idx])
print '\n**NEW Concept**\n'
term_list[idx], term_dicts[idx], exit_switch = human_checker(term, top_terms, term_dicts[idx])
#save list and dict
#make sure it is not null before saving
if pred_list[idx]:
pickle.dump(term_list, open('data/term_list.pkl', 'wb'))
pickle.dump(term_dicts, open('data/not_term_dict.pkl', 'wb'))
else:
print 'Term list Null'
#if exit, exit program
if exit_switch:
break
print 'Active Learning Complete'
return term_list, term_dicts
In [4]:
#load in past predictor terms
# NOTE(review): these read from 'data/...' while the raw text above is loaded
# from '../data/...' -- confirm which directory the pickles actually live in
pred_list = pickle.load(open('data/predictor_list.pkl', 'rb'))
pred_dicts = pickle.load(open('data/not_predictor_dict.pkl', 'rb'))
#load in past concept terms
term_list = pickle.load(open('data/term_list.pkl', 'rb'))
term_dicts = pickle.load(open('data/not_term_dict.pkl', 'rb'))
In [ ]:
# Interactive session: grow the predictor lists (saves progress to disk)
pred_list, pred_dicts = active_learn_predictors(data, term_list, pred_list, pred_dicts)
In [ ]:
# Interactive session: grow the term lists using the learned predictors
term_list, term_dicts = active_learn_terms(data, term_list, pred_list, term_dicts)
In [8]:
# Reload the saved term lists for the highlighting step below.
# NOTE(review): path is '../data/' here but 'data/' in the save calls above --
# confirm they refer to the same file
term_list = pickle.load(open('../data/term_list.pkl','rb'))
Display criteria split into Inclusion and Exclusion sections.
Sentences are highlighted in different colors depending on the concept they contain.
Tag the trials with two sets of tags: Inclusion and Exclusion.
In [17]:
#load trail concept lookup dict
trial_concept_lookup = pickle.load(open('data/trial_concept_lookup.pkl', 'rb'))
In [5]:
# Start from an empty lookup. NOTE(review): running this after the load cell
# above discards the previously saved tags -- run one cell or the other, not both
trial_concept_lookup = {}
In [10]:
def criteria_highlight(data, term_list, term_color_lookup, trial_concept_lookup,
                       concept_lookup):
    '''Print each trial's criteria with concept sentences color-highlighted
    and record which concepts appear in its inclusion/exclusion sections.

    Parameters
    ----------
    data : dict mapping trial id -> list of sentence groups, where each
        sentence is a list of (word, tag) pairs.
    term_list : list of per-concept term lists (parallel to concept_lookup).
    term_color_lookup : ANSI color escape string per concept index.
    trial_concept_lookup : dict trial id -> {'inclusion': set, 'exclusion': set};
        updated in place and pickled to disk before returning.
    concept_lookup : human-readable concept name per concept index.

    Returns the updated trial_concept_lookup.

    If a sentence matches several concepts, each match wraps the sentence in
    another color escape, so the first matching concept's color ends up as
    the visible highlight; all matching concepts are still tagged.
    '''
    for key, value in data.items():
        #print a color key
        print 'Color Legend'
        for c in xrange(len(term_color_lookup)):
            print (term_color_lookup[c] + concept_lookup[c] + '\x1b[0m \x1b[0m')
        print
        print key
        #first time we see this trial: start empty tag sets for both sections
        if key not in trial_concept_lookup:
            trial_concept_lookup[key] = {'inclusion':set(),
                                         'exclusion':set()}
        for group in value:
            #flatten each tagged sentence back into a plain-text string
            doc = [' '.join(word) for word in [[word[0] for word in sent] for sent in group]]
            #check each sentence for concept terms
            for sent_idx in xrange(len(doc)):
                for concept_idx in xrange(len(term_list)):
                    for term in term_list[concept_idx]:
                        if term.lower() in doc[sent_idx].lower():
                            #tag trial with this concept
                            #split into inclusion and exclusion sections
                            #(the group's first sentence names the section)
                            if 'inclusion criteria' in doc[0].lower():
                                trial_concept_lookup[key]['inclusion'].add(concept_lookup[concept_idx])
                            elif 'exclusion criteria' in doc[0].lower():
                                trial_concept_lookup[key]['exclusion'].add(concept_lookup[concept_idx])
                            #if the background is being set to black you have to escape twice
                            if concept_idx == 6:
                                doc[sent_idx] = (term_color_lookup[concept_idx] + doc[sent_idx]
                                                 + '\x1b[0m \x1b[0m')
                            else:
                                doc[sent_idx] = (term_color_lookup[concept_idx] + doc[sent_idx]
                                                 + '\x1b[0m')
            #check to print inclusion or exclusion tags
            if 'inclusion criteria' in doc[0].lower():
                if len(trial_concept_lookup[key]['inclusion']) >= 1:
                    print 'Tags: ', list(trial_concept_lookup[key]['inclusion'])
                else:
                    print 'Tags: None'
                print
            elif 'exclusion criteria' in doc[0].lower():
                if len(trial_concept_lookup[key]['exclusion']) >= 1:
                    print 'Tags: ', list(trial_concept_lookup[key]['exclusion'])
                else:
                    print 'Tags: None'
                print
            #print the (possibly highlighted) sentences
            for sent in doc:
                print sent
            print
    #save lookup dict
    # NOTE(review): saves to '../data/...' while the loader cell reads
    # 'data/...' -- confirm these point at the same file
    pickle.dump(trial_concept_lookup, open('../data/trial_concept_lookup.pkl', 'wb'))
    return trial_concept_lookup
In [98]:
# ANSI background colors, one per concept; the last entry (black background)
# also sets a white foreground, which is why it carries two escape codes
term_color_lookup = ['\x1b[41m', '\x1b[42m', '\x1b[43m', '\x1b[44m', '\x1b[45m', '\x1b[46m',
                     '\x1b[40m \x1b[37m']
# human-readable concept names, parallel to term_list / term_color_lookup
concept_lookup = ['Smoking', 'Pregnancy', 'Birth Control', 'Illicit drugs',
                  'Congestive heart failure', 'HIV', 'Allergies']
# pick one random trial that has not been tagged yet, display and tag it
shuffled_trials = data.items()
shuffle(shuffled_trials)
for trial in shuffled_trials:
    if trial[0] not in trial_concept_lookup:
        trial_concept_lookup = criteria_highlight({trial[0]:trial[1]}, term_list,
                                                  term_color_lookup,
                                                  trial_concept_lookup, concept_lookup)
        break
Notes: If a sentence contains two concepts, the first concept in the list determines the highlight color; both concepts are still added to the tags.
Example: Tags - Birth Control and Pregnancy:
Positive pregnancy test in women of child bearing potential or who are unwilling to use an acceptable method of contraception .
Problems:
Negatives - negation is not handled: e.g. "Not pregnant", or sentences saying a patient was pregnant but now is not...
In [392]:
# display the current per-concept term lists
term_list
Out[392]:
In [15]:
# display the current per-concept predictor lists
pred_list
Out[15]:
In [ ]: