In [2]:
import nltk
import cPickle as pickle
from collections import Counter
from random import shuffle

Load Data


In [3]:
# Load the pre-processed criteria text. The functions below iterate it as
# {trial id: [group, ...]}, where each group is a list of sentences and each
# sentence a list of tokens whose first element is the word.
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open('../data/criteria_text_chunk_3.pkl', 'rb') as data_file:
    data = pickle.load(data_file)

Initial Concept Term Lists


In [219]:
# Seed concept terms, one list per concept. Matching elsewhere in the notebook
# is done on lower-cased text, so the capitalisation here is not significant.
smoker_list = [
    'Non-smoker',
    'smoker',
    'Current smoker',
    'smoking',
]
pregnancy_list = [
    'Pregnancy',
]
birth_control_list = [
    'Birth control',
    'contraception',
]
drug_list = [
    'Illicit drugs',
    'Alcohol abuse',
    'illegal',
    'illicit',
    'drug abuse',
]
heart_failure_list = [
    'Congestive Heart Failure',
    'heart failure',
]
hiv_list = [
    'HIV',
    'aids',
    'human immunodeficiency virus',
]
allergy_list = [
    'Allergies',
    'allergy',
    'hypersensitivity',
]

Initial Predictive Terms


In [220]:
# Seed predictor phrases, one list per concept. An empty list means the
# concept starts without any predictor.
smoker_pred_list = ['current']
pregnancy_pred_list = ['potential', 'negative']
birth_control_pred_list = [
    'effective',
    'Fertile patients',
    'must use effective',
    'must use',
    'use effective',
    'Fertile patients must use',
    'fertile',
]
drug_pred_list = ['use', 'abuse']
heart_failure_pred_list = []
hiv_pred_list = []
allergy_pred_list = [
    'known',
    'history',
    'suspected',
    'known suspected',
    'clinically significant',
]

Discount Dictionaries


In [221]:
# Rejected predictor candidates, one dictionary per concept. The active
# learning functions add a key for every candidate the reviewer answers "N" to.
smoker_pred_dict = dict()
pregnancy_pred_dict = dict()
birth_control_pred_dict = dict()
drug_pred_dict = dict()
heart_failure_pred_dict = dict()
hiv_pred_dict = dict()
allergy_pred_dict = dict()

# Rejected concept-term candidates, again one dictionary per concept.
smoker_term_dict = dict()
pregnancy_term_dict = dict()
birth_control_term_dict = dict()
drug_term_dict = dict()
heart_failure_term_dict = dict()
hiv_term_dict = dict()
allergy_term_dict = dict()

In [222]:
# Per-concept containers gathered into parallel lists: position i of every
# list below refers to the same concept.
pred_list = [
    smoker_pred_list,
    pregnancy_pred_list,
    birth_control_pred_list,
    drug_pred_list,
    heart_failure_pred_list,
    hiv_pred_list,
    allergy_pred_list,
]
term_list = [
    smoker_list,
    pregnancy_list,
    birth_control_list,
    drug_list,
    heart_failure_list,
    hiv_list,
    allergy_list,
]
pred_dicts = [
    smoker_pred_dict,
    pregnancy_pred_dict,
    birth_control_pred_dict,
    drug_pred_dict,
    heart_failure_pred_dict,
    hiv_pred_dict,
    allergy_pred_dict,
]
term_dicts = [
    smoker_term_dict,
    pregnancy_term_dict,
    birth_control_term_dict,
    drug_term_dict,
    heart_failure_term_dict,
    hiv_term_dict,
    allergy_term_dict,
]

Find new predictors


In [223]:
def active_learn_predictors(data, term_list, pred_list, pred_dicts):
    """Interactively extend the predictor phrases of every concept.

    For each concept, every sentence containing at least one of the concept's
    terms is mined for short candidate phrases. The highest-scoring candidates
    that have not been accepted or rejected before are shown to a human
    reviewer, who answers Y (accept), N (reject) or exit. After each concept
    the current lists are pickled to 'data/predictor_list.pkl' and
    'data/not_predictor_dict.pkl' (skipped while that concept's predictor list
    is still empty).

    Parameters
    ----------
    data : dict
        Maps a document key to a list of sub-documents; each sub-document is a
        list of sentences and each sentence a list of tokens whose first
        element is the word. Sentences are handed to
        ``nltk.RegexpParser.parse``, so tokens are presumably (word, POS tag)
        pairs -- confirm against the pre-processing step.
    term_list : list of list of str
        Concept terms, one list per concept.
    pred_list : list of list of str
        Accepted predictors, one list per concept. Extended in place.
    pred_dicts : list of dict
        Rejected candidates, one dict per concept (candidate -> ''). Extended
        in place.

    Returns
    -------
    tuple
        ``(pred_list, pred_dicts)`` after the review.
    """
    # Maximum number of candidates shown to the reviewer for one concept.
    max_candidates = 15

    # The parsers do not depend on the sentence, so they are built once instead
    # of once per sentence. Each grammar captures one two-tag pattern.
    chunk_parsers = [
        nltk.RegexpParser(grammar)
        for grammar in (r"CHUNK: {<NN.*><IN>}",
                        r"CHUNK: {<VB.*><DT>}",
                        r"CHUNK: {<NN.*><VB.*>}")
    ]

    def chunker(sent):
        """Return the token list of every CHUNK found in a tagged sentence."""
        results = []
        for parser in chunk_parsers:
            tree = parser.parse(sent)
            for subtree in tree.subtrees():
                if subtree.label() == 'CHUNK':
                    results.append(subtree[:])
        return results

    def get_pred(text_dict, concept_terms, rejected, accepted):
        """Score candidate predictors and return the best unseen ones.

        Returns a list of at most ``max_candidates`` ``(phrase, score)``
        tuples sorted by descending score.
        """
        scores = Counter()
        lowered_terms = [term.lower() for term in concept_terms]
        for doc in text_dict.values():
            for subdoc in doc:
                for sent in subdoc:
                    # a candidate needs at least two words
                    if len(sent) <= 1:
                        continue
                    sent_text = ' '.join(token[0] for token in sent).lower()
                    # sentence rank = number of concept terms in the sentence
                    sent_rank = sum(1 for term in lowered_terms if term in sent_text)
                    if not sent_rank:
                        # contributes nothing, so skip the (costly) chunking
                        continue
                    candidates = [' '.join(token[0] for token in chunk)
                                  for chunk in chunker(sent)]
                    # the first two words of the sentence are always a candidate
                    candidates.append(' '.join([sent[0][0], sent[1][0]]))
                    for candidate in candidates:
                        scores[candidate.lower()] += sent_rank

        # Candidates are lower-cased, so compare them with the accepted list
        # case-insensitively; otherwise 'Fertile patients' would be offered
        # again as 'fertile patients'.
        already_accepted = set(pred.lower() for pred in accepted)
        top_preds = []
        for candidate, score in scores.most_common():
            if candidate in already_accepted or candidate in rejected:
                continue
            top_preds.append((candidate, score))
            if len(top_preds) == max_candidates:
                break
        return top_preds

    def human_checker(term, pred_list, top_preds, pred_dict):
        '''This function loops through the possible predictors and
        lets human input decide if they actually are or not.

        Returns (pred_list, pred_dict, exit_switch); exit_switch is True when
        the reviewer typed "exit".'''
        print('Are the following predictors of these %r?' % (term,))
        # Every candidate is shown, including a lone one (the previous
        # ``len(top_preds) > 1`` guard silently dropped it).
        for candidate, score in top_preds:
            print('Predictor: \x1b[35m %s \x1b[0m  Count: \x1b[36m %d \x1b[0m' % (candidate, score))
            while True:
                answer = raw_input('Is this a predictor of %s? (Y, N, exit): ' % (term[0]))
                answer = answer.strip().lower()
                if answer == 'y':
                    pred_list.append(candidate)
                    break
                elif answer == 'n':
                    pred_dict[candidate] = ''
                    break
                elif answer == 'exit':
                    return pred_list, pred_dict, True
                # any other answer: ask again
        return pred_list, pred_dict, False

    for idx, term in enumerate(term_list):
        top_preds = get_pred(data, term, pred_dicts[idx], pred_list[idx])
        print('\n**NEW Concept**\n')
        pred_list[idx], pred_dicts[idx], exit_switch = human_checker(
            term, pred_list[idx], top_preds, pred_dicts[idx])
        #save list and dict, but only once this concept has a predictor
        if pred_list[idx]:
            with open('data/predictor_list.pkl', 'wb') as handle:
                pickle.dump(pred_list, handle)
            with open('data/not_predictor_dict.pkl', 'wb') as handle:
                pickle.dump(pred_dicts, handle)
        else:
            print('pred list Null')
        #if exit, stop reviewing the remaining concepts
        if exit_switch:
            break
    print('Active Learning Complete')
    return pred_list, pred_dicts

Find new terms


In [224]:
def active_learn_terms(data, term_list, pred_list, term_dicts):
    """Interactively extend the concept terms of every concept.

    For each concept, every sentence containing at least one of the concept's
    predictors is mined for noun-phrase candidates. Each predictor carries a
    weight that is highest when it belongs to a single concept and drops by
    one for every additional occurrence across ``pred_list``. The
    highest-scoring candidates that have not been accepted or rejected before
    are shown to a human reviewer, who answers Y (accept), N (reject) or exit.
    After each concept the current lists are pickled to 'data/term_list.pkl'
    and 'data/not_term_dict.pkl' (skipped while that concept's term list is
    empty).

    Parameters
    ----------
    data : dict
        Maps a document key to a list of sub-documents; each sub-document is a
        list of sentences and each sentence a list of tokens whose first
        element is the word. Sentences are handed to
        ``nltk.RegexpParser.parse``, so tokens are presumably (word, POS tag)
        pairs -- confirm against the pre-processing step.
    term_list : list of list of str
        Concept terms, one list per concept. Extended in place.
    pred_list : list of list of str
        Accepted predictors, one list per concept (same order as term_list).
    term_dicts : list of dict
        Rejected candidates, one dict per concept (candidate -> ''). Extended
        in place.

    Returns
    -------
    tuple
        ``(term_list, term_dicts)`` after the review.
    """
    # Maximum number of candidates shown to the reviewer for one concept.
    max_candidates = 15
    # Number of concepts; it is the top predictor weight and the divisor used
    # when displaying scores. It was hard-coded as 7, which is the number of
    # concepts in this notebook.
    n_concepts = len(pred_list)

    # Built once instead of once per sentence.
    term_parser = nltk.RegexpParser(r"CHUNK: {(<NN.*><POS>)?<RB>?<JJ.*>*<NN.*>+}")

    def chunker(sent):
        """Return the token list of every CHUNK found in a tagged sentence."""
        tree = term_parser.parse(sent)
        return [subtree[:] for subtree in tree.subtrees()
                if subtree.label() == 'CHUNK']

    def get_terms(text_dict, concept_terms, rejected, weighted_preds):
        """Score candidate terms and return the best unseen ones.

        ``weighted_preds`` is a list of ``(predictor, weight)`` tuples.
        Returns a list of at most ``max_candidates`` ``(phrase, score)``
        tuples sorted by descending score.
        """
        scores = Counter()
        lowered_preds = [(pred.lower(), weight) for pred, weight in weighted_preds]
        for doc in text_dict.values():
            for subdoc in doc:
                for sent in subdoc:
                    # a candidate needs at least two words
                    if len(sent) <= 1:
                        continue
                    sent_text = ' '.join(token[0] for token in sent).lower()
                    # sentence rank = summed weight of the predictors present
                    sent_rank = sum(weight for pred, weight in lowered_preds
                                    if pred in sent_text)
                    if sent_rank <= 0:
                        # contributes nothing, so skip the (costly) chunking
                        continue
                    candidates = [' '.join(token[0] for token in chunk)
                                  for chunk in chunker(sent)]
                    # the first two words of the sentence are always a candidate
                    candidates.append(' '.join([sent[0][0], sent[1][0]]))
                    for candidate in candidates:
                        scores[candidate.lower()] += sent_rank

        # Candidates are lower-cased, so compare them with the known terms
        # case-insensitively; otherwise 'Birth control' would be offered again
        # as 'birth control'.
        already_known = set(term.lower() for term in concept_terms)
        top_terms = []
        for candidate, score in scores.most_common():
            if candidate in already_known or candidate in rejected:
                continue
            top_terms.append((candidate, score))
            if len(top_terms) == max_candidates:
                break
        return top_terms

    def human_checker(term_list, top_terms, term_dict):
        '''This function loops through the possible terms and
        lets human input decide if they actually are or not.

        Returns (term_list, term_dict, exit_switch); exit_switch is True when
        the reviewer typed "exit".'''
        print('Are the following terms part of this list: %r?' % (term_list,))
        # Every candidate is shown, including a lone one (the previous
        # ``len(top_terms) > 1`` guard silently dropped it).
        for candidate, score in top_terms:
            print('Term: \x1b[35m %s \x1b[0m  Count: \x1b[36m %d \x1b[0m'
                  % (candidate, score / float(n_concepts)))
            while True:
                answer = raw_input('Is this similar to %s? (Y, N, exit): ' % (term_list[0]))
                answer = answer.strip().lower()
                if answer == 'y':
                    term_list.append(candidate)
                    break
                elif answer == 'n':
                    term_dict[candidate] = ''
                    break
                elif answer == 'exit':
                    return term_list, term_dict, True
                # any other answer: ask again
        return term_list, term_dict, False

    # Weight every predictor by how specific it is: a predictor used by one
    # concept gets n_concepts, and each further occurrence lowers it by one.
    pred_counts = Counter(pred for preds in pred_list for pred in preds)
    pred_weight_list = [
        [(pred, n_concepts - (pred_counts[pred] - 1)) for pred in preds]
        for preds in pred_list
    ]

    for idx, term in enumerate(term_list):
        top_terms = get_terms(data, term, term_dicts[idx], pred_weight_list[idx])
        print('\n**NEW Concept**\n')
        term_list[idx], term_dicts[idx], exit_switch = human_checker(
            term, top_terms, term_dicts[idx])
        #save list and dict, but only when this concept has terms
        # (the guard used to test pred_list[idx], the wrong list)
        if term_list[idx]:
            with open('data/term_list.pkl', 'wb') as handle:
                pickle.dump(term_list, handle)
            with open('data/not_term_dict.pkl', 'wb') as handle:
                pickle.dump(term_dicts, handle)
        else:
            print('Term list Null')
        #if exit, stop reviewing the remaining concepts
        if exit_switch:
            break
    print('Active Learning Complete')
    return term_list, term_dicts

In [4]:
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# NOTE(review): these paths are relative to 'data/', whereas the criteria text
# is read from '../data/' -- confirm which directory is intended.
#load in past predictor terms
pred_list = _load_pickle('data/predictor_list.pkl')
pred_dicts = _load_pickle('data/not_predictor_dict.pkl')
#load in past concept terms
term_list = _load_pickle('data/term_list.pkl')
term_dicts = _load_pickle('data/not_term_dict.pkl')

Active Learn Predictor Terms


In [ ]:
# Run the interactive predictor review (prompts for Y / N / exit on the
# console) and keep the updated accepted and rejected predictor collections.
pred_list, pred_dicts = active_learn_predictors(data, term_list, pred_list, pred_dicts)

Active Learn Concept Terms


In [ ]:
# Run the interactive concept-term review (prompts for Y / N / exit on the
# console) and keep the updated accepted and rejected term collections.
term_list, term_dicts = active_learn_terms(data, term_list, pred_list, term_dicts)

In [8]:
# Reload the saved concept terms, closing the file once it has been read.
with open('../data/term_list.pkl', 'rb') as term_file:
    term_list = pickle.load(term_file)

Change save names for presentation examples

Display Highlighted Criteria

Display criteria split by Inclusion and Exclusion

Sentences are highlighted with different colors depending on the concept they contain

Tag the trials with two sets of tags, Inclusion and Exclusion


In [17]:
#load trial concept lookup dict
# NOTE(review): criteria_highlight saves this lookup under '../data/', while it
# is read from 'data/' here -- confirm which directory is intended.
with open('data/trial_concept_lookup.pkl', 'rb') as lookup_file:
    trial_concept_lookup = pickle.load(lookup_file)

In [5]:
# Start from an empty trial -> tags lookup.
# NOTE(review): this rebinds the name assigned by the pickle load in the cell
# above, so on a top-to-bottom run the loaded lookup is discarded -- confirm
# which of the two cells is meant to run.
trial_concept_lookup = {}

In [10]:
def criteria_highlight(data, term_list, term_color_lookup, trial_concept_lookup,
                       concept_lookup, save_path='../data/trial_concept_lookup.pkl'):
    """Print colour-highlighted criteria and tag each trial with its concepts.

    For every trial a colour legend and the trial key are printed, followed by
    each group of sentences. A sentence containing a term of a concept
    (case-insensitive substring match) is wrapped in that concept's ANSI
    colour code. When the first sentence of the group contains
    'inclusion criteria' or 'exclusion criteria', the concept name is also
    added to the trial's matching tag set and the tags are printed above the
    group.

    Parameters
    ----------
    data : dict
        Maps a trial key to a list of groups; each group is a list of
        sentences and each sentence a list of tokens whose first element is
        the word.
    term_list : list of list of str
        Concept terms, one list per concept.
    term_color_lookup : list of str
        ANSI colour prefix per concept, in the same order as term_list.
    trial_concept_lookup : dict
        Maps trial key to {'inclusion': set, 'exclusion': set}. Updated in
        place.
    concept_lookup : list of str
        Display name per concept, in the same order as term_list.
    save_path : str or None, optional
        Where the updated lookup is pickled. Pass None to skip saving.

    Returns
    -------
    dict
        The updated ``trial_concept_lookup``.
    """
    for key, value in data.items():
        #print a color key
        print('Color Legend')
        for color, concept in zip(term_color_lookup, concept_lookup):
            print(color + concept + '\x1b[0m \x1b[0m')
        print('')
        print(key)
        tags = trial_concept_lookup.setdefault(key, {'inclusion': set(),
                                                     'exclusion': set()})
        for group in value:
            doc = [' '.join(token[0] for token in sent) for sent in group]
            if not doc:
                # nothing to show; indexing doc[0] below would raise
                continue

            # the first sentence says whether this group lists inclusion or
            # exclusion criteria
            header = doc[0].lower()
            if 'inclusion criteria' in header:
                section = 'inclusion'
            elif 'exclusion criteria' in header:
                section = 'exclusion'
            else:
                section = None

            #check each sentence for concept terms
            for sent_idx, sentence in enumerate(doc):
                # match against the untouched text so escape codes added for
                # an earlier concept can never take part in a match
                lowered = sentence.lower()
                for concept_idx, concept_terms in enumerate(term_list):
                    if not any(term.lower() in lowered for term in concept_terms):
                        continue
                    #tag trial with this concept
                    if section is not None:
                        tags[section].add(concept_lookup[concept_idx])
                    #if the background is being set to black you have to escape twice
                    if concept_idx == 6:
                        closing = '\x1b[0m \x1b[0m'
                    else:
                        closing = '\x1b[0m'
                    # wrap once per concept, however many of its terms matched
                    doc[sent_idx] = term_color_lookup[concept_idx] + doc[sent_idx] + closing

            #check to print inclusion or exclusion tags
            if section is not None:
                if tags[section]:
                    print('Tags:  %s' % (list(tags[section]),))
                else:
                    print('Tags: None')
                print('')

            for sentence in doc:
                print(sentence)
            print('')

    #save lookup dict once every trial has been processed; the return used to
    #sit inside the loop, so only the first trial was ever handled
    if save_path is not None:
        with open(save_path, 'wb') as handle:
            pickle.dump(trial_concept_lookup, handle)
    return trial_concept_lookup

In [98]:
# ANSI colour prefix per concept, in the same order as concept_lookup. The
# last entry sets a black background and white text.
term_color_lookup = ['\x1b[41m', '\x1b[42m', '\x1b[43m', '\x1b[44m', '\x1b[45m', '\x1b[46m',
                     '\x1b[40m \x1b[37m']
concept_lookup = ['Smoking', 'Pregnancy', 'Birth Control', 'Illicit drugs',
                  'Congestive heart failure', 'HIV', 'Allergies']

# Visit the trials in random order and display the first one that has not
# been tagged yet.
shuffled_trials = list(data.items())
shuffle(shuffled_trials)

for trial in shuffled_trials:
    if trial[0] not in trial_concept_lookup:
        trial_concept_lookup = criteria_highlight({trial[0]:trial[1]}, term_list,
                                          term_color_lookup,
                                          trial_concept_lookup, concept_lookup)
        # Stop once one trial has been shown. The break used to sit outside
        # the if, so nothing was displayed whenever the first shuffled trial
        # had already been tagged.
        break


Color Legend
Smoking 
Pregnancy 
Birth Control 
Illicit drugs 
Congestive heart failure 
HIV 
 Allergies 

NCT01532141
Tags:  ['Pregnancy', 'Smoking', 'Illicit drugs', 'HIV', 'Birth Control']

Inclusion Criteria :
Subjects who were able and willing to give written informed consent .
Male or female subjects aged between 18 and 45 years , inclusive .
Subjects of body mass index ( BMI ) between 18 . 0 and 30 . 0 kg m2 , inclusive .
Subjects who were healthy as determined by pre-study medical history , physical examination , vital signs , complete neurological examination and 12-lead ECG .
Subjects who had negative tests for HBsAg , anti-HCVAb and HIV-1 and HIV-2 Ab at screening
Subjects who had clinical laboratory test results clinically acceptable at screening and admission to each treatment period .
Subjects who had a negative screen for alcohol and drugs of abuse at screening and admission to each treatment period .
Subjects who were non-smokers or ex-smokers for at least 3 months .
( If female ) She was not of childbearing potential by reason of surgery or , if of childbearing potential , she used one of the following methods of contraception : double barrier or intrauterine device .
( If female ) She had a negative pregnancy test ( β-HCG ) at screening and admission to each treatment period

Tags:  ['Pregnancy', 'Allergies', 'Illicit drugs', 'Birth Control']

Exclusion Criteria :
Subjects who had a clinically relevant history or presence of respiratory , gastrointestinal , renal , hepatic , haematological , lymphatic , neurological , cardiovascular , psychiatric , musculoskeletal , genitourinary , immunological , dermatological , endocrine , connective tissue diseases or disorders .
Subjects who had a clinically relevant surgical history .
Subjects who had any significant abnormality in the coagulation tests .
Subjects who had any significant abnormality in the liver function tests ( a case-by-case decision for any abnormality was to be discussed with the Sponsor before inclusion ).
 Subjects who had a history of relevant atopy or drug hypersensitivity . 
Subjects who had a history of alcoholism or drug abuse .
Subjects who consumed more than 14 units of alcohol a week .
Subjects who had a significant infection or known inflammatory process at screening or admission to each treatment period .
Subjects who had acute gastrointestinal symptoms ( e . g ., nausea , vomiting , diarrhoea , heartburn ) at the time of screening or admission to each treatment period .
Subjects who had received fluoxetine within 5 weeks of admission to the first period .
Subjects who had used any other medicines within 2 weeks of admission to first period that could affected the safety or other study assessments , in the investigator ' s opinion .
Subjects who had previously received BIA 9-1067 .
Subjects who have used any investigational drug or participated in any clinical trial within 90 days prior to screening .
Subjects who have donated or received any blood or blood products within the 3 months prior to screening .
Subjects who were vegetarians , vegans or have medical dietary restrictions .
Subjects who could not communicated reliably with the investigator .
Subjects who were unlikely to co-operate with the requirements of the study .
Subjects who were unwilling or unable to give written informed consent .
( If female ) She was pregnant or breast-feeding .
( If female ) She was of childbearing potential and she did not use an approved effective contraceptive method ( double-barrier , intra-uterine device ) or she uses oral contraceptives .

Notes

Notes: If a sentence contains two concepts, the concept that comes first in the concept list determines the highlight color. Both concepts are still added to the tags.

Example: Tags - Birth Control and Pregnancy:

Positive pregnancy test in women of child bearing potential or who are unwilling to use an acceptable method of contraception .

Problems:

Negations - sentences such as "not pregnant", or ones saying a subject was pregnant but no longer is, still match the concept terms.

Final Concept Terms and Predictor Terms


In [392]:
term_list


Out[392]:
[['Non-smoker',
  'smoker',
  'Current smoker',
  'smoking',
  'tobacco',
  'nicotine',
  'cigarettes',
  u'tobacco products'],
 ['Pregnancy',
  u'negative pregnancy test',
  u'pregnancy',
  u'urine pregnancy test',
  u'negative serum pregnancy test',
  u'negative serum',
  u'negative urine pregnancy test',
  u'pregnant women',
  u'pregnant'],
 ['Birth control',
  'contraception',
  u'birth control',
  u'fertile patients',
  u'effective contraception',
  u'child-bearing potential',
  u'abstinence',
  u'adequate contraception',
  u'condom',
  u'iud',
  u'intrauterine device',
  u'diaphragm',
  u'oral contraceptives'],
 ['Illicit drugs',
  'Alcohol abuse',
  'illegal',
  'illicit',
  'drug abuse',
  u'alcohol',
  u'substance abuse',
  u'alcohol abuse'],
 ['Congestive Heart Failure',
  'heart failure',
  u'myocardial infarction',
  u'congestive heart failure',
  u'symptomatic congestive heart failure',
  u'cardiovascular disease',
  u'heart disease',
  u'cardiac disease'],
 ['HIV',
  'aids',
  'human immunodeficiency virus',
  u'hiv',
  u'human immunodeficiency',
  u'known hiv'],
 ['Allergies',
  'allergy',
  'hypersensitivity',
  u'known hypersensitivity',
  u'known allergy']]

In [15]:
pred_list


Out[15]:
[['current',
  u'history of',
  u'use of',
  u'tobacco use',
  u'patients who',
  u'smoking of',
  u'products in',
  u'user of',
  u'products within',
  u'smokers with',
  u'subjects who',
  u'products with',
  u'nicotine containing',
  u'smoker of',
  u'forms of'],
 ['potential',
  'negative',
  u'women of',
  u'have a',
  u'pregnancy or',
  u'females of',
  u'test at',
  u'history of',
  u'female patients',
  u'if female',
  u'females with',
  u'planning a',
  u'test for',
  u'women with',
  u'female subjects',
  u'child bearing',
  u'woman of'],
 ['effective',
  'Fertile patients',
  'must use effective',
  'must use',
  'use effective',
  'Fertile patients must use',
  'fertile',
  u'women of',
  u'method of',
  u'methods of',
  u'form of',
  u'use an',
  u'females of',
  u'patients of',
  u'use a',
  u'dose of',
  u'use of',
  u'forms of',
  u'child bearing',
  u'female patients',
  u'female subjects',
  u'fertile patients',
  u'using an',
  u'administration of'],
 ['use',
  'abuse',
  u'history of',
  u'patient is',
  u'abuse within',
  u'evidence of',
  u'abuse in',
  u'ounces of',
  u'treatment for',
  u'use of',
  u'dose of',
  u'drugs within',
  u'administration of',
  u'drug within',
  u'consumption of',
  u'intake of',
  u'drugs of',
  u'abuse of',
  u'subjects who',
  u'drugs known',
  u'presence of',
  u'drinks per',
  u'dependence on',
  u'test for',
  u'drugs with',
  u'drugs that',
  u'current or'],
 [u'history of',
  u'patients with',
  u'infarction within',
  u'evidence of',
  u'uncontrolled intercurrent',
  u'illness including',
  u'disease including',
  u'risk of',
  u'clinically significant',
  u'patient has',
  u'subjects with',
  u'symptomatic congestive',
  u'presence of',
  u'cardiovascular disease',
  u'diagnosis of',
  u'subject has',
  u'symptoms of',
  u'cardiac disease',
  u'uncontrolled congestive',
  u'has symptomatic',
  u'heart disease',
  u'severe cardiovascular'],
 [u'history of',
  u'subjects with',
  u'infection with',
  u'patients with',
  u'test for',
  u'diagnosis of',
  u'known hiv',
  u'any confirmed',
  u'patient has',
  u'known human',
  u'presence of',
  u'positive test',
  u'hiv positive',
  u'co-infection with',
  u'infection including',
  u'known infection',
  u'positive for',
  u'known diagnosis',
  u'known positive',
  u'known history',
  u'subjects who',
  u'report having'],
 ['known',
  'history',
  'suspected',
  'known suspected',
  'clinically significant',
  u'history of',
  u'patients with',
  u'known allergy',
  u'known hypersensitivity',
  u'subjects with',
  u'hypersensitivity to',
  u'allergy or',
  u'participant has',
  u'a known',
  u'allergy that',
  u'have known',
  u'intolerance of',
  u'children with',
  u'known severe',
  u'evidence of']]

In [ ]: