In [91]:
from __future__ import division

import nltk
import codecs
import string
import random
import re
import csv
from collections import Counter
from copy import deepcopy
import cPickle as pickle
import pandas as pd

This notebook was used to develop the programs that had to be run on harbinger to process the full corpus of text. The corpus was too large to process on my own machine (the POS tagging alone took around 11 hours to complete on harbinger).

Load Data


In [2]:
trial_criteria = {}
predefined_criteria = {}
for row in codecs.open('data/clinical_study.txt','r','utf-8').readlines():
    data = row.split('|')
    trial_criteria[data[0]] = data[26]
    # fields: 27 = gender, 28 = min age, 29 = max age, 30 = healthy volunteers
    predefined_criteria[data[0]] = (data[27], data[28], data[29], data[30])
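
For a quick sanity check, one record can be inspected by its key (the first pipe-delimited field, which the code above uses as the trial ID). This is just a sketch, not part of the processing pipeline:


In [ ]:
#sketch: peek at one loaded trial and its predefined fields
sample_id = trial_criteria.keys()[0]
print(sample_id)
print(trial_criteria[sample_id][:200])    #first 200 characters of the criteria text
print(predefined_criteria[sample_id])     #(gender, min age, max age, healthy volunteers)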

Prepare text for Stanford Tagger

Create list and lookup dict


In [38]:
def stanford_list(trial_criteria):
    '''Create a flat list of all sentences for the Stanford tagger to run on,
    along with a lookup dict mapping each sentence's index in that list back
    to the trial it came from.'''
    stanford_sentence_list = []
    sentence_lookup_dict = {}
    for trial, criteria in trial_criteria.items():
        stan_len = len(stanford_sentence_list)
        new_list_len = len(criteria.split('<br />'))
        stanford_sentence_list += criteria.split('<br />')
        for ix in xrange(stan_len, stan_len + new_list_len, 1):
            sentence_lookup_dict[ix] = trial
    return stanford_sentence_list, sentence_lookup_dict

In [44]:
stanford_sentence_list, sentence_lookup_dict = stanford_list(trial_criteria)
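
The expected structure can be sketched on a made-up two-trial dict (toy data, not from the corpus):


In [ ]:
#sketch: sentences are flattened into one list, and the lookup dict maps each
#sentence's index back to the trial it came from
toy = {'T1': u'Inclusion Criteria<br />age over 18',
       'T2': u'Exclusion Criteria<br />pregnancy'}
toy_sents, toy_lookup = stanford_list(toy)
print(toy_sents)     #e.g. [u'Inclusion Criteria', u'age over 18', u'Exclusion Criteria', u'pregnancy'] (trial order may vary)
print(toy_lookup)    #e.g. {0: 'T1', 1: 'T1', 2: 'T2', 3: 'T2'}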

Remove bullet point symbols


In [84]:
def clean_bullet_points(stanford_sentence_list):
    '''Remove bullet point symbols from text'''
    for ix, sent in enumerate(stanford_sentence_list):
        m = re.search(r'^-?[0-9]?\.?\)?\s', sent)
        if m:
            stanford_sentence_list[ix] = sent[len(m.group(0)):]
    return stanford_sentence_list

In [86]:
stanford_sentence_list = clean_bullet_points(stanford_sentence_list)
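
A quick check on typical bullet prefixes (toy sentences, not from the corpus):


In [ ]:
#sketch: leading bullet markers like '- ', '1. ', '2) ' are stripped; plain sentences pass through
print(clean_bullet_points(['- age over 18', '1. no prior chemotherapy',
                           '2) able to consent', 'Karnofsky performance status over 70']))
#['age over 18', 'no prior chemotherapy', 'able to consent', 'Karnofsky performance status over 70']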

Save Stanford-prepped text and lookup dict


In [92]:
df = pd.DataFrame(stanford_sentence_list)

In [96]:
df.to_csv('data/stanford_sentence_list.csv', header=False,
          index=False, encoding='utf-8', sep="\t")

In [ ]:
pickle.dump(sentence_lookup_dict, open('data/sentence_lookup_dict.pkl', 'wb'))
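
Once the Stanford tagger has run on harbinger, the lookup dict is what maps its output back to trials. A hedged sketch of that regrouping (the output filename here is hypothetical, and it assumes the tagger emits exactly one output line per input sentence, in order):


In [ ]:
#sketch only: 'data/stanford_tagged_output.txt' is a hypothetical path for the
#tagger's output; assumes one tagged line per input sentence, in input order
lookup = pickle.load(open('data/sentence_lookup_dict.pkl', 'rb'))
tagged_by_trial = {}
for ix, line in enumerate(codecs.open('data/stanford_tagged_output.txt', 'r', 'utf-8')):
    tagged_by_trial.setdefault(lookup[ix], []).append(line.strip())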

Split Inclusive and Exclusive


In [46]:
def split_doc_on_exclusion(doc):
    '''Split a criteria document into its Inclusion and Exclusion sections.
    Returns (inclusive, exclusive); exclusive is None when only an Inclusion
    section is present, and the return is None (implicitly) when neither
    header is found.'''
    split_doc = doc.split()
    for idx, word in enumerate(split_doc):
        if word.lower() == u'exclusion' and u'criteria' in split_doc[idx + 1].lower():
            inclusive = ' '.join(split_doc[:idx])
            exclusive = ' '.join(split_doc[idx:])
            return inclusive, exclusive
    if split_doc[0].lower() == u'inclusion' and u'criteria' in split_doc[1].lower():
        return doc, None

no_inc_exc_dict = {}
trial_criteria_split = {}

for key, doc in trial_criteria.items():
    try:
        inclusive, exclusive = split_doc_on_exclusion(doc)
        if exclusive is None:
            trial_criteria_split[key] = [inclusive]
        else:
            trial_criteria_split[key] = [inclusive, exclusive]
            
    #documents with no inclusion/exclusion headers end up here
    except:
        no_inc_exc_dict[key] = doc
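
A toy example of the split (made-up criteria text):


In [ ]:
#sketch: the document is split at the word 'Exclusion' when it is followed by 'Criteria'
toy_doc = u'Inclusion Criteria: age over 18. Exclusion Criteria: pregnancy.'
inc, exc = split_doc_on_exclusion(toy_doc)
print(inc)    #Inclusion Criteria: age over 18.
print(exc)    #Exclusion Criteria: pregnancy.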

In [47]:
len(trial_criteria_split)


Out[47]:
152877

In [48]:
#save RAM
del trial_criteria

~10,000 trials did not have an inclusion/exclusion split


In [40]:
len(no_inc_exc_dict)


Out[40]:
10887

In [29]:
len(trial_criteria)


Out[29]:
163764

Save Files


In [11]:
#save to pickle
pickle.dump(predefined_criteria, open('data/predefined_criteria.pkl', 'wb'))
pickle.dump(trial_criteria, open('data/trial_criteria.pkl', 'wb'))
pickle.dump(trial_criteria_split, open('data/trial_criteria_split.pkl', 'wb'))
pickle.dump(no_inc_exc_dict, open('data/no_inc_exc_dict.pkl', 'wb'))

Load Data From Pickle


In [11]:
trial_criteria = pickle.load(open('data/trial_criteria.pkl', 'rb'))

In [68]:
trial_criteria_split = pickle.load(open('data/trial_criteria_split.pkl', 'rb'))

Initial Concept Term Lists


In [6]:
smoker_list = ['Non-smoker', 'smoker', 'Current smoker', 'smoking', 'tobacco', 'nicotine',
               'cigarettes']
pregnancy_list = ['Pregnancy']
birth_control_list = ['Birth control', 'contraception']
drug_list = ['Illicit drugs', 'Alcohol abuse', 'illegal', 'illicit', 'drug abuse']
heart_failure_list = ['Congestive Heart Failure', 'heart failure']
hiv_list = ['HIV', 'aids', 'human immunodeficiency virus']
allergy_list = ['Allergies', 'allergy', 'hypersensitivity']

Initial Predictive Terms


In [5]:
smoker_pred_list = ['current']
pregnancy_pred_list = ['potential', 'negative']
birth_control_pred_list = ['effective', 'Fertile patients', 'must use effective',
                           'must use', 'use effective', 'Fertile patients must use',
                           'fertile']
drug_pred_list = ['use', 'abuse']
heart_failure_pred_list = []
hiv_pred_list = []
allergy_pred_list = ['known', 'history', 'suspected', 'known suspected',
                     'clinically significant']

In [70]:
trial_criteria_split_test = deepcopy(dict(trial_criteria_split.items()[:3]))

Process Text


In [71]:
def process_text(text_dict):
    #break sentences on '-' for each subdoc of each document
    for key, doc in text_dict.items():
        for n in xrange(len(doc)):
            text_dict[key][n] = re.split(' - ', text_dict[key][n])


    #get sentence tokenizer
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    #run the sentence tokenizer over all the documents
    def sent_token(text):
        sentence_groups = []
        for sent_group in text:
            group_holder = []
            for sent in sent_group:
                group_holder += (sent_tokenizer.tokenize(sent))
            sentence_groups.append(group_holder)
            del group_holder
        return sentence_groups

    #run sentence tokenizer over each doc in the dict
    for key, doc in text_dict.items():
        text_dict[key] = sent_token(doc)



    #CREATING TOKENS

    #pattern for tokenizing
    pattern = r'''(?x)    # set flag to allow verbose regexps
            ([A-Z]\.)+        # abbreviations, e.g. U.S.A
            | \w+([-‘]\w+)*        # words with optional internal hyphens
            | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
            | \.\.\.            # ellipsis...   
            | [][.,;"'?():\-_`]+  # these are separate tokens
            '''



    for key, doc in text_dict.items():
        for n in xrange(len(doc)):
            text_dict[key][n] = [nltk.regexp_tokenize(sent, pattern) for sent
                                             in doc[n]]
    return text_dict

In [72]:
result = process_text(trial_criteria_split_test)
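
The resulting structure is nested trial -> section (inclusion/exclusion) -> sentence -> tokens; one way to peek at it (key chosen arbitrarily):


In [ ]:
#sketch: inspect the nested structure of one processed trial
k = result.keys()[0]
print(len(result[k]))        #number of sections (1 or 2)
print(result[k][0][:2])      #first two tokenized sentences of the first (inclusion) section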

Save Tokenized Text


In [ ]:
pickle.dump(result, open('data/trial_criteria_split_token.pkl', 'wb'))

Load Tokenized Text


In [ ]:
trial_criteria_split_token = pickle.load(open('data/trial_criteria_split_token.pkl', 'rb'))

POS Tag Tokens


In [73]:
def pos_tag(text_dict):
    #POS tag every tokenized sentence, preserving the nested structure
    def doc_tagger_pos(text):
        result = []
        for doc in text:
            doc_text = []
            for sent in doc:
                doc_text.append(nltk.pos_tag(sent))
            result.append(doc_text)
        return result
    
    for key, doc in text_dict.items():
        text_dict[key] = doc_tagger_pos(doc)
        
    return text_dict
    
result_pos = pos_tag(result)
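
Each sentence now holds (token, Penn Treebank tag) pairs, with the same nesting as the tokenized text (trial -> section -> sentence):


In [ ]:
#sketch: first tagged sentence of an arbitrary trial in the test corpus
k = result_pos.keys()[0]
print(result_pos[k][0][0])    #e.g. [(u'Inclusion', 'NNP'), (u'Criteria', 'NNP'), ...]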

In [77]:
#save the tagged test corpus (pos_tag modifies result in place, so result == result_pos)
pickle.dump(result,open('data/test_tagged_data.pkl', 'wb'))

Save Tagged Text


In [ ]:
#save tagged corpus
pickle.dump(criteria_text_sent_tag,
            open('data/criteria_corpus_pos_tagged.pkl', 'wb'))

In [ ]:
#load tagged corpus
criteria_text_sent_tag = pickle.load(open('data/criteria_corpus_pos_tagged.pkl', 'rb'))