In [91]:
from __future__ import division

import nltk
import codecs
import string
import random
import re
import csv
from collections import Counter
from copy import deepcopy
import cPickle as pickle
import pandas as pd

This notebook was used to develop the programs that had to be run on harbinger to process the full corpus of text. The corpus was too large to process on my own machine (the POS tagging alone took around 11 hours to complete on harbinger).

Load Data


In [2]:
trial_criteria = {}
predefined_criteria = {}
for row in codecs.open('data/clinical_study.txt','r','utf-8').readlines():
    data = row.split('|')
    trial_criteria[data[0]] = data[26]
    # fields: 27 = gender, 28 = min age, 29 = max age, 30 = healthy volunteers
    predefined_criteria[data[0]] = (data[27], data[28], data[29], data[30])
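
For a quick sanity check, one record can be inspected by its key (the first pipe-delimited field, which the code above uses as the trial ID). This is just a sketch, not part of the processing pipeline:


In [ ]:
#sketch: peek at one loaded trial and its predefined fields
sample_id = trial_criteria.keys()[0]
print(sample_id)
print(trial_criteria[sample_id][:200])    #first 200 characters of the criteria text
print(predefined_criteria[sample_id])     #(gender, min age, max age, healthy volunteers)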

Prepare text for Stanford Tagger

Create list and lookup dict


In [38]:
def stanford_list(trial_criteria):
    '''Create a flat list of all sentences for the Stanford tagger to run on,
    along with a lookup dict mapping each sentence's index in that list back
    to the trial it came from.'''
    stanford_sentence_list = []
    sentence_lookup_dict = {}
    for trial, criteria in trial_criteria.items():
        stan_len = len(stanford_sentence_list)
        new_list_len = len(criteria.split('<br />'))
        stanford_sentence_list += criteria.split('<br />')
        for ix in xrange(stan_len, stan_len + new_list_len, 1):
            sentence_lookup_dict[ix] = trial
    return stanford_sentence_list, sentence_lookup_dict

In [44]:
stanford_sentence_list, sentence_lookup_dict = stanford_list(trial_criteria)
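
The expected structure can be sketched on a made-up two-trial dict (toy data, not from the corpus):


In [ ]:
#sketch: sentences are flattened into one list, and the lookup dict maps each
#sentence's index back to the trial it came from
toy = {'T1': u'Inclusion Criteria<br />age over 18',
       'T2': u'Exclusion Criteria<br />pregnancy'}
toy_sents, toy_lookup = stanford_list(toy)
print(toy_sents)     #e.g. [u'Inclusion Criteria', u'age over 18', u'Exclusion Criteria', u'pregnancy'] (trial order may vary)
print(toy_lookup)    #e.g. {0: 'T1', 1: 'T1', 2: 'T2', 3: 'T2'}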

Remove bullet point symbols


In [84]:
def clean_bullet_points(stanford_sentence_list):
    '''Remove bullet point symbols from text'''
    for ix, sent in enumerate(stanford_sentence_list):
        m = re.search(r'^-?[0-9]?\.?\)?\s', sent)
        if m:
            stanford_sentence_list[ix] = sent[len(m.group(0)):]
    return stanford_sentence_list

In [86]:
stanford_sentence_list = clean_bullet_points(stanford_sentence_list)
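
A quick check on typical bullet prefixes (toy sentences, not from the corpus):


In [ ]:
#sketch: leading bullet markers like '- ', '1. ', '2) ' are stripped; plain sentences pass through
print(clean_bullet_points(['- age over 18', '1. no prior chemotherapy',
                           '2) able to consent', 'Karnofsky performance status over 70']))
#['age over 18', 'no prior chemotherapy', 'able to consent', 'Karnofsky performance status over 70']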

Save Stanford-prepped text and lookup dict


In [92]:
df = pd.DataFrame(stanford_sentence_list)

In [96]:
df.to_csv('data/stanford_sentence_list.csv', header=False,
          index=False, encoding='utf-8', sep="\t")

In [ ]:
pickle.dump(sentence_lookup_dict, open('data/sentence_lookup_dict.pkl', 'wb'))
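
Once the Stanford tagger has run on harbinger, the lookup dict is what maps its output back to trials. A hedged sketch of that regrouping (the output filename here is hypothetical, and it assumes the tagger emits exactly one output line per input sentence, in order):


In [ ]:
#sketch only: 'data/stanford_tagged_output.txt' is a hypothetical path for the
#tagger's output; assumes one tagged line per input sentence, in input order
lookup = pickle.load(open('data/sentence_lookup_dict.pkl', 'rb'))
tagged_by_trial = {}
for ix, line in enumerate(codecs.open('data/stanford_tagged_output.txt', 'r', 'utf-8')):
    tagged_by_trial.setdefault(lookup[ix], []).append(line.strip())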

Split Inclusive and Exclusive


In [46]:
def split_doc_on_exclusion(doc):
    '''Split a criteria document into its Inclusion and Exclusion sections.
    Returns (inclusive, exclusive); exclusive is None when only an Inclusion
    section is present, and the return is None (implicitly) when neither
    header is found.'''
    split_doc = doc.split()
    for idx, word in enumerate(split_doc):
        if word.lower() == u'exclusion' and u'criteria' in split_doc[idx + 1].lower():
            inclusive = ' '.join(split_doc[:idx])
            exclusive = ' '.join(split_doc[idx:])
            return inclusive, exclusive
    if split_doc[0].lower() == u'inclusion' and u'criteria' in split_doc[1].lower():
        return doc, None

no_inc_exc_dict = {}
trial_criteria_split = {}

for key, doc in trial_criteria.items():
    try:
        inclusive, exclusive = split_doc_on_exclusion(doc)
        if exclusive is None:
            trial_criteria_split[key] = [inclusive]
        else:
            trial_criteria_split[key] = [inclusive, exclusive]
            
    #documents with no inclusion/exclusion headers end up here
    except:
        no_inc_exc_dict[key] = doc
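
A toy example of the split (made-up criteria text):


In [ ]:
#sketch: the document is split at the word 'Exclusion' when it is followed by 'Criteria'
toy_doc = u'Inclusion Criteria: age over 18. Exclusion Criteria: pregnancy.'
inc, exc = split_doc_on_exclusion(toy_doc)
print(inc)    #Inclusion Criteria: age over 18.
print(exc)    #Exclusion Criteria: pregnancy.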

In [47]:
len(trial_criteria_split)


Out[47]:
152877

In [48]:
#save RAM
del trial_criteria

~10,000 trials did not have an inclusion/exclusion split


In [40]:
len(no_inc_exc_dict)


Out[40]:
10887

In [29]:
len(trial_criteria)


Out[29]:
163764

Save Files


In [11]:
#save to pickle
pickle.dump(predefined_criteria, open('data/predefined_criteria.pkl', 'wb'))
pickle.dump(trial_criteria, open('data/trial_criteria.pkl', 'wb'))
pickle.dump(trial_criteria_split, open('data/trial_criteria_split.pkl', 'wb'))
pickle.dump(no_inc_exc_dict, open('data/no_inc_exc_dict.pkl', 'wb'))

Load Data From Pickle


In [11]:
trial_criteria = pickle.load(open('data/trial_criteria.pkl', 'rb'))

In [68]:
trial_criteria_split = pickle.load(open('data/trial_criteria_split.pkl', 'rb'))

Initial Concept Term Lists


In [6]:
smoker_list = ['Non-smoker', 'smoker', 'Current smoker', 'smoking', 'tobacco', 'nicotine',
               'cigarettes']
pregnancy_list = ['Pregnancy']
birth_control_list = ['Birth control', 'contraception']
drug_list = ['Illicit drugs', 'Alcohol abuse', 'illegal', 'illicit', 'drug abuse']
heart_failure_list = ['Congestive Heart Failure', 'heart failure']
hiv_list = ['HIV', 'aids', 'human immunodeficiency virus']
allergy_list = ['Allergies', 'allergy', 'hypersensitivity']

Initial Predictive Terms


In [5]:
smoker_pred_list = ['current']
pregnancy_pred_list = ['potential', 'negative']
birth_control_pred_list = ['effective', 'Fertile patients', 'must use effective',
                           'must use', 'use effective', 'Fertile patients must use',
                           'fertile']
drug_pred_list = ['use', 'abuse']
heart_failure_pred_list = []
hiv_pred_list = []
allergy_pred_list = ['known', 'history', 'suspected', 'known suspected',
                     'clinically significant']

In [70]:
trial_criteria_split_test = deepcopy(dict(trial_criteria_split.items()[:3]))

Process Text


In [71]:
def process_text(text_dict):
    #break sentences on '-' for each subdoc of each document
    for key, doc in text_dict.items():
        for n in xrange(len(doc)):
            text_dict[key][n] = re.split(' - ', text_dict[key][n])


    #get sentence tokenizer
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    #run the sentence tokenizer over all the documents
    def sent_token(text):
        sentence_groups = []
        for sent_group in text:
            group_holder = []
            for sent in sent_group:
                group_holder += (sent_tokenizer.tokenize(sent))
            sentence_groups.append(group_holder)
            del group_holder
        return sentence_groups

    #run sentence tokenizer over each doc in the dict
    for key, doc in text_dict.items():
        text_dict[key] = sent_token(doc)



    #CREATING TOKENS

    #pattern for tokenizing
    pattern = r'''(?x)    # set flag to allow verbose regexps
            ([A-Z]\.)+        # abbreviations, e.g. U.S.A
            | \w+([-‘]\w+)*        # words with optional internal hyphens
            | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
            | \.\.\.            # ellipsis...   
            | [][.,;"'?():\-_`]+  # these are separate tokens
            '''



    for key, doc in text_dict.items():
        for n in xrange(len(doc)):
            text_dict[key][n] = [nltk.regexp_tokenize(sent, pattern) for sent
                                             in doc[n]]
    return text_dict

In [72]:
result = process_text(trial_criteria_split_test)
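
The resulting structure is nested trial -> section (inclusion/exclusion) -> sentence -> tokens; one way to peek at it (key chosen arbitrarily):


In [ ]:
#sketch: inspect the nested structure of one processed trial
k = result.keys()[0]
print(len(result[k]))        #number of sections (1 or 2)
print(result[k][0][:2])      #first two tokenized sentences of the first (inclusion) section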

Save Tokenized Text


In [ ]:
pickle.dump(result, open('data/trial_criteria_split_token.pkl', 'wb'))

Load Tokenized Text


In [ ]:
trial_criteria_split_token = pickle.load(open('data/trial_criteria_split_token.pkl', 'rb'))

POS Tag Tokens


In [73]:
def pos_tag(text_dict):
    #POS tag every tokenized sentence, preserving the nested structure
    def doc_tagger_pos(text):
        result = []
        for doc in text:
            doc_text = []
            for sent in doc:
                doc_text.append(nltk.pos_tag(sent))
            result.append(doc_text)
        return result
    
    for key, doc in text_dict.items():
        text_dict[key] = doc_tagger_pos(doc)
        
    return text_dict
    
result_pos = pos_tag(result)
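
Each sentence now holds (token, Penn Treebank tag) pairs, with the same nesting as the tokenized text (trial -> section -> sentence):


In [ ]:
#sketch: first tagged sentence of an arbitrary trial in the test corpus
k = result_pos.keys()[0]
print(result_pos[k][0][0])    #e.g. [(u'Inclusion', 'NNP'), (u'Criteria', 'NNP'), ...]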

In [77]:
#save the tagged test corpus (pos_tag modifies result in place, so result == result_pos)
pickle.dump(result,open('data/test_tagged_data.pkl', 'wb'))

Save Tagged Text


In [ ]:
#save tagged corpus
pickle.dump(criteria_text_sent_tag,
            open('data/criteria_corpus_pos_tagged.pkl', 'wb'))

In [ ]:
#load tagged corpus
criteria_text_sent_tag = pickle.load(open('data/criteria_corpus_pos_tagged.pkl', 'rb'))