In [91]:
from __future__ import division

import nltk
import codecs
import string
import random
import re
import csv
from collections import Counter
from copy import deepcopy
import cPickle as pickle
import pandas as pd
This notebook was used to develop the programs that had to be run on harbinger to process the full corpus of criteria text. The corpus was too large to process on my own machine (the POS tagging alone took around 11 hours to complete on harbinger).
In [2]:
trial_criteria = {}
predefined_criteria = {}
for row in codecs.open('data/clinical_study.txt', 'r', 'utf-8').readlines():
    data = row.split('|')
    # field 26 = eligibility criteria text
    trial_criteria[data[0]] = data[26]
    # 27 = gender, 28 = min age, 29 = max age, 30 = healthy volunteers
    predefined_criteria[data[0]] = (data[27], data[28], data[29], data[30])
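For reference, the parsing above assumes each row of clinical_study.txt is a |-delimited record whose first field is the trial ID, whose field at index 26 holds the eligibility criteria text, and whose fields 27-30 hold the predefined criteria. A made-up row for illustration:
In [ ]:
# Hypothetical row showing the assumed |-delimited layout (all values are made up)
fake_row = u'NCT00000001|' + u'|'.join([u'...'] * 25) + u'|Inclusion Criteria<br />Age over 18|Both|18 Years|65 Years|No'
fake_data = fake_row.split('|')
fake_data[0], fake_data[26], fake_data[27:31]
# (u'NCT00000001', u'Inclusion Criteria<br />Age over 18', [u'Both', u'18 Years', u'65 Years', u'No'])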
In [38]:
def stanford_list(trial_criteria):
    '''Create a flat list of all sentences for the Stanford tagger to run on,
    along with a lookup dict mapping each sentence's index in that list back
    to the trial it came from.'''
    stanford_sentence_list = []
    sentence_lookup_dict = {}
    for trial, criteria in trial_criteria.items():
        stan_len = len(stanford_sentence_list)
        sentences = criteria.split('<br />')
        stanford_sentence_list += sentences
        for ix in xrange(stan_len, stan_len + len(sentences)):
            sentence_lookup_dict[ix] = trial
    return stanford_sentence_list, sentence_lookup_dict
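As a quick sanity check, stanford_list can be run on a toy dictionary (the trial IDs and sentences are made up, and the exact ordering depends on dict iteration order):
In [ ]:
# Hypothetical two-trial example of the flat sentence list and the lookup dict
toy_criteria = {'NCT00000001': u'Inclusion Criteria<br />Age over 18',
                'NCT00000002': u'Exclusion Criteria<br />Pregnancy'}
toy_sents, toy_lookup = stanford_list(toy_criteria)
toy_sents   # e.g. [u'Inclusion Criteria', u'Age over 18', u'Exclusion Criteria', u'Pregnancy']
toy_lookup  # e.g. {0: 'NCT00000001', 1: 'NCT00000001', 2: 'NCT00000002', 3: 'NCT00000002'}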
In [44]:
stanford_sentence_list, sentence_lookup_dict = stanford_list(trial_criteria)
In [84]:
def clean_bullet_points(stanford_sentence_list):
    '''Remove bullet point prefixes (e.g. "- ", "1. ", "2) ") from the start of sentences'''
    for ix, sent in enumerate(stanford_sentence_list):
        # the dot is escaped so a real first letter followed by a space is not stripped
        m = re.search(r'^-?[0-9]?\.?\)?\s', sent)
        if m:
            stanford_sentence_list[ix] = sent[len(m.group(0)):]
    return stanford_sentence_list
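A small illustration of the prefixes the pattern strips (made-up sentences):
In [ ]:
# Hypothetical inputs showing the bullet prefixes the regex removes
clean_bullet_points([u'- Age over 18', u'1. No prior chemotherapy', u'2) Able to give consent'])
# expected: [u'Age over 18', u'No prior chemotherapy', u'Able to give consent']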
In [86]:
stanford_sentence_list = clean_bullet_points(stanford_sentence_list)
In [92]:
df = pd.DataFrame(stanford_sentence_list)
In [96]:
df.to_csv('data/stanford_sentence_list.csv', header=False,
          index=False, encoding='utf-8', sep="\t")
In [ ]:
pickle.dump(sentence_lookup_dict, open('data/sentence_lookup_dict.pkl', 'wb'))
In [46]:
def split_doc_on_exclusion(doc):
    '''Split a criteria document into its Inclusion and Exclusion sections'''
    split_doc = doc.split()
    for idx, word in enumerate(split_doc):
        if word.lower() == u'exclusion' and u'criteria' in split_doc[idx + 1].lower():
            inclusive = ' '.join(split_doc[:idx])
            exclusive = ' '.join(split_doc[idx:])
            return inclusive, exclusive
    if split_doc[0].lower() == u'inclusion' and u'criteria' in split_doc[1].lower():
        return doc, None

no_inc_exc_dict = {}
trial_criteria_split = {}
for key, doc in trial_criteria.items():
    try:
        inclusive, exclusive = split_doc_on_exclusion(doc)
        if exclusive is None:
            trial_criteria_split[key] = [inclusive]
        else:
            trial_criteria_split[key] = [inclusive, exclusive]
    # documents with no inclusion/exclusion structure end up here: the function
    # falls through and returns None, so unpacking it raises TypeError
    except (TypeError, IndexError):
        no_inc_exc_dict[key] = doc
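A made-up document illustrating the split:
In [ ]:
# Hypothetical criteria text showing where the document gets split
split_doc_on_exclusion(u'Inclusion Criteria: age over 18 Exclusion Criteria: pregnancy')
# expected: (u'Inclusion Criteria: age over 18', u'Exclusion Criteria: pregnancy')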
In [47]:
len(trial_criteria_split)
Out[47]:
In [48]:
# free up some RAM
del trial_criteria
In [40]:
len(no_inc_exc_dict)
Out[40]:
In [29]:
len(trial_criteria)
Out[29]:
In [11]:
#save to pickle
pickle.dump(predefined_criteria, open('data/predefined_criteria.pkl', 'wb'))
pickle.dump(trial_criteria, open('data/trial_criteria.pkl', 'wb'))
pickle.dump(trial_criteria_split, open('data/trial_criteria_split.pkl', 'wb'))
pickle.dump(no_inc_exc_dict, open('data/no_inc_exc_dict.pkl', 'wb'))
In [11]:
trial_criteria = pickle.load(open('data/trial_criteria.pkl', 'rb'))
In [68]:
trial_criteria_split = pickle.load(open('data/trial_criteria_split.pkl', 'rb'))
In [6]:
smoker_list = ['Non-smoker', 'smoker', 'Current smoker', 'smoking', 'tobacco', 'nicotine',
               'cigarettes']
pregnancy_list = ['Pregnancy']
birth_control_list = ['Birth control', 'contraception']
drug_list = ['Illicit drugs', 'Alcohol abuse', 'illegal', 'illicit', 'drug abuse']
heart_failure_list = ['Congestive Heart Failure', 'heart failure']
hiv_list = ['HIV', 'aids', 'human immunodeficiency virus']
allergy_list = ['Allergies', 'allergy', 'hypersensitivity']
In [5]:
smoker_pred_list = ['current']
pregnancy_pred_list = ['potential', 'negative']
birth_control_pred_list = ['effective', 'Fertile patients', 'must use effective',
                           'must use', 'use effective', 'Fertile patients must use',
                           'fertile']
drug_pred_list = ['use', 'abuse']
heart_failure_pred_list = []
hiv_pred_list = []
allergy_pred_list = ['known', 'history', 'suspected', 'known suspected',
                     'clinically significant']
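These keyword lists are for flagging criteria sentences by category. A minimal sketch of one way they could be used (the simple substring matching here is an assumption, not necessarily the approach used downstream):
In [ ]:
# Hypothetical usage sketch: return the sentences that mention any keyword in a category.
# The case-insensitive substring match is an assumption for illustration only.
def flag_sentences(sentences, keyword_list):
    keywords = [k.lower() for k in keyword_list]
    return [s for s in sentences if any(k in s.lower() for k in keywords)]

flag_sentences([u'Current smoker or tobacco use within 6 months',
                u'Age over 18'], smoker_list)
# expected: [u'Current smoker or tobacco use within 6 months']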
In [70]:
# small three-document slice for developing the pipeline locally
trial_criteria_split_test = deepcopy(dict(trial_criteria_split.items()[:3]))
In [71]:
def process_text(text_dict):
    # break sentences on ' - ' in each subdoc of each document
    for key, doc in text_dict.items():
        for n in xrange(len(doc)):
            text_dict[key][n] = re.split(' - ', text_dict[key][n])

    # load the sentence tokenizer
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def sent_token(text):
        '''Run the sentence tokenizer over every sentence group in a document'''
        sentence_groups = []
        for sent_group in text:
            group_holder = []
            for sent in sent_group:
                group_holder += sent_tokenizer.tokenize(sent)
            sentence_groups.append(group_holder)
        return sentence_groups

    # run the sentence tokenizer over each doc in the dict
    for key, doc in text_dict.items():
        text_dict[key] = sent_token(doc)

    # CREATING TOKENS
    # pattern for tokenizing (non-capturing groups so the tokenizer returns whole matches)
    pattern = r'''(?x)              # set flag to allow verbose regexps
          (?:[A-Z]\.)+              # abbreviations, e.g. U.S.A.
        | \w+(?:[-']\w+)*           # words with optional internal hyphens/apostrophes
        | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                    # ellipsis
        | [][.,;"'?():\-_`]+        # these are separate tokens
        '''
    for key, doc in text_dict.items():
        for n in xrange(len(doc)):
            text_dict[key][n] = [nltk.regexp_tokenize(sent, pattern) for sent
                                 in doc[n]]
    return text_dict
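To see what the tokenizer pattern produces, it can be checked on a made-up sentence; token_pattern below is simply a copy of the pattern defined inside process_text:
In [ ]:
# Quick check of the tokenizer pattern on a hypothetical criteria sentence
token_pattern = r'''(?x)
      (?:[A-Z]\.)+            # abbreviations
    | \w+(?:[-']\w+)*         # words with optional internal hyphens/apostrophes
    | \$?\d+(?:\.\d+)?%?      # currency and percentages
    | \.\.\.                  # ellipsis
    | [][.,;"'?():\-_`]+      # punctuation as separate tokens
    '''
nltk.regexp_tokenize(u'Non-smokers aged 18 to 65 may enroll.', token_pattern)
# indicative output: [u'Non-smokers', u'aged', u'18', u'to', u'65', u'may', u'enroll', u'.']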
In [72]:
result = process_text(trial_criteria_split_test)
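On the three-document test slice, result maps each trial ID to a list of sub-documents (inclusion and, if present, exclusion), each of which is a list of token lists. No output is shown here because it depends on the actual data:
In [ ]:
# structure check on the test slice (output omitted; it depends on the real data)
example_key = result.keys()[0]
len(result[example_key])      # number of sub-documents: 1 or 2
result[example_key][0][:1]    # first tokenized sentence of the first sub-document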
In [ ]:
pickle.dump(result, open('data/trial_criteria_split_token.pkl', 'wb'))
In [ ]:
trial_criteria_split_token = pickle.load(open('data/trial_criteria_split_token.pkl', 'rb'))
In [73]:
def pos_tag(text_dict):
    '''POS tag the tokenized criteria text, preserving the document structure'''
    def doc_tagger_pos(text):
        result = []
        for doc in text:
            doc_text = []
            for sent in doc:
                doc_text.append(nltk.pos_tag(sent))
            result.append(doc_text)
        return result
    for key, doc in text_dict.items():
        text_dict[key] = doc_tagger_pos(doc)
    return text_dict

result_pos = pos_tag(result)
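For reference, nltk.pos_tag turns a tokenized sentence into (token, tag) pairs; the tags shown below are indicative only:
In [ ]:
# Indicative example of the (token, POS tag) pairs produced by nltk.pos_tag
nltk.pos_tag([u'Patients', u'must', u'be', u'over', u'18', u'years', u'old'])
# e.g. [(u'Patients', 'NNS'), (u'must', 'MD'), (u'be', 'VB'), (u'over', 'IN'),
#       (u'18', 'CD'), (u'years', 'NNS'), (u'old', 'JJ')]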
In [77]:
# save the test corpus (pos_tag mutates its argument in place, so result is already tagged)
pickle.dump(result, open('data/test_tagged_data.pkl', 'wb'))
In [ ]:
# save the full tagged corpus
pickle.dump(criteria_text_sent_tag,
            open('data/criteria_corpus_pos_tagged.pkl', 'wb'))
In [ ]:
# load the tagged corpus
criteria_text_sent_tag = pickle.load(open('data/criteria_corpus_pos_tagged.pkl', 'rb'))