In [1]:
import os, codecs, cPickle as pickle, json
from collections import Counter
In [2]:
# sentence-to-NCTID lookup: indexed further down by the running sentence
# counter to find the trial ID each sentence belongs to.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
with open('../data/sentence_lookup_dict.pkl', 'rb') as lookup_file:
    # the with-block closes the handle, which the bare open() call never did
    sent_lookup = pickle.load(lookup_file)
In [3]:
# sentence text: collect the raw criteria sentences of every trial into one
# flat list (sent_raw), which is later indexed by the running sentence counter.
trial_criteria = {}
with codecs.open('../data/clinical_study.txt', 'r', 'utf-8') as study_file:
    # readlines() is kept deliberately so rows are split exactly as before;
    # the with-block only adds the close() that was previously missing.
    for row in study_file.readlines():
        data = row.strip().split('|')
        # field 0 is used as the trial key; field 26 is split into sentences
        # on the literal '<br />' separator. A repeated key overwrites the
        # earlier entry.
        trial_criteria[data[0]] = data[26].split('<br />')

# Flatten in dict iteration order (values() walks the dict in the same order
# as keys()).
# NOTE(review): this order has to match the one used to build sent_lookup and
# the tagged pickles, since all three are addressed by the same index --
# confirm against the script that produced them.
sent_raw = [s for sentences in trial_criteria.values() for s in sentences]
del trial_criteria  # free the per-trial dict; only the flat list is needed
In [6]:
# Write two tab-separated output files, one row per sentence:
#   criteria_text.txt   -> index, trial ID, sentence text, flag, display order
#   criteria_tagged.txt -> index, JSON dump of the tagged sentence
# The flag is 'H' for a header line, otherwise the current section:
# 'I' (inclusion), 'E' (exclusion) or 'U' (no section header seen yet).
# The with-block closes both files even if the loop raises part-way through.
with codecs.open('../data/criteria_text.txt', 'w', 'utf-8') as crit_text, \
        codecs.open('../data/criteria_tagged.txt', 'w', 'utf-8') as tag_text:
    # initialize trial-level variables
    cur_trial = ''
    inc_exc = 'U'
    disp_order = 1
    total_order = 0

    for i in range(55):
        # NOTE(review): chunk index 46 (file ..._47.pkl) is skipped and
        # total_order is not advanced for it; presumably that chunk is
        # missing or empty -- confirm, otherwise later sentences are
        # misaligned with sent_lookup / sent_raw.
        if i == 46:
            continue

        tagged_path = '../data/stanford_tagged/stanford_tagged_criteria_%d.pkl' % (i + 1)
        with open(tagged_path, 'rb') as tagged_file:
            all_tagged = pickle.load(tagged_file)

        # NOTE(review): indexed access is kept on purpose; the container type
        # of all_tagged is not visible here (it may be an int-keyed dict).
        for j in range(len(all_tagged)):
            # reset variables for line-specific info
            header = 0

            # if new trial ID, reset variables
            nct_id = sent_lookup[total_order]
            if nct_id != cur_trial:
                cur_trial = nct_id
                inc_exc = 'U'
                disp_order = 1

            # determine header information
            cur_sent = sent_raw[total_order]
            lowered = cur_sent.lower()  # lowercase once for all three checks
            if 'inclusion criteria' in lowered:
                inc_exc = 'I'
                header = 1
            elif 'exclusion criteria' in lowered:
                inc_exc = 'E'
                header = 1
            # Sentences containing 'may apply' or longer than 50 characters
            # are not flagged as headers; the section switch above still
            # stands for them.
            if 'may apply' in lowered or len(cur_sent) > 50:
                header = 0

            crit_text.write('%d\t%s\t%s\t%s\t%d\n' % (
                total_order,
                cur_trial,
                cur_sent.replace('\t', ' '),  # keep the row tab-delimited
                'H' if header else inc_exc,
                disp_order))
            tag_text.write('%d\t%s\n' % (
                total_order,
                json.dumps(all_tagged[j], separators=(',', ':'))))

            # increment
            disp_order += 1
            total_order += 1
In [ ]: