In [1]:
import os, codecs, cPickle as pickle, json
from collections import Counter

Load lookup data


In [2]:
# sentence-to-NCTID lookup
# Indexed later by the global sentence position to get the owning trial's ID.
# The file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read instead of lingering until garbage collection.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with locally generated, trusted data.
with open('../data/sentence_lookup_dict.pkl', 'rb') as lookup_file:
    sent_lookup = pickle.load(lookup_file)

In [3]:
# sentence text
# Build {trial key: [criteria sentences]} from the pipe-delimited study dump,
# then flatten it into a single list of sentences.
trial_criteria = {}
with codecs.open('../data/clinical_study.txt','r','utf-8') as study_file:
    # `with` guarantees the input handle is closed once all rows are read
    for row in study_file.readlines():
        data = row.strip().split('|')
        # field 0 keys the trial; field 26 holds its criteria text, with
        # individual sentences separated by '<br />'
        # NOTE(review): a row with fewer than 27 fields raises IndexError here
        trial_criteria[data[0]] = data[26].split('<br />')

# NOTE(review): the flattened order follows dict iteration order, and the
# processing cell indexes sent_raw by position alongside sent_lookup --
# presumably sent_lookup was generated with the same ordering; confirm before
# changing the container or the interpreter version.
sent_raw = [s for t in trial_criteria.keys() for s in trial_criteria[t]]
del trial_criteria

Process text and write output


In [6]:
# Walk every tagged-criteria pickle in order and emit two tab-separated files:
#   criteria_text.txt   -> global index, trial ID, raw sentence,
#                          label ('H' for header rows, else I/E/U), position within trial
#   criteria_tagged.txt -> global index, compact JSON of the tagged sentence

# initialize trial-level variables
cur_trial = ''
inc_exc = 'U'      # section label carried forward; 'U' until a header line is seen
disp_order = 1     # 1-based position of the sentence within its trial
total_order = 0    # 0-based position across all chunks; indexes sent_lookup and sent_raw

# output files
# Both are opened in a single `with` so they are flushed and closed even if a
# pickle is missing or a lookup fails part-way through the run.
with codecs.open('../data/criteria_text.txt','w','utf-8') as crit_text, \
     codecs.open('../data/criteria_tagged.txt','w','utf-8') as tag_text:

    # NOTE(review): 55 is the hard-coded number of tagged chunk files (named 1..55)
    for i in range(55):
        # NOTE(review): chunk 47 (i == 46) is deliberately skipped by the
        # original code; the reason is not recorded here -- confirm
        if i != 46:
            tagged_path = '../data/stanford_tagged/stanford_tagged_criteria_%d.pkl' % (i+1)
            # close each chunk's handle as soon as it has been unpickled
            with open(tagged_path, 'rb') as tagged_file:
                all_tagged = pickle.load(tagged_file)

            # index-based access is kept because the container type of
            # all_tagged is not visible here
            for j in range(len(all_tagged)):

                # reset variables for line-specific info
                header = 0

                # if new trial ID, reset variables
                nct_id = sent_lookup[total_order]
                if nct_id != cur_trial:
                    cur_trial = nct_id
                    inc_exc = 'U'
                    disp_order = 1

                # determine header information
                cur_sent = sent_raw[total_order]
                cur_lower = cur_sent.lower()   # lowercase once, reused by every check below
                if 'inclusion criteria' in cur_lower:
                    inc_exc = 'I'
                    header = 1
                elif 'exclusion criteria' in cur_lower:
                    inc_exc = 'E'
                    header = 1
                # Lines containing 'may apply' or longer than 50 characters are
                # not written as header rows; any inc_exc switch made above
                # still takes effect for this and following lines.
                if 'may apply' in cur_lower or len(cur_sent) > 50:
                    header = 0

                # tabs in the sentence are replaced so they cannot break the TSV columns
                crit_text.write('%d\t%s\t%s\t%s\t%d\n' % (total_order,
                                                          cur_trial,
                                                          cur_sent.replace('\t',' '),
                                                          'H' if header else inc_exc,
                                                          disp_order))
                tag_text.write('%d\t%s\n' % (total_order, json.dumps(all_tagged[j], separators=(',',':'))))

                # increment
                disp_order += 1
                total_order += 1

In [ ]: