In [1]:
import cPickle as pickle,pandas as pd,nltk,codecs,json

In [2]:
# Lookup of MeSH term replacements: each key is a term as it may appear in the
# mesh_term column of condition_browse.txt, each value is the spelling that is
# stored instead. Terms with no entry here are kept unchanged.
# NOTE(review): values are presumably the current preferred MeSH headings -- verify.
corrections = {"Sarcoma, Ewing's": 'Sarcoma, Ewing',
               'Beta-Thalassemia': 'beta-Thalassemia',
               'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
               'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
               'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
               # Double-quoted so the apostrophe is part of the key. The earlier
               # 'Felty''s Syndrome' was two adjacent literals that Python joined
               # into 'Feltys Syndrome', which could never match the real term.
               "Felty's Syndrome": 'Felty Syndrome',
               'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
               'Retrognathism': 'Retrognathia',
               'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
               'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
               'Von Willebrand Diseases': 'von Willebrand Diseases',
               'Pontine Glioma': 'Brain Stem Neoplasms',
               'Mental Retardation': 'Intellectual Disability',
               'Overdose': 'Drug Overdose',
               'Beta-Mannosidosis': 'beta-Mannosidosis',
               'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
               'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
               'Alpha-Thalassemia': 'alpha-Thalassemia',
               'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
               'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
               'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
               'Alpha-Mannosidosis': 'alpha-Mannosidosis',
               'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'
               }

# Build a mapping nct_id -> list of MeSH terms from condition_browse.txt.
# Each non-blank row is expected to hold exactly three '|'-separated fields
# (row_id, nct_id, mesh_term); one list entry is appended per row, so repeated
# terms for the same nct_id are kept. Terms that have an entry in `corrections`
# are replaced by the corrected spelling.
conditions = {}
# `with` guarantees the file handle is closed once all rows are consumed;
# iterating the handle avoids loading the whole file via readlines().
with codecs.open('../data/condition_browse.txt', 'r', 'utf-8') as condition_file:
    for row in condition_file:
        row = row.strip()
        if not row:
            # Skip blank lines (e.g. a trailing newline) rather than failing
            # on the three-way unpack below.
            continue
        row_id, nct_id, mesh_term = row.split('|')
        conditions.setdefault(nct_id, []).append(corrections.get(mesh_term, mesh_term))

In [9]:
# Build a mapping int(row_id) -> list of distinct MeSH terms from investigators.txt.
# Each non-blank row is expected to hold exactly six '|'-separated fields
# (row_id, fac_id, nct_id, name, role, aff). Rows whose nct_id has no entry in
# `conditions` are ignored. The order of terms within each list is not
# meaningful because the list is derived from a set.
investigators = {}
# `with` guarantees the file handle is closed once all rows are consumed.
with codecs.open('../data/investigators.txt', 'r', 'utf-8') as investigator_file:
    for row in investigator_file:
        row = row.strip()
        if not row:
            # Skip blank lines rather than failing on the six-way unpack below.
            continue
        row_id, fac_id, nct_id, name, role, aff = row.split('|')
        if nct_id not in conditions:
            continue
        investigator_key = int(row_id)  # parse once instead of on every lookup
        # Merge with any terms already recorded under this key, dropping duplicates.
        merged_terms = set(investigators.get(investigator_key, []))
        merged_terms.update(conditions[nct_id])
        investigators[investigator_key] = list(merged_terms)

In [10]:
# Serialize the investigators mapping to disk with pickle's default protocol.
# The `with` block closes (and therefore flushes) the output file explicitly
# instead of relying on garbage collection of an anonymous file object.
with open('../data/investigator_mesh.pkl', 'wb') as pickle_file:
    pickle.dump(investigators, pickle_file)

In [ ]: