In [1]:
import cPickle as pickle,pandas as pd,nltk,codecs,json
In [2]:
# Replacement table for MeSH terms read from condition_browse.txt:
# key = term as it appears in the file, value = term stored in its place.
corrections = {
    "Sarcoma, Ewing's": 'Sarcoma, Ewing',
    'Beta-Thalassemia': 'beta-Thalassemia',
    'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',
    'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',
    'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',
    # Double-quoted so the apostrophe is part of the key. Written as
    # 'Felty''s Syndrome', Python sees two adjacent literals and concatenates
    # them into 'Feltys Syndrome', which never matches the real term.
    "Felty's Syndrome": 'Felty Syndrome',
    'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',
    'Retrognathism': 'Retrognathia',
    'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',
    'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',
    'Von Willebrand Diseases': 'von Willebrand Diseases',
    'Pontine Glioma': 'Brain Stem Neoplasms',
    'Mental Retardation': 'Intellectual Disability',
    'Overdose': 'Drug Overdose',
    'Beta-Mannosidosis': 'beta-Mannosidosis',
    'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',
    'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',
    'Alpha-Thalassemia': 'alpha-Thalassemia',
    'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',
    'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',
    'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',
    'Alpha-Mannosidosis': 'alpha-Mannosidosis',
    'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms',
}
# Build `conditions`: nct_id -> list of MeSH terms for that trial, in file
# order, with each term passed through the `corrections` table first.
conditions = {}
# `with` guarantees the handle is closed even if a malformed row raises.
with codecs.open('../data/condition_browse.txt', 'r', 'utf-8') as condition_file:
    # Iterate the handle directly instead of readlines() so the whole file
    # is not held in memory at once.
    for row in condition_file:
        # Rows are expected to hold exactly three pipe-separated fields;
        # any other field count raises ValueError on unpacking.
        row_id, nct_id, mesh_term = row.strip().split('|')
        # Terms without a correction entry are stored unchanged.
        conditions.setdefault(nct_id, []).append(corrections.get(mesh_term, mesh_term))
In [9]:
# Build `investigators`: int(row_id) -> de-duplicated list of MeSH terms taken
# from `conditions` for the trial (nct_id) named on that investigator row.
# Rows whose nct_id has no entry in `conditions` are skipped.
# NOTE(review): the key is the row id, not the investigator name or facility.
# If row ids are unique per line, the merge with an existing entry never
# happens. Confirm this is the intended key.
investigators = {}
# `with` guarantees the handle is closed even if a malformed row raises.
with codecs.open('../data/investigators.txt', 'r', 'utf-8') as investigator_file:
    for row in investigator_file:
        # Rows are expected to hold exactly six pipe-separated fields;
        # any other field count raises ValueError on unpacking.
        row_id, fac_id, nct_id, name, role, aff = row.strip().split('|')
        if nct_id not in conditions:
            continue
        investigator_key = int(row_id)
        # Union of any terms already recorded for this key with the trial's
        # terms. Going through set() drops duplicates; the resulting list
        # order is whatever set iteration yields.
        investigators[investigator_key] = list(
            set(investigators.get(investigator_key, []) + conditions[nct_id])
        )
In [10]:
# Persist the investigator -> MeSH-term mapping. The `with` block makes sure
# the pickle file is flushed and closed as soon as the dump finishes, rather
# than whenever the anonymous file object happens to be garbage-collected.
with open('../data/investigator_mesh.pkl', 'wb') as pickle_file:
    pickle.dump(investigators, pickle_file)
In [ ]: