In [1]:
%load_ext sql
import pandas as pd
import json
from __future__ import division
In [2]:
# sql connection parameters come from connect.py
have_connect = !ls connect.py 2>/dev/null
if len(have_connect) == 0:
!mv ../connect.py .
from connect import *
get_ipython().magic('sql mysql://' + mysqlusername + ':' + mysqlpassword + '@' + mysqlserver + ':3306/' + mysqldbname)
mesh_thesaurus = %sql select mesh_seqid, mesh_id, mesh_term from mesh_thesaurus
condition_browse = %sql select nct_id, mesh_term from condition_browse
interventions = %sql select distinct nct_id, intervention_type from interventions
sponsors = %sql select nct_id, agency_class, any_industry from sponsors_mod
locations = %sql select distinct nct_id, continent from location_countries join country_continent using (country)
clinical_study = %sql select nct_id, overall_status, left(phase, 7) phase, enrollment, \
case when start_date != '' then year(start_date_dt) \
when completion_date != '' then year(completion_date_dt) \
else year(verification_date_dt) end study_year \
from clinical_study \
where study_type = 'Interventional'
In [3]:
mesh_lookup = {}
mesh_ids = {}
for mesh_seqid, mesh_id, mesh_term in mesh_thesaurus:
mesh_lookup[mesh_seqid] = {'id': mesh_id, 'term': mesh_term}
if mesh_term not in mesh_ids: mesh_ids[mesh_term] = set()
mesh_ids[mesh_term].add(mesh_seqid)
In [4]:
study_conditions = {}
distinct_conditions = set()
for nct_id, mesh_term in condition_browse:
if nct_id not in study_conditions: study_conditions[nct_id] = set()
for t in mesh_ids[mesh_term]:
study_conditions[nct_id].add(t)
if t not in distinct_conditions: distinct_conditions.add(t)
In [5]:
study_interventions = {}
distinct_interventions = set()
for nct_id, intervention_type in interventions:
if nct_id not in study_interventions: study_interventions[nct_id] = set()
if intervention_type not in distinct_interventions: distinct_interventions.add(intervention_type)
study_interventions[nct_id].add(intervention_type)
intervention_lookup = {n+1: i for n, i in enumerate(list(distinct_interventions) + ['Unknown'])}
intervention_lookup_r = dict([(v, k) for k, v in intervention_lookup.items()])
In [6]:
study_sponsors = {nct_id: {'class': agency_class, 'ind': any_industry} for nct_id, agency_class, any_industry in sponsors}
distinct_sponsors = {agency_class for nct_id, agency_class, any_industry in sponsors}
sponsor_lookup = {n+1: i for n, i in enumerate(list(distinct_sponsors))}
sponsor_lookup_r = dict([(v, k) for k, v in sponsor_lookup.items()])
In [7]:
study_locations = {}
distinct_locations = set()
for nct_id, continent in locations:
if nct_id not in study_locations: study_locations[nct_id] = set()
if continent not in distinct_locations: distinct_locations.add(continent)
study_locations[nct_id].add(continent)
location_lookup = {n+1: i for n, i in enumerate(list(distinct_locations) + ['Unknown'])}
location_lookup_r = dict([(v, k) for k, v in location_lookup.items()])
In [8]:
distinct_status = {overall_status for nct_id, overall_status, phase, enrollment, study_year in clinical_study}
distinct_phase = {phase for nct_id, overall_status, phase, enrollment, study_year in clinical_study}
status_lookup = {n+1: i for n, i in enumerate(list(distinct_status))}
status_lookup_r = dict([(v, k) for k, v in status_lookup.items()])
phase_lookup = {n+1: i for n, i in enumerate(list(distinct_phase))}
phase_lookup_r = dict([(v, k) for k, v in phase_lookup.items()])
In [9]:
study_data = []
for nct_id, overall_status, phase, enrollment, study_year in clinical_study:
curdata = {
'yr': study_year,
'ph': phase_lookup_r[phase],
'st': status_lookup_r[overall_status],
'sp': sponsor_lookup_r[study_sponsors[nct_id]['class']],
'in': 1 if study_sponsors[nct_id]['ind'] == 'Y' else 0
}
curdata['en'] = enrollment
if nct_id in study_conditions:
curdata['co'] = list(study_conditions[nct_id])
else:
curdata['co'] = [99999]
if nct_id in study_interventions:
curdata['iv'] = [intervention_lookup_r[i] for i in study_interventions[nct_id]]
else:
curdata['iv'] = [intervention_lookup_r['Unknown']]
if nct_id in study_locations:
curdata['lo'] = [location_lookup_r[l] for l in study_locations[nct_id]]
else:
curdata['lo'] = [location_lookup_r['Unknown']]
study_data.append(curdata)
Create MeSH dictionary only for relevant conditions (and those above them in the hierarchy)
In [10]:
mesh_adds = {
'A': 'Anatomy',
'B': 'Organisms',
'C': 'Diseases',
'D': 'Chemicals and Drugs',
'E': 'Analytical, Diagnostic and Therapeutic Techniques and Equipment',
'F': 'Psychiatry and Psychology',
'G': 'Phenomena and Processes',
'H': 'Disciplines and Occupations',
'I': 'Anthropology, Education, Sociology and Social Phenomena',
'J': 'Technology, Industry, Agriculture',
'K': 'Humanities',
'L': 'Information Science',
'M': 'Named Groups',
'N': 'Health Care',
'V': 'Publication Characteristics',
'Z': 'Geographicals'
}
maxkey = max(mesh_lookup.keys())
for i, v in enumerate(mesh_adds.items()):
mesh_lookup[maxkey + i + 1] = {'id': v[0], 'term': v[1]}
mesh_lookup[99999] = {'id': 'U00', 'term': 'Unassigned condition'}
mesh_lookup_r = {mesh_lookup[m]['id']: m for m in mesh_lookup.keys()}
cond_ids = {mesh_lookup[c]['id'] for c in distinct_conditions}
for c in cond_ids:
for i in [1,3,7,11,15,19,23,27,31,35,39,43,47]:
if len(c) >= i and mesh_lookup_r[c[:i]] not in distinct_conditions:
distinct_conditions.add(mesh_lookup_r[c[:i]])
distinct_conditions.add(99999)
mesh_lookup_succinct = {m: mesh_lookup[m] for m in mesh_lookup if m in distinct_conditions}
In [11]:
all_data = {
'studies': study_data,
'mesh': mesh_lookup_succinct,
'interventions': intervention_lookup,
'locations': location_lookup,
'phase': phase_lookup,
'sponsors': sponsor_lookup,
'status': status_lookup
}
fout = open('all_data.json', 'w')
json.dump(all_data, fout, separators=(',',':'))
fout.close()
In [ ]: