Load necessary packages and extensions


In [1]:
%load_ext sql
import pandas as pd
import json
from __future__ import division

Get data from database


In [2]:
# sql connection parameters come from connect.py
have_connect = !ls connect.py 2>/dev/null
if len(have_connect) == 0:
    !mv ../connect.py .
from connect import *

get_ipython().magic('sql mysql://' + mysqlusername + ':' + mysqlpassword + '@' + mysqlserver + ':3306/' + mysqldbname)
mesh_thesaurus = %sql select mesh_seqid, mesh_id, mesh_term from mesh_thesaurus
condition_browse = %sql select nct_id, mesh_term from condition_browse 
interventions = %sql select distinct nct_id, intervention_type from interventions
sponsors = %sql select nct_id, agency_class, any_industry from sponsors_mod
locations = %sql select distinct nct_id, continent from location_countries join country_continent using (country)
clinical_study = %sql select nct_id, overall_status, left(phase, 7) phase, enrollment, \
                        case when start_date != '' then year(start_date_dt) \
                             when completion_date != '' then year(completion_date_dt) \
                             else year(verification_date_dt) end study_year \
                      from clinical_study \
                      where study_type = 'Interventional'


54935 rows affected.
309589 rows affected.
153419 rows affected.
152611 rows affected.
158525 rows affected.
123600 rows affected.

Transform data into JSON objects for use by JavaScript code


In [3]:
mesh_lookup = {}
mesh_ids = {}
for mesh_seqid, mesh_id, mesh_term in mesh_thesaurus:
    mesh_lookup[mesh_seqid] = {'id': mesh_id, 'term': mesh_term}
    if mesh_term not in mesh_ids: mesh_ids[mesh_term] = set()
    mesh_ids[mesh_term].add(mesh_seqid)

In [4]:
study_conditions = {}
distinct_conditions = set()
for nct_id, mesh_term in condition_browse:
    if nct_id not in study_conditions: study_conditions[nct_id] = set()
    for t in mesh_ids[mesh_term]:
        study_conditions[nct_id].add(t)
        if t not in distinct_conditions: distinct_conditions.add(t)

In [5]:
study_interventions = {}
distinct_interventions = set()
for nct_id, intervention_type in interventions:
    if nct_id not in study_interventions: study_interventions[nct_id] = set()
    if intervention_type not in distinct_interventions: distinct_interventions.add(intervention_type)
    study_interventions[nct_id].add(intervention_type)

intervention_lookup = {n+1: i for n, i in enumerate(list(distinct_interventions) + ['Unknown'])}
intervention_lookup_r = dict([(v, k) for k, v in intervention_lookup.items()])

In [6]:
study_sponsors = {nct_id: {'class': agency_class, 'ind': any_industry} for nct_id, agency_class, any_industry in sponsors}
distinct_sponsors = {agency_class for nct_id, agency_class, any_industry in sponsors}
sponsor_lookup = {n+1: i for n, i in enumerate(list(distinct_sponsors))}
sponsor_lookup_r = dict([(v, k) for k, v in sponsor_lookup.items()])

In [7]:
study_locations = {}
distinct_locations = set()
for nct_id, continent in locations:
    if nct_id not in study_locations: study_locations[nct_id] = set()
    if continent not in distinct_locations: distinct_locations.add(continent)
    study_locations[nct_id].add(continent)

location_lookup = {n+1: i for n, i in enumerate(list(distinct_locations) + ['Unknown'])}
location_lookup_r = dict([(v, k) for k, v in location_lookup.items()])

In [8]:
distinct_status = {overall_status for nct_id, overall_status, phase, enrollment, study_year in clinical_study}
distinct_phase = {phase for nct_id, overall_status, phase, enrollment, study_year in clinical_study}

status_lookup = {n+1: i for n, i in enumerate(list(distinct_status))}
status_lookup_r = dict([(v, k) for k, v in status_lookup.items()])
phase_lookup = {n+1: i for n, i in enumerate(list(distinct_phase))}
phase_lookup_r = dict([(v, k) for k, v in phase_lookup.items()])

In [9]:
study_data = []
for nct_id, overall_status, phase, enrollment, study_year in clinical_study:
    curdata = {
        'yr': study_year,
        'ph': phase_lookup_r[phase],
        'st': status_lookup_r[overall_status],
        'sp': sponsor_lookup_r[study_sponsors[nct_id]['class']],
        'in': 1 if study_sponsors[nct_id]['ind'] == 'Y' else 0
        }
    curdata['en'] = enrollment
    if nct_id in study_conditions:
        curdata['co'] = list(study_conditions[nct_id])
    else:
        curdata['co'] = [99999]
    if nct_id in study_interventions: 
        curdata['iv'] = [intervention_lookup_r[i] for i in study_interventions[nct_id]]
    else:
        curdata['iv'] = [intervention_lookup_r['Unknown']]
    if nct_id in study_locations:
        curdata['lo'] = [location_lookup_r[l] for l in study_locations[nct_id]]
    else:
        curdata['lo'] = [location_lookup_r['Unknown']]
    study_data.append(curdata)

Create MeSH dictionary only for relevant conditions (and those above them in the hierarchy)


In [10]:
mesh_adds = {
    'A': 'Anatomy',
    'B': 'Organisms',
    'C': 'Diseases',
    'D': 'Chemicals and Drugs',
    'E': 'Analytical, Diagnostic and Therapeutic Techniques and Equipment',
    'F': 'Psychiatry and Psychology',
    'G': 'Phenomena and Processes',
    'H': 'Disciplines and Occupations',
    'I': 'Anthropology, Education, Sociology and Social Phenomena',
    'J': 'Technology, Industry, Agriculture',
    'K': 'Humanities',
    'L': 'Information Science',
    'M': 'Named Groups',
    'N': 'Health Care',
    'V': 'Publication Characteristics',
    'Z': 'Geographicals'
}
maxkey = max(mesh_lookup.keys())
for i, v in enumerate(mesh_adds.items()):
    mesh_lookup[maxkey + i + 1] = {'id': v[0], 'term': v[1]}

mesh_lookup[99999] = {'id': 'U00', 'term': 'Unassigned condition'}

mesh_lookup_r = {mesh_lookup[m]['id']: m for m in mesh_lookup.keys()}

cond_ids = {mesh_lookup[c]['id'] for c in distinct_conditions}
for c in cond_ids:
    for i in [1,3,7,11,15,19,23,27,31,35,39,43,47]:
        if len(c) >= i and mesh_lookup_r[c[:i]] not in distinct_conditions:
            distinct_conditions.add(mesh_lookup_r[c[:i]])

distinct_conditions.add(99999)

mesh_lookup_succinct = {m: mesh_lookup[m] for m in mesh_lookup if m in distinct_conditions}

In [11]:
all_data = {
    'studies': study_data,
    'mesh': mesh_lookup_succinct,
    'interventions': intervention_lookup,
    'locations': location_lookup,
    'phase': phase_lookup,
    'sponsors': sponsor_lookup,
    'status': status_lookup
}

fout = open('all_data.json', 'w')
json.dump(all_data, fout, separators=(',',':'))
fout.close()

In [ ]: