Loading necessary packages


In [1]:
from pandas.io.sql import read_frame
import pandas as pd
import numpy as np
import MySQLdb
import matplotlib.pyplot as plt

Getting data


In [2]:
# does dataset.pickle exist?
have_pickle = !ls dataset.pickle 2>/dev/null
if len(have_pickle) > 0:
    dataset = pd.read_pickle('dataset.pickle')
else:
    # sql connection parameters come from connect.py
    have_connect = !ls connect.py 2>/dev/null
    if len(have_connect) == 0:
        !mv ../../connect.py .
    from connect import *

    mysql_cn= MySQLdb.connect(host=mysqlserver, 
                    port=3306,user=mysqlusername, passwd=mysqlpassword, 
                    db=mysqldbname)
    dataset = read_frame("select * from datamining_table", mysql_cn, 'nct_id')
    dataset.to_pickle('dataset.pickle')

Getting summary stats


In [3]:
dataset.describe().transpose()


Out[3]:
count mean std min 25% 50% 75% max
target 85062 0.867356 0.339192 0.0000 1.00 1.00 1.00 1.00
study_length_yrs 74269 2.460825 2.531695 0.0000 1.00 2.00 3.00 89.00
phase_0 85062 0.005008 0.070591 0.0000 0.00 0.00 0.00 1.00
phase_1 85062 0.178505 0.382940 0.0000 0.00 0.00 0.00 1.00
phase_2 85062 0.246843 0.431177 0.0000 0.00 0.00 0.00 1.00
phase_3 85062 0.177000 0.381671 0.0000 0.00 0.00 0.00 1.00
phase_4 85062 0.115704 0.319871 0.0000 0.00 0.00 0.00 1.00
type_interventional 85062 0.843761 0.363084 0.0000 1.00 1.00 1.00 1.00
type_observational 85062 0.155745 0.362616 0.0000 0.00 0.00 0.00 1.00
number_of_arms 85062 1.367602 1.423462 0.0000 0.00 1.00 2.00 19.00
number_of_groups 85062 0.171969 0.598828 0.0000 0.00 0.00 0.00 60.00
gender_female 85062 0.091968 0.288983 0.0000 0.00 0.00 0.00 1.00
gender_male 85062 0.060450 0.238320 0.0000 0.00 0.00 0.00 1.00
minimum_age_days 77213 7201.892366 3839.647617 30.4375 6574.50 6574.50 6574.50 31046.25
maximum_age_days 42158 20259.067966 8779.457596 1.0000 16436.25 23741.25 27393.75 34698.75
healthy_volunteers 85062 0.240178 0.427194 0.0000 0.00 0.00 0.00 1.00
is_fda_regulated 85062 0.335685 0.472232 0.0000 0.00 0.00 1.00 1.00
total_officials 72885 1.297578 1.551588 1.0000 1.00 1.00 1.00 97.00
sponsor_lead_industry 85062 0.404728 0.490842 0.0000 0.00 0.00 1.00 1.00
sponsor_lead_govt 85062 0.098963 0.298614 0.0000 0.00 0.00 0.00 1.00
sponsor_any_industry 85062 0.478040 0.499520 0.0000 0.00 0.00 1.00 1.00
total_sponsors 85062 1.445934 1.011338 1.0000 1.00 1.00 2.00 84.00
total_facilities 74492 9.615274 40.328682 1.0000 1.00 1.00 4.00 3511.00
facilities_countries 74492 1.643183 2.725659 1.0000 1.00 1.00 1.00 55.00
facilities_states 74492 3.395989 6.905619 1.0000 1.00 1.00 1.00 179.00
design_intervention_crossover 85062 0.093367 0.290948 0.0000 0.00 0.00 0.00 1.00
design_intervention_parallel 85062 0.432637 0.495444 0.0000 0.00 0.00 1.00 1.00
design_intervention_single 85062 0.236933 0.425203 0.0000 0.00 0.00 0.00 1.00
design_masking_doubleblind 85062 0.295396 0.456223 0.0000 0.00 0.00 1.00 1.00
design_masking_openlabel 85062 0.419776 0.493525 0.0000 0.00 0.00 1.00 1.00
design_masking_singleblind 85062 0.078743 0.269338 0.0000 0.00 0.00 0.00 1.00
design_purpose_science 85062 0.030390 0.171658 0.0000 0.00 0.00 0.00 1.00
design_purpose_diagnostic 85062 0.031730 0.175281 0.0000 0.00 0.00 0.00 1.00
design_purpose_prevention 85062 0.089029 0.284788 0.0000 0.00 0.00 0.00 1.00
design_purpose_treatment 85062 0.606922 0.488437 0.0000 0.00 1.00 1.00 1.00
design_observation_cohort 85062 0.056324 0.230547 0.0000 0.00 0.00 0.00 1.00
design_allocation_nonrandom 85062 0.139322 0.346284 0.0000 0.00 0.00 0.00 1.00
safety_issue 85062 0.294773 0.455943 0.0000 0.00 0.00 1.00 1.00
intervention_drug 85062 0.578413 0.493816 0.0000 0.00 1.00 1.00 1.00
intervention_procedure 85062 0.088970 0.284703 0.0000 0.00 0.00 0.00 1.00
intervention_behavior 85062 0.066728 0.249551 0.0000 0.00 0.00 0.00 1.00
intervention_device 85062 0.072935 0.260031 0.0000 0.00 0.00 0.00 1.00
intervention_biological 85062 0.067445 0.250792 0.0000 0.00 0.00 0.00 1.00
intervention_supplement 85062 0.025899 0.158834 0.0000 0.00 0.00 0.00 1.00
total_conditions 72358 4.958954 5.106381 1.0000 2.00 4.00 6.00 248.00
total_conditions_main 72358 2.193704 1.151675 1.0000 1.00 2.00 3.00 19.00
total_conditions_top 72358 1.133904 0.426171 1.0000 1.00 1.00 1.00 4.00
location_africa 85062 0.025146 0.156570 0.0000 0.00 0.00 0.00 1.00
location_europe 85062 0.264960 0.441314 0.0000 0.00 0.00 1.00 1.00
location_asia 85062 0.128142 0.334250 0.0000 0.00 0.00 0.00 1.00
location_latinamerica 85062 0.048917 0.215696 0.0000 0.00 0.00 0.00 1.00
location_oceania 85062 0.027145 0.162507 0.0000 0.00 0.00 0.00 1.00
location_northamerica 85062 0.520914 0.499565 0.0000 0.00 1.00 1.00 1.00
clus_0 85062 0.054066 0.226150 0.0000 0.00 0.00 0.00 1.00
clus_1 85062 0.220087 0.414307 0.0000 0.00 0.00 0.00 1.00
clus_2 85062 0.231184 0.421592 0.0000 0.00 0.00 0.00 1.00
clus_3 85062 0.030977 0.173257 0.0000 0.00 0.00 0.00 1.00
clus_4 85062 0.005302 0.072622 0.0000 0.00 0.00 0.00 1.00
clus_5 85062 0.006454 0.080078 0.0000 0.00 0.00 0.00 1.00
clus_6 85062 0.001117 0.033401 0.0000 0.00 0.00 0.00 1.00
... ... ... ... ... ... ... ...

92 rows × 8 columns

Modifying data

Capping some continuous variables


In [4]:
%matplotlib inline

to_cap = [
'study_length_yrs',
'number_of_arms',
'number_of_groups',
'total_officials',
'total_sponsors',
'total_facilities',
'facilities_countries',
'facilities_states',
'total_conditions',
'total_conditions_main',
'total_conditions_top'
]

for c in to_cap:
    p = plt.figure()
    p.set_figwidth(10)
    dataset[c].hist(bins=100)
    p.suptitle(c)
    p.show()


/Users/jost/courses/clinicaltrials/env/lib/python2.7/site-packages/matplotlib/figure.py:371: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "

In [11]:
cap_length = {
'study_length_yrs': 10,
'number_of_arms': 4,
'number_of_groups': 1,
'total_officials': 2,
'total_sponsors': 3,
'total_facilities': 2,
'facilities_countries': 2,
'facilities_states': 2,
'total_conditions': 11,
'total_conditions_main': 5,
'total_conditions_top': 2
}

for c in cap_length.keys():
    maxval = cap_length[c]
    dataset[c] = d2[c].map(lambda x: maxval if x > maxval else x)

Normalizing and replacing missing with the mean (0)


In [37]:
dataset_norm = dataset.copy()
dataset_norm_withingroup = dataset.copy()

to_normalize = [
'study_length_yrs',
'number_of_arms',
'number_of_groups',
'minimum_age_days',
'maximum_age_days',
'total_officials',
'total_sponsors',
'total_facilities',
'facilities_countries',
'facilities_states',
'total_conditions',
'total_conditions_main',
'total_conditions_top'
]

zscore = lambda x: (x - x.mean()) / x.std()

def normalize(df,colname,grouped=False):
    if grouped:
        df[c] = df[c].groupby(df['target']).transform(zscore)
    else:
        df[c] = df[c].groupby(df['target'].isnull()).transform(zscore)
    df[c] = df[c].map(lambda x: 0 if np.isnan(x) else x)

for c in to_normalize:
    normalize(dataset_norm, c, False)
    normalize(dataset_norm_withingroup, c, True)

Add random identifiers


In [40]:
# initialize variables
def init_vars(df):
    df['training'] = 1.0
    df['sample'] = 1.0

init_vars(dataset_norm)
init_vars(dataset_norm_withingroup)

In [48]:
# procedures to create test group and sample of target for balanced training set
def update_training(df):
    df['training'] = df['training'].map(lambda x: 0.0 if np.random.rand() > 0.8 else 1.0)

def update_sample(df):
    def transformvar(df):
        if df['target'] == 1 and np.random.rand() > 0.15293:
            return 0.0
        else:
            return 1.0
    df['sample'] = df.apply(transformvar, axis=1)

In [58]:
update_training(dataset_norm)
update_sample(dataset_norm)
update_training(dataset_norm_withingroup)
update_sample(dataset_norm_withingroup)

Write datasets for analysis


In [61]:
dataset_norm.to_pickle('dataset_norm.pickle')
dataset_norm_withingroup.to_pickle('dataset_norm_withingroup.pickle')

In [ ]: