In [1]:
    
from pandas.io.sql import read_frame
import pandas as pd
import numpy as np
import MySQLdb
import matplotlib.pyplot as plt
    
In [2]:
    
# Load the dataset, preferring a local pickle cache over a fresh SQL pull.
import os.path
import shutil

if os.path.exists('dataset.pickle'):
    dataset = pd.read_pickle('dataset.pickle')
else:
    # SQL connection parameters come from connect.py (kept out of the notebook
    # so credentials are not committed with the analysis).
    if not os.path.exists('connect.py'):
        # copy (not move) so the shared original stays in place
        shutil.copy('../../connect.py', 'connect.py')
    # explicit names instead of `from connect import *` so readers can trace them
    from connect import mysqlserver, mysqlusername, mysqlpassword, mysqldbname
    mysql_cn = MySQLdb.connect(host=mysqlserver,
                               port=3306, user=mysqlusername, passwd=mysqlpassword,
                               db=mysqldbname)
    try:
        # pandas.io.sql.read_frame was removed in pandas 0.14; pd.read_sql is the
        # supported replacement (same query/connection/index-column semantics).
        dataset = pd.read_sql("select * from datamining_table", mysql_cn,
                              index_col='nct_id')
        dataset.to_pickle('dataset.pickle')
    finally:
        mysql_cn.close()  # always release the DB connection
    
Getting summary statistics for all numeric variables
In [3]:
    
dataset.describe().transpose()
    
    Out[3]:
Capping some continuous variables
In [4]:
    
%matplotlib inline
to_cap = [
'study_length_yrs',
'number_of_arms',
'number_of_groups',
'total_officials',
'total_sponsors',
'total_facilities',
'facilities_countries',
'facilities_states',
'total_conditions',
'total_conditions_main',
'total_conditions_top'
]
for c in to_cap:
    p = plt.figure()
    p.set_figwidth(10)
    dataset[c].hist(bins=100)
    p.suptitle(c)
    p.show()
    
    
    
    
    
    
    
    
    
    
    
    
    
In [11]:
    
# Cap (winsorize) each heavy-tailed variable at a hand-picked maximum,
# chosen from the histograms above.
cap_length = {
    'study_length_yrs': 10,
    'number_of_arms': 4,
    'number_of_groups': 1,
    'total_officials': 2,
    'total_sponsors': 3,
    'total_facilities': 2,
    'facilities_countries': 2,
    'facilities_states': 2,
    'total_conditions': 11,
    'total_conditions_main': 5,
    'total_conditions_top': 2,
}
for col, maxval in cap_length.items():
    # Fix: the original read from an undefined `d2`; the capped frame is `dataset`.
    # clip(upper=maxval) == "maxval if x > maxval else x"; NaNs pass through unchanged.
    dataset[col] = dataset[col].clip(upper=maxval)
    
Normalizing each variable to z-scores and replacing missing values with the mean (which is 0 after normalization)
In [37]:
    
# Work on copies so the capped `dataset` itself stays untouched.
dataset_norm = dataset.copy()
dataset_norm_withingroup = dataset.copy()

# Continuous variables to standardize.
to_normalize = [
    'study_length_yrs',
    'number_of_arms',
    'number_of_groups',
    'minimum_age_days',
    'maximum_age_days',
    'total_officials',
    'total_sponsors',
    'total_facilities',
    'facilities_countries',
    'facilities_states',
    'total_conditions',
    'total_conditions_main',
    'total_conditions_top',
]


def zscore(x):
    """Standardize a series: subtract the mean, divide by the (sample) std."""
    return (x - x.mean()) / x.std()
def normalize(df, colname, grouped=False):
    """Z-score-normalize df[colname] in place, then replace NaNs with 0.

    grouped=True  -> standardize within each value of df['target'];
    grouped=False -> standardize within the target-present / target-missing split.

    After normalization the mean is 0, so filling NaNs with 0 imputes the mean.
    (Fixes the original, which read the global loop variable `c` instead of the
    `colname` parameter and only worked by closure accident.)
    """
    keys = df['target'] if grouped else df['target'].isnull()
    standardized = df[colname].groupby(keys).transform(
        lambda x: (x - x.mean()) / x.std())  # sample std (ddof=1), as before
    # NaN -> 0 covers both missing inputs and zero-variance groups
    df[colname] = standardized.fillna(0.0)
# Apply both normalization variants column by column.
# NOTE(review): as written, normalize()'s body references the loop variable `c`
# (a global) rather than its own `colname` parameter, so this loop variable must
# stay named `c` for the calls to work — confirm/fix in normalize() itself.
for c in to_normalize:
    normalize(dataset_norm, c, False)
    normalize(dataset_norm_withingroup, c, True)
    
In [40]:
    
# initialize bookkeeping flags
def init_vars(df):
    """Mark every row as in the training set and in the sample (flags = 1.0).

    Later cells randomly zero these flags out to build the test split and the
    balanced sample. Mutates df in place.
    """
    for flag in ('training', 'sample'):
        df[flag] = 1.0
# Flag every row as in-training and in-sample; later cells randomly zero these out.
init_vars(dataset_norm)
init_vars(dataset_norm_withingroup)
    
In [48]:
    
# procedures to create test group and sample of target for balanced training set
def update_training(df):
    """Randomly hold out ~20% of rows as the test set (training flag -> 0.0).

    Draws one uniform random number per row; rows with a draw <= 0.8 keep
    training == 1.0. Mutates df in place.
    """
    df['training'] = [1.0 if np.random.rand() <= 0.8 else 0.0
                      for _ in df['training']]
def update_sample(df):
    """Down-sample the positive class for a balanced training set.

    Rows with target != 1 always keep sample == 1.0; rows with target == 1
    are kept with probability ~0.15293, otherwise sample -> 0.0.
    Mutates df in place.
    """
    def keep_row(row):
        # Non-positives are always kept; no random draw happens for them,
        # matching the original short-circuit `and`.
        if row['target'] != 1:
            return 1.0
        return 1.0 if np.random.rand() <= 0.15293 else 0.0
    df['sample'] = df.apply(keep_row, axis=1)
    
In [58]:
    
# Assign the random train/test split and the balanced positive-class sample
# to both normalized datasets. NOTE(review): no RNG seed is set anywhere above,
# so the split differs on every run — seed np.random for reproducibility.
update_training(dataset_norm)
update_sample(dataset_norm)
update_training(dataset_norm_withingroup)
update_sample(dataset_norm_withingroup)
    
In [61]:
    
# Persist both prepared datasets for downstream modeling notebooks.
dataset_norm.to_pickle('dataset_norm.pickle')
dataset_norm_withingroup.to_pickle('dataset_norm_withingroup.pickle')
    
In [ ]: