In [1]:
from __future__ import print_function
# Import libraries
import numpy as np
import pandas as pd
import sklearn
import psycopg2
import sys
import datetime as dt
import mp_utils as mp
# used to display pandas dataframes in the notebook
from IPython.display import display, HTML
from collections import OrderedDict
from sklearn.pipeline import Pipeline
# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
# logistic regression is our favourite model ever
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # l2 regularized regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
# used to calculate AUROC/accuracy
from sklearn import metrics
# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb
#import matplotlib
#import matplotlib.pyplot as plt
#from matplotlib.font_manager import FontProperties # for unicode fonts
#%matplotlib inline
# below config used on pc70
sqluser = 'alistairewj'
dbname = 'mimic'
schema_name = 'mimiciii'
query_schema = 'SET search_path to public,' + schema_name + ';'
# two options for loading data
# option 1) use SQL - requires database and to have run queries/make_all.sql
# option 2) use the downloaded CSV files
USE_SQL=1
USE_CSV=0
In [2]:
if USE_SQL:
    # Connect to a local postgres version of MIMIC
    con = psycopg2.connect(dbname=dbname, user=sqluser)
    # exclusion criteria:
    #  - less than 15 years old
    #  - stayed in the ICU less than 4 hours
    #  - never have any chartevents data (i.e. likely administrative error)
    #  - organ donor accounts (administrative "readmissions" for patients who died in hospital)
    query = query_schema + \
    """
    select
        *
    from dm_cohort
    """
    co = pd.read_sql_query(query, con)
    # convert the inclusion flags to boolean
    for c in co.columns:
        if c[0:10] == 'inclusion_':
            co[c] = co[c].astype(bool)
    # extract static vars into a separate dataframe
    df_static = pd.read_sql_query(query_schema + 'select * from mp_static_data', con)
    #for dtvar in ['intime','outtime','deathtime']:
    #    df_static[dtvar] = pd.to_datetime(df_static[dtvar])
    vars_static = [u'is_male', u'emergency_admission', u'age',
                   # services
                   u'service_any_noncard_surg',
                   u'service_any_card_surg',
                   u'service_cmed',
                   u'service_traum',
                   u'service_nmed',
                   # ethnicities
                   u'race_black', u'race_hispanic', u'race_asian', u'race_other',
                   # body habitus
                   u'height', u'weight', u'bmi']
    # get ~5 million rows containing data for every patient
    # this takes a little bit of time to load into memory (~2 minutes)
    # %%time results:
    #   CPU times: user 42.8 s, sys: 1min 3s, total: 1min 46s
    #   Wall time: 2min 7s
    df = pd.read_sql_query(query_schema + 'select * from mp_data', con)
    df.drop('subject_id', axis=1, inplace=True)
    df.drop('hadm_id', axis=1, inplace=True)
    df.sort_values(['icustay_id', 'hr'], axis=0, ascending=True, inplace=True)
    # get death information
    df_death = pd.read_sql_query(query_schema + """
    select
      co.subject_id, co.hadm_id, co.icustay_id
      , ceil(extract(epoch from (co.outtime - co.intime))/60.0/60.0) as dischtime_hours
      , ceil(extract(epoch from (adm.deathtime - co.intime))/60.0/60.0) as deathtime_hours
      , case when adm.deathtime is null then 0 else 1 end as death
    from dm_cohort co
    inner join admissions adm
      on co.hadm_id = adm.hadm_id
    where co.excluded = 0
    """, con)
    # get censoring information
    df_censor = pd.read_sql_query(query_schema + """
    select co.icustay_id, min(cs.charttime) as censortime
      , ceil(extract(epoch from min(cs.charttime - co.intime))/60.0/60.0) as censortime_hours
    from dm_cohort co
    inner join mp_code_status cs
      on co.icustay_id = cs.icustay_id
    where cmo+dnr+dni+dncpr+cmo_notes > 0
    and co.excluded = 0
    group by co.icustay_id
    """, con)
elif USE_CSV:
    co = pd.read_csv('df_cohort.csv.gz')
    # convert the inclusion flags to boolean
    for c in co.columns:
        if c[0:10] == 'inclusion_':
            co[c] = co[c].astype(bool)
    df = pd.read_csv('df_data.csv.gz')
    df_static = pd.read_csv('df_static_data.csv.gz')
    df_censor = pd.read_csv('df_censor.csv.gz')
    df_death = pd.read_csv('df_death.csv.gz')
else:
    print('Must use SQL or CSV to load data!')
print(df.shape)
In [3]:
# print out the exclusions *SEQUENTIALLY* - i.e. if already excluded, don't re-print
print('Cohort - initial size: {} ICU stays'.format(co.shape[0]))
idxRem = np.zeros(co.shape[0],dtype=bool)
for c in co.columns:
    if c[0:len('exclusion_')] == 'exclusion_':
        # only count stays not already excluded by an earlier criterion
        N_REM = np.sum((co[c].values == 1) & ~idxRem)
        print('  {:5g} ({:2.2f}%) - {}'.format(N_REM, N_REM*100.0/co.shape[0], c))
        idxRem[co[c].values == 1] = True
# summarize all exclusions
N_REM = np.sum( idxRem )
print(' {:5g} ({:2.2f}%) - {}'.format(N_REM,N_REM*100.0/co.shape[0], 'all exclusions'))
print('')
print('Final cohort size: {} ICU stays ({:2.2f}%).'.format(co.shape[0] - np.sum(idxRem), (1-np.mean(idxRem))*100.0))
co = co.loc[~idxRem,:]
In [4]:
# mortality stats for base cohort
for c in co.columns:
    if c[0:len('death_')] == 'death_':
        N_ALL = co.shape[0]
        N = co.set_index('icustay_id').loc[:, c].sum()
        print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))
In [5]:
inclFcn = lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_stay_ge_24hr'],'icustay_id']
# mortality stats for base cohort
for c in co.columns:
if c[0:len('death_')]=='death_':
N_ALL = inclFcn(co).shape[0]
N = co.set_index('icustay_id').loc[inclFcn(co),c].sum()
print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))
Here is the same function written in a slightly more abstract way, with the benefit that all inclusion criteria can be specified as a list. This helps the readability of the code below.
In [6]:
inclusions = ['inclusion_only_mimicii', 'inclusion_stay_ge_24hr']
inclFcn = lambda x: x.loc[x[inclusions].all(axis=1),'icustay_id']
# mortality stats for base cohort
for c in co.columns:
    if c[0:len('death_')] == 'death_':
        N_ALL = inclFcn(co).shape[0]
        N = co.set_index('icustay_id').loc[inclFcn(co), c].sum()
        print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))
Each study has its own exclusion criteria. We define a dictionary of all exclusions, keyed by study name; where a study has multiple experiments, we append a, b, or c to the name.
Each dictionary value is a length-3 list. The first element defines the window for data extraction: a list containing a dictionary mapping each icustay_id to its window end time, the window size in hours, and the extra hours allowed for lab data. The second element is the exclusion criteria: a function which uses co or df as its input and returns the icustay_id to keep. The third element is the name of the outcome column.
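As a sketch of how one entry is structured and unpacked (using a hypothetical study name; the real entries and the window dictionaries such as time_24hr are defined in the cells below):

# hypothetical entry: [window parameters, exclusion function, outcome column]
exclusions_sketch = OrderedDict([
    ['example2017study', [[time_24hr, 24, 24],
                          lambda x: x.loc[x['inclusion_over_18'], 'icustay_id'].values,
                          'hospital_expire_flag']]
])
# unpacking an entry, as done in the evaluation loop later on
params, exclFcn, y_outcome_label = exclusions_sketch['example2017study']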
In [7]:
# first we can define the different windows: there aren't that many!
df_tmp=co.copy().set_index('icustay_id')
# admission+12 hours
time_12hr = df_tmp.copy()
time_12hr['windowtime'] = 12
time_12hr = time_12hr['windowtime'].to_dict()
# admission+24 hours
time_24hr = df_tmp.copy()
time_24hr['windowtime'] = 24
time_24hr = time_24hr['windowtime'].to_dict()
# admission+48 hours
time_48hr = df_tmp.copy()
time_48hr['windowtime'] = 48
time_48hr = time_48hr['windowtime'].to_dict()
# admission+72 hours
time_72hr = df_tmp.copy()
time_72hr['windowtime'] = 72
time_72hr = time_72hr['windowtime'].to_dict()
# admission+96 hours
time_96hr = df_tmp.copy()
time_96hr['windowtime'] = 96
time_96hr = time_96hr['windowtime'].to_dict()
# entire stay
time_all = df_tmp.copy()
time_all = time_all['dischtime_hours'].apply(np.ceil).astype(int).to_dict()
# 12 hours before the patient died/discharged
time_predeath = df_tmp.copy()
time_predeath['windowtime'] = time_predeath['dischtime_hours']
idx = time_predeath['deathtime_hours']<time_predeath['dischtime_hours']
time_predeath.loc[idx,'windowtime'] = time_predeath.loc[idx,'deathtime_hours']
# move from discharge/death time to 12 hours beforehand
time_predeath['windowtime'] = time_predeath['windowtime']-12
time_predeath = time_predeath['windowtime'].apply(np.ceil).astype(int).to_dict()
In [8]:
# example params used to extract patient data
# element 1: dictionary specifying end time of window for each patient
# element 2: size of window
# element 3: extra hours added to make it easier to get data on labs (and allows us to get labs pre-ICU)
# e.g. [time_24hr, 8, 24] is
# (1) window ends at admission+24hr
# (2) window is 8 hours long
# (3) lab window is 8+24=32 hours long
def inclFcn(x, inclusions):
    return x.loc[x[inclusions].all(axis=1), 'icustay_id']
# this one is used more than once, so we define it here
hugExclFcnMIMIC3 = lambda x: x.loc[x['inclusion_over_18']
                                   & x['inclusion_hug2009_obs']
                                   & x['inclusion_hug2009_not_nsicu_csicu']
                                   & x['inclusion_first_admission']
                                   & x['inclusion_full_code']
                                   & x['inclusion_not_brain_death']
                                   & x['inclusion_not_crf'], 'icustay_id'].values
hugExclFcn = lambda x: np.intersect1d(hugExclFcnMIMIC3(x),x.loc[x['inclusion_only_mimicii'],'icustay_id'].values)
# physionet2012 subset - not exact but close
def physChallExclFcn(x):
    out = x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_stay_ge_48hr']&x['inclusion_has_saps'],'icustay_id'].values
    out = np.sort(out)
    out = out[0:4000]
    return out
# caballero2015 used a random subsample of adult stays, resulting in 11648;
# we approximate this by taking the first 11648 icustay_id after sorting
def caballeroExclFcn(x):
    out = x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18'],'icustay_id'].values
    out = np.sort(out)
    out = out[0:11648]
    return out
np.random.seed(546345)
W_extra = 24
exclusions = OrderedDict([
['caballero2015dynamically_a', [[time_24hr, 24, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['caballero2015dynamically_b', [[time_48hr, 48, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['caballero2015dynamically_c', [[time_72hr, 72, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['calvert2016computational', [[time_predeath, 5, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_only_micu']&x['inclusion_calvert2016_obs']&x['inclusion_stay_ge_17hr']&x['inclusion_stay_le_500hr']&x['inclusion_non_alc_icd9'],'icustay_id'].values, 'hospital_expire_flag']],
['calvert2016using', [[time_predeath, 5, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_only_micu']&x['inclusion_calvert2016_obs']&x['inclusion_stay_ge_17hr']&x['inclusion_stay_le_500hr'],'icustay_id'].values, 'hospital_expire_flag']],
['celi2012database_a', [[time_72hr, 72, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_aki_icd9'],'icustay_id'].values , 'hospital_expire_flag']],
['celi2012database_b', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_sah_icd9'],'icustay_id'].values , 'hospital_expire_flag']],
['che2016recurrent_a', [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18'],'icustay_id'].values , 'death_48hr_post_icu_admit']],
['che2016recurrent_b', [[time_48hr, 48, W_extra], physChallExclFcn , 'hospital_expire_flag']],
['ding2016mortality', [[time_48hr, 48, W_extra], physChallExclFcn , 'hospital_expire_flag']],
['ghassemi2014unfolding_a', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2014unfolding_b', [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2014unfolding_c', [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['ghassemi2014unfolding_d', [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['ghassemi2015multivariate_a', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_gt_6_notes']&x['inclusion_stay_ge_24hr']&x['inclusion_has_saps'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2015multivariate_b', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_gt_6_notes']&x['inclusion_stay_ge_24hr']&x['inclusion_has_saps'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['grnarova2016neural_a', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'hospital_expire_flag']],
['grnarova2016neural_b', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['grnarova2016neural_c', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['harutyunyan2017multitask', [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_icustay'],'icustay_id'].values, 'hospital_expire_flag']],
['hoogendoorn2016prediction', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_hug2009_obs']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['hug2009icu', [[time_24hr, 24, W_extra], hugExclFcn, 'death_30dy_post_icu_disch']],
['johnson2012patient', [[time_48hr, 48, W_extra], physChallExclFcn, 'hospital_expire_flag']],
['johnson2014data', [[time_48hr, 48, W_extra], physChallExclFcn, 'hospital_expire_flag']],
['joshi2012prognostic', [[time_24hr, 24, W_extra], hugExclFcn, 'hospital_expire_flag']],
['joshi2016identifiable', [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_stay_ge_48hr'],'icustay_id'].values, 'hospital_expire_flag']],
['lee2015customization_a', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['lee2015customization_b', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lee2015customization_c', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_2yr_post_hos_disch']],
['lee2015personalized', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lee2017patient', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lehman2012risk', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr']&x['inclusion_first_admission'],'icustay_id'].values, 'hospital_expire_flag']],
['luo2016interpretable_a', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_sapsii']&x['inclusion_no_disch_summary'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['luo2016interpretable_b', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_sapsii']&x['inclusion_no_disch_summary'],'icustay_id'].values, 'death_6mo_post_hos_disch']],
['luo2016predicting', [[time_24hr, 12, W_extra], lambda x: np.intersect1d(hugExclFcn(x),x.loc[x['inclusion_stay_ge_24hr'],'icustay_id'].values) , 'death_30dy_post_icu_disch']],
['pirracchio2015mortality', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii'],'icustay_id'].values , 'hospital_expire_flag']],
['ripoll2014sepsis', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_not_explicit_sepsis'],'icustay_id'].values, 'hospital_expire_flag']],
['wojtusiak2017c', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_over_65']&x['inclusion_alive_hos_disch'],'icustay_id'].values, 'death_30dy_post_hos_disch']]
])
In [11]:
repro_stats = pd.DataFrame(None, columns=['N_Repro','Y_Repro'])
N = co.shape[0]
for current_study in exclusions:
    params, iid_keep, y_outcome_label = exclusions[current_study]
    # iid_keep is currently a function - apply it to co to get the ICUSTAY_IDs to keep for this study
    iid_keep = iid_keep(co)
    N_STUDY = iid_keep.shape[0]
    Y_STUDY = co.set_index('icustay_id').loc[iid_keep, y_outcome_label].mean()*100.0
    # print size of cohort in study
    print('{:5g} ({:5.2f}%) - Mortality = {:5.2f}% - {}'.format(
        N_STUDY, N_STUDY*100.0/N, Y_STUDY,
        current_study)
    )
    repro_stats.loc[current_study] = [N_STUDY, Y_STUDY]
With the above dataframe, repro_stats, we can compare our results to those extracted manually from the studies. We load in the manual extraction from the data subfolder, merge it with this dataframe, and output to CSV.
In [12]:
study_data = pd.read_csv('../data/study_data.csv')
study_data.set_index('Cohort',inplace=True)
# add in reproduction sample size // outcome
study_data_merged = study_data.merge(repro_stats, how='left',
left_index=True, right_index=True)
# print out the table as it was in the paper (maybe a bit more precision)
study_data_merged[ ['N_Study','N_Repro','Y_Study','Y_Repro'] ]
In [13]:
# define var_static which is used later
#TODO: should refactor so this isn't needed
var_min, var_max, var_first, var_last, var_sum, var_first_early, var_last_early, var_static = mp.vars_of_interest()
K=5
np.random.seed(871)
# get unique subject_id (this is needed later)
sid = np.sort(np.unique(df_death['subject_id'].values))
# assign k-fold
idxK_sid = np.random.permutation(sid.shape[0])
idxK_sid = np.mod(idxK_sid,K)
# get indices which map each subject_id in df_death to its position in sid
idxMap = np.searchsorted(sid, df_death['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
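Since folds are assigned per subject_id, all ICU stays from the same patient land in the same fold, which prevents information leakage between training and validation sets. A quick sanity check (a sketch using the variables defined above):

# sanity check: every subject_id maps to exactly one fold, and folds are roughly balanced
folds_per_subject = pd.DataFrame({'subject_id': df_death['subject_id'].values, 'fold': idxK})
assert (folds_per_subject.groupby('subject_id')['fold'].nunique() == 1).all()
print('Subjects per fold:', np.bincount(idxK_sid, minlength=K))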
The below code cell trains and evaluates models for a single study. The two models at the moment are gradient boosting (XGBoost) and logistic regression (scikit-learn).
In [14]:
# pick the study to run the example on
current_study = 'celi2012database_b'
# Rough timing info:
# rf - 3 seconds per fold
# xgb - 30 seconds per fold
# logreg - 4 seconds per fold
# lasso - 8 seconds per fold
models = OrderedDict([
['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
#['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
#['rf', RandomForestClassifier()],
['logreg', LogisticRegression(fit_intercept=True)]
])
print('')
print('==================== {} =========='.format('='*len(current_study)))
print('========== BEGINNING {} =========='.format(current_study))
print('==================== {} =========='.format('='*len(current_study)))
params = exclusions[current_study][0]
df_data = mp.get_design_matrix(df, params[0], W=params[1], W_extra=params[2])
# apply the study's inclusion criteria to get the icustay_id to keep
iid_keep = exclusions[current_study][1](co)
print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
df_data.shape[0], iid_keep.shape[0], iid_keep.shape[0]*100.0 / df_data.shape[0]))
df_data = df_data.loc[iid_keep,:]
print('')
y_outcome_label = exclusions[current_study][2]
# load the data into a numpy array
# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)
# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static
mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()
for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list()  # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()
    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])
    else:
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                                  strategy="mean",
                                                  axis=0)),
                              ("scaler", StandardScaler()),
                              (mdl, models[mdl])])
    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :], y[idxK != k])
        # get predictions on the held-out kth fold
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:, 1]
        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])
        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
        # add score to list of scores
        results_val[mdl].append(curr_score)
        # save the current model
        mdl_val[mdl].append(curr_mdl)
        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
The below code block is identical to the above, except that it loops over all of the studies evaluated. It takes a while: with 5-fold validation across all study configurations, it trains ~150 models of each type. The final AUROCs are written to the results.txt file. The final models/results/predictions/targets are saved in dictionaries with the suffix _all.
In [15]:
mdl_val_all = dict()
results_val_all = dict()
pred_val_all = dict()
tar_val_all = dict()
# Rough timing info:
# rf - 3 seconds per fold
# xgb - 30 seconds per fold
# logreg - 4 seconds per fold
# lasso - 8 seconds per fold
models = OrderedDict([
['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
#['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
#['rf', RandomForestClassifier()],
['logreg', LogisticRegression(fit_intercept=True)]
])
with open('results.txt', 'w') as fp:
    fp.write('StudyName,SampleSize,Outcome')
    for mdl in models:
        fp.write(',{}'.format(mdl))
    fp.write('\n')
for current_study in exclusions:
    print('\n==================== {} =========='.format('='*len(current_study)))
    print('========== BEGINNING {} =========='.format(current_study))
    print('==================== {} =========='.format('='*len(current_study)))
    params = exclusions[current_study][0]
    df_data = mp.get_design_matrix(df, params[0], W=params[1], W_extra=params[2])
    # apply the study's inclusion criteria to get the icustay_id to keep
    iid_keep = exclusions[current_study][1](co)
    print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
        df_data.shape[0], iid_keep.shape[0], iid_keep.shape[0]*100.0 / df_data.shape[0]))
    df_data = df_data.loc[iid_keep, :]
    print('')
    y_outcome_label = exclusions[current_study][2]
    # load the data into a numpy array
    # first, the static vars from df_static
    X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
    # next, add in the outcome
    X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
    # map the above K-fold indices to this dataset
    X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
    # get indices which map each subject_id in X to its position in sid
    idxMap = np.searchsorted(sid, X['subject_id'].values)
    # use these indices to map the k-fold integers
    idxK = idxK_sid[idxMap]
    # drop the subject_id column
    X.drop('subject_id', axis=1, inplace=True)
    # convert to numpy data (the target is the last column)
    X = X.values
    y = X[:, -1]
    X = X[:, 0:-1]
    X_header = [x for x in df_data.columns.values] + var_static
    mdl_val = dict()
    results_val = dict()
    pred_val = dict()
    tar_val = dict()
    for mdl in models:
        print('=============== {} ==============='.format(mdl))
        mdl_val[mdl] = list()
        results_val[mdl] = list()  # initialize list for scores
        pred_val[mdl] = list()
        tar_val[mdl] = list()
        if mdl == 'xgb':
            # no pre-processing of data necessary for xgb
            estimator = Pipeline([(mdl, models[mdl])])
        else:
            estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                                      strategy="mean",
                                                      axis=0)),
                                  ("scaler", StandardScaler()),
                                  (mdl, models[mdl])])
        for k in range(K):
            # train the model using all but the kth fold
            curr_mdl = estimator.fit(X[idxK != k, :], y[idxK != k])
            # get predictions on the held-out kth fold
            if mdl == 'lasso':
                curr_prob = curr_mdl.predict(X[idxK == k, :])
            else:
                curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
                curr_prob = curr_prob[:, 1]
            pred_val[mdl].append(curr_prob)
            tar_val[mdl].append(y[idxK == k])
            # calculate score (AUROC)
            curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
            # add score to list of scores
            results_val[mdl].append(curr_score)
            # save the current model
            mdl_val[mdl].append(curr_mdl)
            print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
    # store the above dicts under the study name
    # we will re-use them later when summarizing model performance
    mdl_val_all[current_study] = mdl_val
    results_val_all[current_study] = results_val
    pred_val_all[current_study] = pred_val
    tar_val_all[current_study] = tar_val
    # append results to file
    with open('results.txt', 'a') as fp:
        # write study name, sample size and frequency of outcome
        fp.write('{},{},{:2.2f}'.format(current_study, X.shape[0], np.mean(y)*100.0))
        for i, mdl in enumerate(models):
            fp.write(',{:0.6f}'.format(np.mean(results_val[mdl])))
        fp.write('\n')
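Since results.txt is written as comma-separated text, it can be read back with pandas for a quick look (a sketch):

# read the per-study AUROC summary back into a dataframe
results_summary = pd.read_csv('results.txt')
display(results_summary)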
Move the results from the above dictionaries into a single dataframe.
In [20]:
mdl_list = list(models.keys())
study_data = pd.read_csv('../data/study_data.csv')
study_data.set_index('Cohort',inplace=True)
# add in reproduction stats from earlier
study_data_merged = study_data.merge(repro_stats, how='left',
left_index=True, right_index=True)
# add in AUROCs
for current_study in results_val_all:
    results_val = results_val_all[current_study]
    for mdl in results_val:
        study_data_merged.loc[current_study, mdl] = np.mean(results_val[mdl])
columns = ['Outcome','N_Study','N_Repro','Y_Study','Y_Repro','AUROC_Study'] + mdl_list
display(HTML(study_data_merged[columns].to_html()))
study_data_merged.to_csv('results_final.csv')
The below code block builds a "baseline" model. This model has no exclusions past the base cohort exclusions. K-fold validation is done on the patient level to ensure no information leakage between training/validation sets. The outcome is in-hospital mortality, and the data window used is the first 24 hours. Labs are extracted from up to 24 hours before the ICU admission (this is defined by W_extra=24).
In [26]:
W = 24 # window size
W_extra = 24 # extra time backward for labs
y_outcome_label = 'death_in_hospital'
# admission+W hours
df_tmp=co.copy().set_index('icustay_id')
time_dict = df_tmp.copy()
time_dict['windowtime'] = W
time_dict = time_dict['windowtime'].to_dict()
# Rough timing info:
# rf - 3 seconds per fold
# xgb - 30 seconds per fold
# logreg - 4 seconds per fold
# lasso - 8 seconds per fold
models = OrderedDict([
['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
#['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
#['rf', RandomForestClassifier()],
['logreg', LogisticRegression(fit_intercept=True)]
])
In [27]:
CENSOR_FLAG=False
current_study = 'baseline'
print('')
print('==================== {} =========='.format('='*len(current_study)))
print('========== BEGINNING {} =========='.format(current_study))
print('==================== {} =========='.format('='*len(current_study)))
# optionally remove patients who were made DNR/CMO within the first 24 hours
if CENSOR_FLAG:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']
                              & ((x['censortime_hours'].isnull()) | (x['censortime_hours'] >= 24)),
                              'icustay_id'].values
else:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr'], 'icustay_id'].values
df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)
iid_keep = exclFcn(co)
N_NEW=df_data.loc[iid_keep,:].shape[0]
print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
    co.shape[0], N_NEW, N_NEW*100.0 / co.shape[0]))
df_data = df_data.loc[iid_keep,:]
print('')
# load the data into a numpy array
# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)
# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static
mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()
for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list()  # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()
    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])
    else:
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                                  strategy="mean",
                                                  axis=0)),
                              ("scaler", StandardScaler()),
                              (mdl, models[mdl])])
    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :], y[idxK != k])
        # get predictions on the held-out kth fold
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:, 1]
        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])
        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
        # add score to list of scores
        results_val[mdl].append(curr_score)
        # save the current model
        mdl_val[mdl].append(curr_mdl)
        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
# print final results
print('')
print('StudyName,SampleSize', end='')
for mdl in models:
    print(',{}'.format(mdl), end='')
print('')
print('{},{}'.format(current_study, X.shape[0]), end='')
for i, mdl in enumerate(models):
    print(',{:0.6f}'.format(np.mean(results_val[mdl])), end='')
print('\n')
Patients who choose to have their care withdrawn receive palliative measures in the ICU. These patients show markedly different physiology from those undergoing full interventions, and a model which synthesizes severity of illness should not incorporate their data. Here we remove data for patients from the time of their withdrawal of care. If this occurs before the end of the first 24 hours of the ICU admission, we remove the patient entirely.
In [28]:
CENSOR_FLAG = True
current_study = 'baseline_withdrawal'
print('==================== {} =========='.format('='*len(current_study)))
print('========== BEGINNING {} =========='.format(current_study))
print('==================== {} =========='.format('='*len(current_study)))
# optionally remove patients who were DNR in first 24hrs
if CENSOR_FLAG:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']
                              & ((x['censortime_hours'].isnull()) | (x['censortime_hours'] >= 24)),
                              'icustay_id'].values
else:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr'], 'icustay_id'].values
df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)
iid_keep = exclFcn(co)
N_NEW=df_data.loc[iid_keep,:].shape[0]
print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
    co.shape[0], N_NEW, N_NEW*100.0 / co.shape[0]))
df_data = df_data.loc[iid_keep,:]
print('')
# load the data into a numpy array
# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)
# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static
mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()
for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list()  # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()
    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])
    else:
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                                  strategy="mean",
                                                  axis=0)),
                              ("scaler", StandardScaler()),
                              (mdl, models[mdl])])
    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :], y[idxK != k])
        # get predictions on the held-out kth fold
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:, 1]
        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])
        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
        # add score to list of scores
        results_val[mdl].append(curr_score)
        # save the current model
        mdl_val[mdl].append(curr_mdl)
        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
# print final results
print('')
print('StudyName,SampleSize', end='')
for mdl in models:
    print(',{}'.format(mdl), end='')
print('')
print('{},{}'.format(current_study, X.shape[0]), end='')
for i, mdl in enumerate(models):
    print(',{:0.6f}'.format(np.mean(results_val[mdl])), end='')
print('\n')