In [1]:
from __future__ import print_function
# Import libraries
import numpy as np
import pandas as pd
import sklearn
import psycopg2
import sys
import datetime as dt
import mp_utils as mp
# used to display pandas dataframes in the notebook
from IPython.display import display, HTML
from collections import OrderedDict
from sklearn.pipeline import Pipeline
# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
# logistic regression is our favourite model ever
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # l2 regularized regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
# used to calculate AUROC/accuracy
from sklearn import metrics
# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb
#import matplotlib
#import matplotlib.pyplot as plt
#from matplotlib.font_manager import FontProperties # for unicode fonts
#%matplotlib inline
# below config used on pc70
sqluser = 'alistairewj'
dbname = 'mimic'
schema_name = 'mimiciii'
query_schema = 'SET search_path to public,' + schema_name + ';'
# two options for loading data
# option 1) use SQL - requires database and to have run queries/make_all.sql
# option 2) use the downloaded CSV files
USE_SQL=1
USE_CSV=0
In [2]:
if USE_SQL:
    # Connect to a local postgres version of MIMIC
    con = psycopg2.connect(dbname=dbname, user=sqluser)
    # exclusion criteria:
    #  - less than 15 years old
    #  - stayed in the ICU less than 4 hours
    #  - never have any chartevents data (i.e. likely administrative error)
    #  - organ donor accounts (administrative "readmissions" for patients who died in hospital)
    query = query_schema + \
    """
    select
        *
    from dm_cohort
    """
    co = pd.read_sql_query(query, con)
    # convert the inclusion flags to boolean
    for c in co.columns:
        if c[0:10] == 'inclusion_':
            co[c] = co[c].astype(bool)
    # extract static vars into a separate dataframe
    df_static = pd.read_sql_query(query_schema + 'select * from mp_static_data', con)
    #for dtvar in ['intime','outtime','deathtime']:
    #    df_static[dtvar] = pd.to_datetime(df_static[dtvar])
    vars_static = [u'is_male', u'emergency_admission', u'age',
                   # services
                   u'service_any_noncard_surg',
                   u'service_any_card_surg',
                   u'service_cmed',
                   u'service_traum',
                   u'service_nmed',
                   # ethnicities
                   u'race_black', u'race_hispanic', u'race_asian', u'race_other',
                   # body habitus
                   u'height', u'weight', u'bmi']
    # get ~5 million rows containing data for every patient
    # this takes a little bit of time to load into memory (~2 minutes)
    # %%time results:
    #   CPU times: user 42.8 s, sys: 1min 3s, total: 1min 46s
    #   Wall time: 2min 7s
    df = pd.read_sql_query(query_schema + 'select * from mp_data', con)
    df.drop('subject_id', axis=1, inplace=True)
    df.drop('hadm_id', axis=1, inplace=True)
    df.sort_values(['icustay_id', 'hr'], axis=0, ascending=True, inplace=True)
    # get death information
    df_death = pd.read_sql_query(query_schema + """
    select
      co.subject_id, co.hadm_id, co.icustay_id
      , ceil(extract(epoch from (co.outtime - co.intime))/60.0/60.0) as dischtime_hours
      , ceil(extract(epoch from (adm.deathtime - co.intime))/60.0/60.0) as deathtime_hours
      , case when adm.deathtime is null then 0 else 1 end as death
    from dm_cohort co
    inner join admissions adm
      on co.hadm_id = adm.hadm_id
    where co.excluded = 0
    """, con)
    # get censoring information
    df_censor = pd.read_sql_query(query_schema + """
    select co.icustay_id, min(cs.charttime) as censortime
      , ceil(extract(epoch from min(cs.charttime - co.intime))/60.0/60.0) as censortime_hours
    from dm_cohort co
    inner join mp_code_status cs
      on co.icustay_id = cs.icustay_id
    where cmo+dnr+dni+dncpr+cmo_notes > 0
    and co.excluded = 0
    group by co.icustay_id
    """, con)
elif USE_CSV:
    co = pd.read_csv('df_cohort.csv.gz')
    # convert the inclusion flags to boolean
    for c in co.columns:
        if c[0:10] == 'inclusion_':
            co[c] = co[c].astype(bool)
    df = pd.read_csv('df_data.csv.gz')
    df_static = pd.read_csv('df_static_data.csv.gz')
    df_censor = pd.read_csv('df_censor.csv.gz')
    df_death = pd.read_csv('df_death.csv.gz')
else:
    print('Must use SQL or CSV to load data!')
print(df.shape)
In [3]:
# print out the exclusions *SEQUENTIALLY* - i.e. if already excluded, don't re-print
print('Cohort - initial size: {} ICU stays'.format(co.shape[0]))
idxRem = np.zeros(co.shape[0],dtype=bool)
for c in co.columns:
    if c[0:len('exclusion_')] == 'exclusion_':
        # only count stays not already excluded by an earlier criterion
        N_REM = np.sum((co[c].values == 1) & ~idxRem)
        print('  {:5g} ({:2.2f}%) - {}'.format(N_REM, N_REM*100.0/co.shape[0], c))
        idxRem[co[c].values == 1] = True
# summarize all exclusions
N_REM = np.sum( idxRem )
print(' {:5g} ({:2.2f}%) - {}'.format(N_REM,N_REM*100.0/co.shape[0], 'all exclusions'))
print('')
print('Final cohort size: {} ICU stays ({:2.2f}%).'.format(co.shape[0] - np.sum(idxRem), (1-np.mean(idxRem))*100.0))
co = co.loc[~idxRem,:]
In [4]:
# mortality stats for base cohort
for c in co.columns:
    if c[0:len('death_')] == 'death_':
        N_ALL = co.shape[0]
        N = co.set_index('icustay_id').loc[:, c].sum()
        print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))
In [5]:
inclFcn = lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_stay_ge_24hr'],'icustay_id']
# mortality stats for base cohort
for c in co.columns:
if c[0:len('death_')]=='death_':
N_ALL = inclFcn(co).shape[0]
N = co.set_index('icustay_id').loc[inclFcn(co),c].sum()
print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))
Here is the same function written in a slightly more abstract way, with the benefit that all inclusion criteria can be specified as a list. This helps the readability of the code below.
In [6]:
inclusions = ['inclusion_only_mimicii', 'inclusion_stay_ge_24hr']
inclFcn = lambda x: x.loc[x[inclusions].all(axis=1),'icustay_id']
# mortality stats for base cohort
for c in co.columns:
    if c[0:len('death_')] == 'death_':
        N_ALL = inclFcn(co).shape[0]
        N = co.set_index('icustay_id').loc[inclFcn(co), c].sum()
        print('{:40s}{:5g} of {:5g} died ({:2.2f}%).'.format(c, N, N_ALL, N*100.0/N_ALL))
Each study has its own exclusion criteria. We define a dictionary of all exclusions, keyed by study name; where a study has multiple experiments, we append a, b, or c to the name.
Each dictionary value is a length-3 list. The first element defines the window for data extraction: a list containing a dictionary mapping each icustay_id to its window end time, the window size in hours, and the extra hours allowed for lab data. The second element is the exclusion criteria: a function which uses co or df as its input and returns the icustay_id to keep. The third element is the name of the outcome column.
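As a sketch of how one entry is structured and unpacked (using a hypothetical study name; the real entries and the window dictionaries such as time_24hr are defined in the cells below):

# hypothetical entry: [window parameters, exclusion function, outcome column]
exclusions_sketch = OrderedDict([
    ['example2017study', [[time_24hr, 24, 24],
                          lambda x: x.loc[x['inclusion_over_18'], 'icustay_id'].values,
                          'hospital_expire_flag']]
])
# unpacking an entry, as done in the evaluation loop later on
params, exclFcn, y_outcome_label = exclusions_sketch['example2017study']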
In [7]:
# first we can define the different windows: there aren't that many!
df_tmp=co.copy().set_index('icustay_id')
# admission+12 hours
time_12hr = df_tmp.copy()
time_12hr['windowtime'] = 12
time_12hr = time_12hr['windowtime'].to_dict()
# admission+24 hours
time_24hr = df_tmp.copy()
time_24hr['windowtime'] = 24
time_24hr = time_24hr['windowtime'].to_dict()
# admission+48 hours
time_48hr = df_tmp.copy()
time_48hr['windowtime'] = 48
time_48hr = time_48hr['windowtime'].to_dict()
# admission+72 hours
time_72hr = df_tmp.copy()
time_72hr['windowtime'] = 72
time_72hr = time_72hr['windowtime'].to_dict()
# admission+96 hours
time_96hr = df_tmp.copy()
time_96hr['windowtime'] = 96
time_96hr = time_96hr['windowtime'].to_dict()
# entire stay
time_all = df_tmp.copy()
time_all = time_all['dischtime_hours'].apply(np.ceil).astype(int).to_dict()
# 12 hours before the patient died/discharged
time_predeath = df_tmp.copy()
time_predeath['windowtime'] = time_predeath['dischtime_hours']
idx = time_predeath['deathtime_hours']<time_predeath['dischtime_hours']
time_predeath.loc[idx,'windowtime'] = time_predeath.loc[idx,'deathtime_hours']
# move from discharge/death time to 12 hours beforehand
time_predeath['windowtime'] = time_predeath['windowtime']-12
time_predeath = time_predeath['windowtime'].apply(np.ceil).astype(int).to_dict()
In [8]:
# example params used to extract patient data
# element 1: dictionary specifying end time of window for each patient
# element 2: size of window
# element 3: extra hours added to make it easier to get data on labs (and allows us to get labs pre-ICU)
# e.g. [time_24hr, 8, 24] is
# (1) window ends at admission+24hr
# (2) window is 8 hours long
# (3) lab window is 8+24=32 hours long
def inclFcn(x, inclusions):
    return x.loc[x[inclusions].all(axis=1), 'icustay_id']
# this one is used more than once, so we define it here
hugExclFcnMIMIC3 = lambda x: x.loc[x['inclusion_over_18']
                                   & x['inclusion_hug2009_obs']
                                   & x['inclusion_hug2009_not_nsicu_csicu']
                                   & x['inclusion_first_admission']
                                   & x['inclusion_full_code']
                                   & x['inclusion_not_brain_death']
                                   & x['inclusion_not_crf'], 'icustay_id'].values
hugExclFcn = lambda x: np.intersect1d(hugExclFcnMIMIC3(x),x.loc[x['inclusion_only_mimicii'],'icustay_id'].values)
# physionet2012 subset - not exact but close
def physChallExclFcn(x):
    out = x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_stay_ge_48hr']&x['inclusion_has_saps'],'icustay_id'].values
    out = np.sort(out)
    out = out[0:4000]
    return out
# caballero2015 used a random subsample of adult stays, resulting in 11648;
# we approximate this by taking the first 11648 icustay_id after sorting
def caballeroExclFcn(x):
    out = x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18'],'icustay_id'].values
    out = np.sort(out)
    out = out[0:11648]
    return out
np.random.seed(546345)
W_extra = 24
exclusions = OrderedDict([
['caballero2015dynamically_a', [[time_24hr, 24, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['caballero2015dynamically_b', [[time_48hr, 48, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['caballero2015dynamically_c', [[time_72hr, 72, W_extra], caballeroExclFcn, 'hospital_expire_flag']],
['calvert2016computational', [[time_predeath, 5, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_only_micu']&x['inclusion_calvert2016_obs']&x['inclusion_stay_ge_17hr']&x['inclusion_stay_le_500hr']&x['inclusion_non_alc_icd9'],'icustay_id'].values, 'hospital_expire_flag']],
['calvert2016using', [[time_predeath, 5, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_only_micu']&x['inclusion_calvert2016_obs']&x['inclusion_stay_ge_17hr']&x['inclusion_stay_le_500hr'],'icustay_id'].values, 'hospital_expire_flag']],
['celi2012database_a', [[time_72hr, 72, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_aki_icd9'],'icustay_id'].values , 'hospital_expire_flag']],
['celi2012database_b', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_sah_icd9'],'icustay_id'].values , 'hospital_expire_flag']],
['che2016recurrent_a', [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18'],'icustay_id'].values , 'death_48hr_post_icu_admit']],
['che2016recurrent_b', [[time_48hr, 48, W_extra], physChallExclFcn , 'hospital_expire_flag']],
['ding2016mortality', [[time_48hr, 48, W_extra], physChallExclFcn , 'hospital_expire_flag']],
['ghassemi2014unfolding_a', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2014unfolding_b', [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2014unfolding_c', [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['ghassemi2014unfolding_d', [[time_12hr, 12, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_stay_ge_12hr'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['ghassemi2015multivariate_a', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_gt_6_notes']&x['inclusion_stay_ge_24hr']&x['inclusion_has_saps'],'icustay_id'].values, 'hospital_expire_flag']],
['ghassemi2015multivariate_b', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_ge_100_non_stop_words']&x['inclusion_gt_6_notes']&x['inclusion_stay_ge_24hr']&x['inclusion_has_saps'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['grnarova2016neural_a', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'hospital_expire_flag']],
['grnarova2016neural_b', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['grnarova2016neural_c', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_hadm'],'icustay_id'].values, 'death_1yr_post_hos_disch']],
['harutyunyan2017multitask', [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_multiple_icustay'],'icustay_id'].values, 'hospital_expire_flag']],
['hoogendoorn2016prediction', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_hug2009_obs']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['hug2009icu', [[time_24hr, 24, W_extra], hugExclFcn, 'death_30dy_post_icu_disch']],
['johnson2012patient', [[time_48hr, 48, W_extra], physChallExclFcn, 'hospital_expire_flag']],
['johnson2014data', [[time_48hr, 48, W_extra], physChallExclFcn, 'hospital_expire_flag']],
['joshi2012prognostic', [[time_24hr, 24, W_extra], hugExclFcn, 'hospital_expire_flag']],
['joshi2016identifiable', [[time_48hr, 48, W_extra], lambda x: x.loc[x['inclusion_over_18']&x['inclusion_stay_ge_48hr'],'icustay_id'].values, 'hospital_expire_flag']],
['lee2015customization_a', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'hospital_expire_flag']],
['lee2015customization_b', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lee2015customization_c', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_lee2015_service']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_2yr_post_hos_disch']],
['lee2015personalized', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lee2017patient', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['lehman2012risk', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_stay_ge_24hr']&x['inclusion_first_admission'],'icustay_id'].values, 'hospital_expire_flag']],
['luo2016interpretable_a', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_sapsii']&x['inclusion_no_disch_summary'],'icustay_id'].values, 'death_30dy_post_hos_disch']],
['luo2016interpretable_b', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_sapsii']&x['inclusion_no_disch_summary'],'icustay_id'].values, 'death_6mo_post_hos_disch']],
['luo2016predicting', [[time_24hr, 12, W_extra], lambda x: np.intersect1d(hugExclFcn(x),x.loc[x['inclusion_stay_ge_24hr'],'icustay_id'].values) , 'death_30dy_post_icu_disch']],
['pirracchio2015mortality', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii'],'icustay_id'].values , 'hospital_expire_flag']],
['ripoll2014sepsis', [[time_24hr, 24, W_extra], lambda x: x.loc[x['inclusion_only_mimicii']&x['inclusion_over_18']&x['inclusion_has_saps']&x['inclusion_not_explicit_sepsis'],'icustay_id'].values, 'hospital_expire_flag']],
['wojtusiak2017c', [[time_all, 24, W_extra], lambda x: x.loc[x['inclusion_over_65']&x['inclusion_alive_hos_disch'],'icustay_id'].values, 'death_30dy_post_hos_disch']]
])
In [11]:
repro_stats = pd.DataFrame(None, columns=['N_Repro','Y_Repro'])
N = co.shape[0]
for current_study in exclusions:
    params, iid_keep, y_outcome_label = exclusions[current_study]
    # iid_keep is currently a function - apply it to co to get the ICUSTAY_IDs to keep for this study
    iid_keep = iid_keep(co)
    N_STUDY = iid_keep.shape[0]
    Y_STUDY = co.set_index('icustay_id').loc[iid_keep, y_outcome_label].mean()*100.0
    # print size of cohort in study
    print('{:5g} ({:5.2f}%) - Mortality = {:5.2f}% - {}'.format(
        N_STUDY, N_STUDY*100.0/N, Y_STUDY,
        current_study)
    )
    repro_stats.loc[current_study] = [N_STUDY, Y_STUDY]
With the above dataframe, repro_stats, we can compare our results to those extracted manually from the studies. We load in the manual extraction from the data subfolder, merge it with this dataframe, and output to CSV.
In [12]:
study_data = pd.read_csv('../data/study_data.csv')
study_data.set_index('Cohort',inplace=True)
# add in reproduction sample size // outcome
study_data_merged = study_data.merge(repro_stats, how='left',
left_index=True, right_index=True)
# print out the table as it was in the paper (maybe a bit more precision)
study_data_merged[ ['N_Study','N_Repro','Y_Study','Y_Repro'] ]
In [13]:
# define var_static which is used later
#TODO: should refactor so this isn't needed
var_min, var_max, var_first, var_last, var_sum, var_first_early, var_last_early, var_static = mp.vars_of_interest()
K=5
np.random.seed(871)
# get unique subject_id (this is needed later)
sid = np.sort(np.unique(df_death['subject_id'].values))
# assign k-fold
idxK_sid = np.random.permutation(sid.shape[0])
idxK_sid = np.mod(idxK_sid,K)
# get indices which map each subject_id in df_death to its position in sid
idxMap = np.searchsorted(sid, df_death['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
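Since folds are assigned per subject_id, all ICU stays from the same patient land in the same fold, which prevents information leakage between training and validation sets. A quick sanity check (a sketch using the variables defined above):

# sanity check: every subject_id maps to exactly one fold, and folds are roughly balanced
folds_per_subject = pd.DataFrame({'subject_id': df_death['subject_id'].values, 'fold': idxK})
assert (folds_per_subject.groupby('subject_id')['fold'].nunique() == 1).all()
print('Subjects per fold:', np.bincount(idxK_sid, minlength=K))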
The below code cell trains and evaluates models for a single study. The two models at the moment are gradient boosting (XGBoost) and logistic regression (scikit-learn).
In [14]:
# pick the study to run the example on
current_study = 'celi2012database_b'
# Rough timing info:
# rf - 3 seconds per fold
# xgb - 30 seconds per fold
# logreg - 4 seconds per fold
# lasso - 8 seconds per fold
models = OrderedDict([
['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
#['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
#['rf', RandomForestClassifier()],
['logreg', LogisticRegression(fit_intercept=True)]
])
print('')
print('==================== {} =========='.format('='*len(current_study)))
print('========== BEGINNING {} =========='.format(current_study))
print('==================== {} =========='.format('='*len(current_study)))
params = exclusions[current_study][0]
df_data = mp.get_design_matrix(df, params[0], W=params[1], W_extra=params[2])
# apply the study's inclusion criteria to get the icustay_id to keep
iid_keep = exclusions[current_study][1](co)
print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
df_data.shape[0], iid_keep.shape[0], iid_keep.shape[0]*100.0 / df_data.shape[0]))
df_data = df_data.loc[iid_keep,:]
print('')
y_outcome_label = exclusions[current_study][2]
# load the data into a numpy array
# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)
# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static
mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()
for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list()  # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()
    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])
    else:
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                                  strategy="mean",
                                                  axis=0)),
                              ("scaler", StandardScaler()),
                              (mdl, models[mdl])])
    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :], y[idxK != k])
        # get predictions on the held-out kth fold
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:, 1]
        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])
        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
        # add score to list of scores
        results_val[mdl].append(curr_score)
        # save the current model
        mdl_val[mdl].append(curr_mdl)
        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
The below code block is identical to the above, except that it loops over all of the studies evaluated. It takes a while: with 5-fold validation across all study configurations, it trains ~150 models of each type. The final AUROCs are written to the results.txt file. The final models/results/predictions/targets are saved in dictionaries with the suffix _all.
In [15]:
mdl_val_all = dict()
results_val_all = dict()
pred_val_all = dict()
tar_val_all = dict()
# Rough timing info:
# rf - 3 seconds per fold
# xgb - 30 seconds per fold
# logreg - 4 seconds per fold
# lasso - 8 seconds per fold
models = OrderedDict([
['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
#['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
#['rf', RandomForestClassifier()],
['logreg', LogisticRegression(fit_intercept=True)]
])
with open('results.txt', 'w') as fp:
    fp.write('StudyName,SampleSize,Outcome')
    for mdl in models:
        fp.write(',{}'.format(mdl))
    fp.write('\n')
for current_study in exclusions:
    print('\n==================== {} =========='.format('='*len(current_study)))
    print('========== BEGINNING {} =========='.format(current_study))
    print('==================== {} =========='.format('='*len(current_study)))
    params = exclusions[current_study][0]
    df_data = mp.get_design_matrix(df, params[0], W=params[1], W_extra=params[2])
    # apply the study's inclusion criteria to get the icustay_id to keep
    iid_keep = exclusions[current_study][1](co)
    print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
        df_data.shape[0], iid_keep.shape[0], iid_keep.shape[0]*100.0 / df_data.shape[0]))
    df_data = df_data.loc[iid_keep, :]
    print('')
    y_outcome_label = exclusions[current_study][2]
    # load the data into a numpy array
    # first, the static vars from df_static
    X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
    # next, add in the outcome
    X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
    # map the above K-fold indices to this dataset
    X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
    # get indices which map each subject_id in X to its position in sid
    idxMap = np.searchsorted(sid, X['subject_id'].values)
    # use these indices to map the k-fold integers
    idxK = idxK_sid[idxMap]
    # drop the subject_id column
    X.drop('subject_id', axis=1, inplace=True)
    # convert to numpy data (the target is the last column)
    X = X.values
    y = X[:, -1]
    X = X[:, 0:-1]
    X_header = [x for x in df_data.columns.values] + var_static
    mdl_val = dict()
    results_val = dict()
    pred_val = dict()
    tar_val = dict()
    for mdl in models:
        print('=============== {} ==============='.format(mdl))
        mdl_val[mdl] = list()
        results_val[mdl] = list()  # initialize list for scores
        pred_val[mdl] = list()
        tar_val[mdl] = list()
        if mdl == 'xgb':
            # no pre-processing of data necessary for xgb
            estimator = Pipeline([(mdl, models[mdl])])
        else:
            estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                                      strategy="mean",
                                                      axis=0)),
                                  ("scaler", StandardScaler()),
                                  (mdl, models[mdl])])
        for k in range(K):
            # train the model using all but the kth fold
            curr_mdl = estimator.fit(X[idxK != k, :], y[idxK != k])
            # get predictions on the held-out kth fold
            if mdl == 'lasso':
                curr_prob = curr_mdl.predict(X[idxK == k, :])
            else:
                curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
                curr_prob = curr_prob[:, 1]
            pred_val[mdl].append(curr_prob)
            tar_val[mdl].append(y[idxK == k])
            # calculate score (AUROC)
            curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
            # add score to list of scores
            results_val[mdl].append(curr_score)
            # save the current model
            mdl_val[mdl].append(curr_mdl)
            print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
    # store the above dicts under the study name
    # we will re-use them later when summarizing model performance
    mdl_val_all[current_study] = mdl_val
    results_val_all[current_study] = results_val
    pred_val_all[current_study] = pred_val
    tar_val_all[current_study] = tar_val
    # append results to file
    with open('results.txt', 'a') as fp:
        # write study name, sample size and frequency of outcome
        fp.write('{},{},{:2.2f}'.format(current_study, X.shape[0], np.mean(y)*100.0))
        for i, mdl in enumerate(models):
            fp.write(',{:0.6f}'.format(np.mean(results_val[mdl])))
        fp.write('\n')
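Since results.txt is written as comma-separated text, it can be read back with pandas for a quick look (a sketch):

# read the per-study AUROC summary back into a dataframe
results_summary = pd.read_csv('results.txt')
display(results_summary)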
Move the results from the above dictionaries into a single dataframe.
In [20]:
mdl_list = list(models.keys())
study_data = pd.read_csv('../data/study_data.csv')
study_data.set_index('Cohort',inplace=True)
# add in reproduction stats from earlier
study_data_merged = study_data.merge(repro_stats, how='left',
left_index=True, right_index=True)
# add in AUROCs
for current_study in results_val_all:
    results_val = results_val_all[current_study]
    for mdl in results_val:
        study_data_merged.loc[current_study, mdl] = np.mean(results_val[mdl])
columns = ['Outcome','N_Study','N_Repro','Y_Study','Y_Repro','AUROC_Study'] + mdl_list
display(HTML(study_data_merged[columns].to_html()))
study_data_merged.to_csv('results_final.csv')
The below code block builds a "baseline" model. This model has no exclusions past the base cohort exclusions. K-fold validation is done on the patient level to ensure no information leakage between training/validation sets. The outcome is in-hospital mortality, and the data window used is the first 24 hours. Labs are extracted from up to 24 hours before the ICU admission (this is defined by W_extra=24).
In [26]:
W = 24 # window size
W_extra = 24 # extra time backward for labs
y_outcome_label = 'death_in_hospital'
# admission+W hours
df_tmp=co.copy().set_index('icustay_id')
time_dict = df_tmp.copy()
time_dict['windowtime'] = W
time_dict = time_dict['windowtime'].to_dict()
# Rough timing info:
# rf - 3 seconds per fold
# xgb - 30 seconds per fold
# logreg - 4 seconds per fold
# lasso - 8 seconds per fold
models = OrderedDict([
['xgb', xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)],
#['lasso', LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000)],
#['rf', RandomForestClassifier()],
['logreg', LogisticRegression(fit_intercept=True)]
])
In [27]:
CENSOR_FLAG=False
current_study = 'baseline'
print('')
print('==================== {} =========='.format('='*len(current_study)))
print('========== BEGINNING {} =========='.format(current_study))
print('==================== {} =========='.format('='*len(current_study)))
# optionally remove patients who were made DNR/CMO within the first 24 hours
if CENSOR_FLAG:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']
                              & ((x['censortime_hours'].isnull()) | (x['censortime_hours'] >= 24)),
                              'icustay_id'].values
else:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr'], 'icustay_id'].values
df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)
iid_keep = exclFcn(co)
N_NEW=df_data.loc[iid_keep,:].shape[0]
print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
    co.shape[0], N_NEW, N_NEW*100.0 / co.shape[0]))
df_data = df_data.loc[iid_keep,:]
print('')
# load the data into a numpy array
# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)
# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static
mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()
for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list()  # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()
    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])
    else:
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                                  strategy="mean",
                                                  axis=0)),
                              ("scaler", StandardScaler()),
                              (mdl, models[mdl])])
    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :], y[idxK != k])
        # get predictions on the held-out kth fold
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:, 1]
        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])
        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
        # add score to list of scores
        results_val[mdl].append(curr_score)
        # save the current model
        mdl_val[mdl].append(curr_mdl)
        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
# print final results
print('')
print('StudyName,SampleSize', end='')
for mdl in models:
    print(',{}'.format(mdl), end='')
print('')
print('{},{}'.format(current_study, X.shape[0]), end='')
for i, mdl in enumerate(models):
    print(',{:0.6f}'.format(np.mean(results_val[mdl])), end='')
print('\n')
Patients who choose to have their care withdrawn receive palliative measures in the ICU. These patients show markedly different physiology from those undergoing full interventions, and a model which synthesizes severity of illness should not incorporate their data. Here we remove data for patients from the time of their withdrawal of care. If this occurs before the end of the first 24 hours of the ICU admission, we remove the patient entirely.
In [28]:
CENSOR_FLAG = True
current_study = 'baseline_withdrawal'
print('==================== {} =========='.format('='*len(current_study)))
print('========== BEGINNING {} =========='.format(current_study))
print('==================== {} =========='.format('='*len(current_study)))
# optionally remove patients who were DNR in first 24hrs
if CENSOR_FLAG:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr']
                              & ((x['censortime_hours'].isnull()) | (x['censortime_hours'] >= 24)),
                              'icustay_id'].values
else:
    exclFcn = lambda x: x.loc[x['inclusion_stay_ge_24hr'], 'icustay_id'].values
df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)
iid_keep = exclFcn(co)
N_NEW=df_data.loc[iid_keep,:].shape[0]
print('Reducing sample size from {} to {} ({:2.2f}%).'.format(
    co.shape[0], N_NEW, N_NEW*100.0 / co.shape[0]))
df_data = df_data.loc[iid_keep,:]
print('')
# load the data into a numpy array
# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(co.set_index('icustay_id')[[y_outcome_label]], left_index=True, right_index=True)
# map above K-fold indices to this dataset
X = X.merge(co.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# get indices which map subject_ids in sid to the X dataframe
idxMap = np.searchsorted(sid, X['subject_id'].values)
# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]
# drop the subject_id column
X.drop('subject_id',axis=1,inplace=True)
# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]
X_header = [x for x in df_data.columns.values] + var_static
mdl_val = dict()
results_val = dict()
pred_val = dict()
tar_val = dict()
for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list()  # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()
    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])
    else:
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                                  strategy="mean",
                                                  axis=0)),
                              ("scaler", StandardScaler()),
                              (mdl, models[mdl])])
    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = estimator.fit(X[idxK != k, :], y[idxK != k])
        # get predictions on the held-out kth fold
        if mdl == 'lasso':
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:, 1]
        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idxK == k])
        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)
        # add score to list of scores
        results_val[mdl].append(curr_score)
        # save the current model
        mdl_val[mdl].append(curr_mdl)
        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
# print final results
print('')
print('StudyName,SampleSize', end='')
for mdl in models:
    print(',{}'.format(mdl), end='')
print('')
print('{},{}'.format(current_study, X.shape[0]), end='')
for i, mdl in enumerate(models):
    print(',{:0.6f}'.format(np.mean(results_val[mdl])), end='')
print('\n')