We develop a few models for predicting mortality in the ICU based on labs and vital signs in the first 24 hours of entry to the ICU. We then use the ml_insights
package to visualize the effects of the different features in the model, and to be able to explain the main factors that contributed to a patient being high or low risk.
*MIMIC-III, a freely accessible critical care database. Johnson AEW, Pollard TJ, Shen L, Lehman L, Feng M, Ghassemi M, Moody B, Szolovits P, Celi LA, and Mark RG. Scientific Data (2016). https://mimic.physionet.org
In [1]:
# "pip install ml_insights" in terminal if needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ml_insights as mli
%matplotlib inline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
In [2]:
# Load dataset derived from the MMIC database
lab_aug_df = pd.read_csv("data/lab_vital_icu_table.csv")
In [3]:
lab_aug_df.head(10)
Out[3]:
In [4]:
X = lab_aug_df.loc[:,['aniongap_min', 'aniongap_max',
'albumin_min', 'albumin_max', 'bicarbonate_min', 'bicarbonate_max',
'bilirubin_min', 'bilirubin_max', 'creatinine_min', 'creatinine_max',
'chloride_min', 'chloride_max',
'hematocrit_min', 'hematocrit_max', 'hemoglobin_min', 'hemoglobin_max',
'lactate_min', 'lactate_max', 'platelet_min', 'platelet_max',
'potassium_min', 'potassium_max', 'ptt_min', 'ptt_max', 'inr_min',
'inr_max', 'pt_min', 'pt_max', 'sodium_min', 'sodium_max', 'bun_min',
'bun_max', 'wbc_min', 'wbc_max','sysbp_max', 'sysbp_mean', 'diasbp_min', 'diasbp_max', 'diasbp_mean',
'meanbp_min', 'meanbp_max', 'meanbp_mean', 'resprate_min',
'resprate_max', 'resprate_mean', 'tempc_min', 'tempc_max', 'tempc_mean',
'spo2_min', 'spo2_max', 'spo2_mean']]
y = lab_aug_df['hospital_expire_flag']
In [5]:
# Impute the median for in each column to replace NA's
median_vec = [X.iloc[:,i].median() for i in range(len(X.columns))]
for i in range(len(X.columns)):
X.iloc[:,i].fillna(median_vec[i],inplace=True)
In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=9942)
In [7]:
gbmodel1 = GradientBoostingClassifier(n_estimators = 1000,
learning_rate = .005,
max_depth = 7)
gbmodel1.fit(X_train,y_train)
Out[7]:
In [8]:
test_res = np.round(gbmodel1.predict_proba(X_test),decimals=3)[:,1]
In [9]:
roc_auc_score(y_test,test_res)
Out[9]:
In [10]:
# Side by side histograms showing scores of positive vs negative cases
fig, ax = plt.subplots(1,2, figsize = (8,4))
ax[0].hist(test_res[np.where(y_test==0)],bins=20,range=[0,1]);
ax[1].hist(test_res[np.where(y_test==1)],bins=20,range=[0,1]);
In [11]:
mxr = mli.ModelXRay(gbmodel1, X_test.iloc[:500,:])
In [12]:
mxr.feature_effect_summary(num_features=55)
In [13]:
mxr.feature_effect_summary(num_features=15)
In [14]:
indices = mxr.feature_dependence_plots()
In [15]:
indices
Out[15]:
In [16]:
mxr.explain_prediction_difference(311,388,tol=.05)
Out[16]:
Now we do the same thing for a Random Forest Model
In [17]:
rfmodel1 = RandomForestClassifier(n_estimators = 500,class_weight='balanced')
rfmodel1.fit(X_train,y_train)
Out[17]:
In [18]:
test_res_rf = np.round(rfmodel1.predict_proba(X_test),decimals=3)[:,1]
roc_auc_score(y_test,test_res_rf)
Out[18]:
In [19]:
mxr_rf = mli.ModelXRay(rfmodel1, X_test.iloc[:500,:])
In [20]:
mxr_rf.feature_effect_summary(num_features=15)
In [21]:
indices
Out[21]:
In [22]:
mxr_rf.feature_dependence_plots(pts_selected = indices)
Out[22]: