In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
from sklearn.pipeline import Pipeline
# used for train/test splits
from sklearn.model_selection import train_test_split
# used to impute the column mean for missing data
from sklearn.impute import SimpleImputer
# logistic regression is our model of choice
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
# used to calculate AUROC/accuracy
from sklearn import metrics
# used to create confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
%matplotlib inline
In [2]:
# Connect to MIMIC
# be sure to add the password as appropriate!
con = psycopg2.connect(dbname='MIMIC', user='workshop', password=''
, host='<xxxxx>.amazonaws.com'
, port=5432)
cur = con.cursor()
cur.execute('SET search_path TO mimiciii_workshop')
In [3]:
query = """
with ce as
(
select
icustay_id, charttime, itemid, valuenum
from chartevents
-- specify what data we want from chartevents
where itemid in
(
211, -- Heart Rate
618, -- Respiratory Rate
615 -- Resp Rate (Total)
)
-- how did we know heart rates were stored using ITEMID 211? Simple, we looked in D_ITEMS!
-- Try it for yourself: select * from d_items where lower(label) like '%heart rate%'
-- (a runnable version of this lookup is sketched just after this cell)
)
select
-- ICUSTAY_ID identifies each unique patient ICU stay
-- note that if the same person stays in the ICU more than once, each stay would have a *different* ICUSTAY_ID
-- however, since it's the same person, all those stays would have the same SUBJECT_ID
ie.icustay_id
-- this is the outcome of interest: in-hospital mortality
, max(adm.HOSPITAL_EXPIRE_FLAG) as OUTCOME
-- this is a case statement - essentially an "if, else" clause
, min(
case
-- if the itemid is 211
when itemid = 211
-- then return the actual value stored in VALUENUM
then valuenum
-- otherwise, return NULL, which is SQL's marker for a missing value
else null
-- end the case statement
end
) as HeartRate_Min
-- note we wrapped the above in "min()"
-- this takes the minimum of all values inside, and *ignores* nulls
-- by calling this on our case statement, we are ignoring all values except those with ITEMID = 211
-- since ITEMID 211 are heart rates, we take the minimum of only heart rates
, max(case when itemid = 211 then valuenum else null end) as HeartRate_Max
, min(case when itemid in (615,618) then valuenum else null end) as RespRate_Min
, max(case when itemid in (615,618) then valuenum else null end) as RespRate_Max
from icustays ie
-- join to the admissions table to get hospital outcome
inner join admissions adm
on ie.hadm_id = adm.hadm_id
-- join to the chartevents table to get the observations
left join ce
-- match the tables on the ICU stay identifier
on ie.icustay_id = ce.icustay_id
-- and require that the observation be made after the patient is admitted to the ICU
and ce.charttime >= ie.intime
-- and *before* their ICU admission time + 1 day, i.e. the observation must be made on their first day in the ICU
and ce.charttime <= ie.intime + interval '1' day
group by ie.icustay_id
order by ie.icustay_id
"""
data = pd.read_sql_query(query, con)
print(data.head())
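As an aside, the D_ITEMS lookup suggested in the query comments can be run against the same connection while it is still open. A minimal sketch (the variable name itemid_query is illustrative, not from the original notebook):

# look up which ITEMIDs correspond to heart rate measurements
itemid_query = """
select itemid, label
from d_items
where lower(label) like '%heart rate%'
"""
print(pd.read_sql_query(itemid_query, con))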
In [4]:
# close the connection as we are done loading data from server
cur.close()
con.close()
In [5]:
# move from a data frame into a numpy array
X = data.values
# the outcome is stored in the second column
y = X[:, 1]
# delete the first 2 columns: the ID and the outcome
X = np.delete(X, [0, 1], axis=1)
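If you prefer to keep column names in play, the same split can be done directly in pandas. A sketch, assuming Postgres folded the unquoted column aliases to lowercase (icustay_id, outcome, heartrate_min, ...), which is its default behaviour:

# equivalent selection by column name rather than position
y = data['outcome'].values
X = data.drop(columns=['icustay_id', 'outcome']).values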
In [6]:
# evaluate a logistic regression model using an 80%-20% training/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# impute the column mean for missing values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train)
X_train = imp.transform(X_train)
X_test = imp.transform(X_test)
model = LogisticRegression(fit_intercept=True)
model = model.fit(X_train, y_train)
# predict class labels for the test set
y_pred = model.predict(X_test)
# generate class probabilities
y_prob = model.predict_proba(X_test)
# generate evaluation metrics
print('Accuracy = {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('AUROC = {}'.format(metrics.roc_auc_score(y_test, y_prob[:, 1])))
print('\nConfusion matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('\nClassification report')
print(metrics.classification_report(y_test, y_pred))
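Since matplotlib is already imported, it is worth seeing the ROC curve that the AUROC number summarizes. A quick sketch using only the y_test and y_prob computed above:

# trace out the ROC curve across all classification thresholds
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob[:, 1])
plt.plot(fpr, tpr, label='Logistic regression')
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()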
In [7]:
# evaluate a logistic regression with L1 regularization
# evaluate the model using 5-fold cross-validation
# see: http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# for list of scoring parameters
estimator = Pipeline([("imputer", SimpleImputer(missing_values=np.nan,
                                                strategy="mean")),
                      ("regression", LogisticRegressionCV(penalty='l1',
                                                          cv=5,
                                                          scoring='roc_auc',
                                                          solver='liblinear'))])
scores = cross_val_score(estimator
, X, y
, scoring='roc_auc', cv=5)
print('AUROC for all folds:')
print(scores)
print('Average AUROC across folds:')
print(scores.mean())
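One advantage of the L1 penalty is sparsity: coefficients of uninformative features are driven to exactly zero. To see which vitals survive, the pipeline can be refit on the full data and its coefficients inspected. A sketch, assuming the features kept their order from the SQL select (feature_names is an illustrative variable, not from the original notebook):

# refit on all data and inspect the regularized coefficients
estimator.fit(X, y)
feature_names = ['HeartRate_Min', 'HeartRate_Max', 'RespRate_Min', 'RespRate_Max']
coefs = estimator.named_steps['regression'].coef_.ravel()
for name, coef in zip(feature_names, coefs):
    print('{}: {:+.4f}'.format(name, coef))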