In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
from sklearn.pipeline import Pipeline

# used for train/test splits
from sklearn.model_selection import train_test_split

# used to impute the mean for missing data
from sklearn.impute import SimpleImputer

# logistic regression is our model of choice
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

# used to calculate AUROC/accuracy
from sklearn import metrics

# used to create confusion matrix
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import cross_val_score


%matplotlib inline

In [2]:
# Connect to MIMIC
# be sure to add the password as appropriate!
con = psycopg2.connect(dbname='MIMIC', user='workshop', password=''
                       , host='<xxxxx>.amazonaws.com'
                       , port=5432)
cur = con.cursor()
cur.execute('SET search_path TO mimiciii_workshop')
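
A quick sanity check that the connection and search path are working (a minimal sketch; it simply counts rows in the icustays table used below):

print(pd.read_sql_query('select count(*) from icustays', con))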

In [3]:
query = """
with ce as
(
  select
    icustay_id, charttime, itemid, valuenum
  from chartevents
  -- specify what data we want from chartevents
  where itemid in
  (
  211, -- Heart Rate
  618, -- Respiratory Rate
  615  -- Resp Rate (Total)
  )
  -- how did we know heart rates were stored using ITEMID 211? Simple, we looked in D_ITEMS!
  -- Try it for yourself: select * from d_items where lower(label) like '%heart rate%'
)
select
  -- ICUSTAY_ID identifies each unique patient ICU stay
  -- note that if the same person stays in the ICU more than once, each stay would have a *different* ICUSTAY_ID
  -- however, since it's the same person, all those stays would have the same SUBJECT_ID
  ie.icustay_id

  -- this is the outcome of interest: in-hospital mortality
  , max(adm.HOSPITAL_EXPIRE_FLAG) as OUTCOME

  -- this is a case statement - essentially an "if, else" clause
  , min(
      case
        -- if the itemid is 211
        when itemid = 211
          -- then return the actual value stored in VALUENUM
          then valuenum
        -- otherwise, return NULL, which is how SQL represents a missing value
        else null
      -- end the case statement
      end
    ) as HeartRate_Min

    -- note we wrapped the above in "min()"
    -- this takes the minimum of all values inside, and *ignores* nulls
    -- by calling this on our case statement, we are ignoring all values except those with ITEMID = 211
    -- since ITEMID 211 are heart rates, we take the minimum of only heart rates

  , max(case when itemid = 211 then valuenum else null end) as HeartRate_Max
  , min(case when itemid in (615,618) then valuenum else null end) as RespRate_Min
  , max(case when itemid in (615,618) then valuenum else null end) as RespRate_Max
from icustays ie

-- join to the admissions table to get hospital outcome
inner join admissions adm
  on ie.hadm_id = adm.hadm_id

-- join to the chartevents table to get the observations
left join ce
  -- match the tables on the patient identifier
  on ie.icustay_id = ce.icustay_id
  -- and require that the observation be made after the patient is admitted to the ICU
  and ce.charttime >= ie.intime
  -- and *before* their admission time + 1 day, i.e. the observation must be made on their first day in the ICU
  and ce.charttime <= ie.intime + interval '1' day
group by ie.icustay_id
order by ie.icustay_id
"""

data = pd.read_sql_query(query, con)
print(data.head())


   icustay_id  outcome  heartrate_min  heartrate_max  resprate_min  \
0      200006        0             62             84            14   
1      200030        0             83            115            11   
2      200068        0             67            112            20   
3      200071        0            118            130            16   
4      200102        1             71             87            13   

   resprate_max  
0            27  
1            28  
2            24  
3            25  
4            32  
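
As the comment in the query suggests, the heart rate ITEMIDs were found by searching the D_ITEMS dictionary table. A minimal sketch of that lookup (run it before the connection is closed below; no query parameters are passed, so the % wildcard needs no escaping):

d_items_check = pd.read_sql_query(
    "select itemid, label from d_items where lower(label) like '%heart rate%'", con)
print(d_items_check)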

In [4]:
# close the connection as we are done loading data from server
cur.close()
con.close()

In [5]:
# move from the data frame into numpy arrays:
# y is the outcome column; X holds the features, i.e. everything
# except the first two columns (the ID and the outcome)
y = data['outcome'].values
X = data.drop(['icustay_id', 'outcome'], axis=1).values
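
Because the query used a LEFT JOIN, ICU stays with no charted vitals on the first day come through as missing values, which is why the next cell imputes the mean. A quick check of how much is missing (a short sketch on the data frame loaded above):

print(data.isnull().sum())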

In [6]:
# evaluate a logistic regression model using an 80%-20% training/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# impute the column mean for missing values
# note: the imputer is fit on the training set only, then applied to both
# sets, so no information from the test set leaks into training
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train)

X_train = imp.transform(X_train)
X_test = imp.transform(X_test)

model = LogisticRegression(fit_intercept=True)
model = model.fit(X_train, y_train)

# predict class labels for the test set
y_pred = model.predict(X_test)

# generate class probabilities
y_prob = model.predict_proba(X_test)

# generate evaluation metrics
print('Accuracy = {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('AUROC = {}'.format(metrics.roc_auc_score(y_test, y_prob[:, 1])))

print('\nConfusion matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('\nClassification report')
print(metrics.classification_report(y_test, y_pred))


Accuracy = 0.784267912773
AUROC = 0.642288212031

Confusion matrix
[[977  17]
 [260  30]]

Classification report
             precision    recall  f1-score   support

        0.0       0.79      0.98      0.88       994
        1.0       0.64      0.10      0.18       290

avg / total       0.76      0.78      0.72      1284
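
The AUROC above summarizes the entire ROC curve in one number. Since matplotlib is already loaded, here is a short sketch that draws the curve itself, reusing y_test and y_prob from the cell above:

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob[:, 1])
plt.plot(fpr, tpr, label='Logistic regression')
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False positive rate (1 - specificity)')
plt.ylabel('True positive rate (sensitivity)')
plt.legend(loc='lower right')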


In [8]:
# evaluate a logistic regression with L1 regularization

# evaluate the model using 5-fold cross-validation
# note this is *nested* cross-validation: LogisticRegressionCV tunes the
# regularization strength on inner folds, while cross_val_score below
# measures performance on outer folds
# see: http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# for the list of scoring parameters

# a Pipeline chains the imputer and the regression together, so the imputer
# is re-fit on the training folds of each cross-validation split and no
# information leaks in from the held-out fold
estimator = Pipeline([("imputer", SimpleImputer(missing_values=np.nan,
                                                strategy="mean")),
                      ("regression", LogisticRegressionCV(penalty='l1',
                                                          cv=5,
                                                          scoring='roc_auc',
                                                          solver='liblinear'))])

scores = cross_val_score(estimator
                         , X, y
                         , scoring='roc_auc', cv=5)


print('AUROC for all folds:')
print(scores)
print('Average AUROC across folds:')
print(scores.mean())


AUROC for all folds:
[ 0.632241    0.66711432  0.65462583  0.63505984  0.64856111]
Average AUROC across folds:
0.647520418729
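
To see which vital signs the L1-regularized model actually keeps, one option is to fit the pipeline on the full data set and inspect the coefficients. A sketch, assuming the feature order matches the data frame columns printed earlier:

estimator.fit(X, y)
feature_names = ['heartrate_min', 'heartrate_max', 'resprate_min', 'resprate_max']
for name, coef in zip(feature_names, estimator.named_steps['regression'].coef_[0]):
    print('{:15s} {:+.4f}'.format(name, coef))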