In [19]:
# TO RE-RUN: wipe the kernel namespace so the notebook starts from a clean
# state (equivalent to a kernel restart for user variables; every cell
# below, including the imports, must then be executed again).
%reset -f

In [20]:
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

from operator import truediv
from datetime import datetime
import pandas as pd
import time
import os

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt


np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')

%matplotlib inline

import sys
sys.path.insert(1, "../../src/") #"/home/ilmira/healthforecast/readmission/src/"
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter

In [21]:
# Experiment configuration; the trailing comments list the valid options.
# NOTE: the "all_readmisssion_vs_none" spelling (triple 's') is intentional —
# it is a runtime string that must match the comparisons in the cells below.
typeEncounter = "last" # ['first','last']
typeHypothesis = "all_readmisssion_vs_none" # ['all_readmisssion_vs_none','early_readmission_vs_none']
typeDataFeatures = "extended" # ['minimum','extended']
typeDisease = "subset" # ["subset","individual"]

In [22]:
# Load the cleaned dataset pickle matching the selected encounter type and
# feature set. Paths are relative to the notebook's working directory.
if typeDataFeatures == "minimum":
    df_all=pd.read_pickle(os.path.join('resources','clean_data_' + typeEncounter + '_hyp_1.pkl'))
if typeDataFeatures == "extended":
    df_all=pd.read_pickle(os.path.join('resources','clean_data_' + typeEncounter + '_hyp_1_' + typeDataFeatures + '.pkl'))

# Quick inspection: shape, columns, and absolute/relative target distribution
print df_all.shape
print df_all.columns
print df_all.readmitted.value_counts()
print df_all.readmitted.value_counts()/float(df_all.shape[0])


(71518, 30)
Index([u'diss_1', u'adm_src_ref', u'adm_src_em', u'race_AfricanAmerican',
       u'race_Caucasian', u'race_Other', u'medSpec_cardio',
       u'medSpec_Family/GeneralPractice', u'medSpec_InternalMedicine',
       u'medSpec_surgery', u'age_cat', u'Diabetis', u'Circulatory',
       u'Digestive', u'Genitourinary', u'Poisoning', u'Muscoskeletal',
       u'Neoplasms', u'Respiratory', u'HbA1c', u'Change', u'time_in_hospital',
       u'num_lab_procedures', u'num_procedures', u'num_medications',
       u'number_outpatient', u'number_emergency', u'number_inpatient',
       u'number_diagnoses', u'readmitted'],
      dtype='object')
0    42985
2    22240
1     6293
Name: readmitted, dtype: int64
0   0.60
2   0.31
1   0.09
Name: readmitted, dtype: float64

Compute class label


In [23]:
# Sanity check: the raw target has three classes (0 = not readmitted;
# 1 and 2 are readmission categories — presumably early vs. later
# readmission, see the hypothesis cell below; TODO confirm encoding).
print df_all.loc[:,"readmitted"].sort_values().unique(), \
    np.sum(df_all["readmitted"] == 0), \
    np.sum(df_all["readmitted"] == 1), \
    np.sum(df_all["readmitted"] == 2)


[0 1 2] 42985 6293 22240

In [24]:
# Readmitted none vs readmitted
print typeHypothesis

if typeHypothesis == "all_readmisssion_vs_none":
    df_all["readmitted"][df_all["readmitted"].values > 0] = 1
    print df_all.iloc[:,-1].sort_values().unique(), \
            np.sum(df_all["readmitted"] == 0), \
            np.sum(df_all["readmitted"] == 1)
            
# Readmitted none vs early readmitted            
if typeHypothesis == "early_readmission_vs_none":
    df_all= df_all[df_all["readmitted"].isin([0,1])]
    print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)


all_readmisssion_vs_none
[0 1] 42985 28533
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """

Compute type fields


In [25]:
if typeDataFeatures == "minimum":
    numCols = ['time_in_hospital']
    
if typeDataFeatures == "extended":
    numCols = ['time_in_hospital','num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 
            'number_emergency', 'number_inpatient', 'number_diagnoses']
    
catCols = []
cols = df_all.columns
reducedCols = cols[:-1]

for i in range(len(cols)-1):
    if cols[i] not in numCols:
        catCols.append(1)
    else:
        catCols.append(0)
catCols = np.array(catCols)

print "Cat cols:", np.sum(catCols==1), "\n", reducedCols[catCols==1]
print "Num cols:", np.sum(catCols==0), "\n", reducedCols[catCols==0]
print len(reducedCols)


Cat cols: 21 
Index([u'diss_1', u'adm_src_ref', u'adm_src_em', u'race_AfricanAmerican',
       u'race_Caucasian', u'race_Other', u'medSpec_cardio',
       u'medSpec_Family/GeneralPractice', u'medSpec_InternalMedicine',
       u'medSpec_surgery', u'age_cat', u'Diabetis', u'Circulatory',
       u'Digestive', u'Genitourinary', u'Poisoning', u'Muscoskeletal',
       u'Neoplasms', u'Respiratory', u'HbA1c', u'Change'],
      dtype='object')
Num cols: 8 
Index([u'time_in_hospital', u'num_lab_procedures', u'num_procedures',
       u'num_medications', u'number_outpatient', u'number_emergency',
       u'number_inpatient', u'number_diagnoses'],
      dtype='object')
29

Compute partition (train, test)


In [26]:
ts_thr = 0.30  # fraction of samples held out for the final test set

In [27]:
# Stratified 70/30 train/test split. Note: this is the pre-0.18
# sklearn.cross_validation API, where StratifiedShuffleSplit takes y in the
# constructor and the object is iterated directly (single iteration here).
y = df_all.readmitted
print y.unique()
print y.value_counts()
y = y.values

# Features are every column except the last ('readmitted')
X = df_all.iloc[:,:-1].values
sss = StratifiedShuffleSplit(y, 1, test_size=ts_thr, random_state=32) #random_state=42
for train_index, test_index in sss:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Report split sizes and class balance (should both reflect ~60/40)
print "\nTRAIN:"
print X_train.shape, y_train.shape
print np.sum(y_train == 0), round(np.sum(y_train == 0)/float(y_train.shape[0]),2), \
      np.sum(y_train > 0), round(np.sum(y_train > 0)/float(y_train.shape[0]),2)
print "\nTEST:"
print X_test.shape, y_test.shape
print np.sum(y_test == 0), round(np.sum(y_test == 0)/float(y_test.shape[0]),2), \
      np.sum(y_test > 0), round(np.sum(y_test > 0)/float(y_test.shape[0]),2)


[1 0]
0    42985
1    28533
Name: readmitted, dtype: int64

TRAIN:
(50062, 29) (50062,)
30089 0.6 19973 0.4

TEST:
(21456, 29) (21456,)
12896 0.6 8560 0.4

In [28]:
tr_thr = 0.20  # fraction of the training split kept for model selection (subsampled below)

In [29]:
# Subsample the training set down to tr_thr (20%) of its size, stratified on
# the label, to keep the grid searches below tractable.
# WARNING: X_train/y_train are overwritten in place, so this cell is NOT
# idempotent — re-running it shrinks the training set again; re-run the
# split cell above first to reset.
sss = StratifiedShuffleSplit(y_train, 1, test_size=1-tr_thr, random_state=32) #random_state=42
for train_index, test_index in sss:
    X_train = X_train[train_index]
    y_train = y_train[train_index]

print "TRAIN:"
print X_train.shape, y_train.shape
print np.sum(y_train == 0), round(np.sum(y_train == 0)/float(y_train.shape[0]),2), \
      np.sum(y_train > 0), round(np.sum(y_train > 0)/float(y_train.shape[0]),2)


TRAIN:
(10012, 29) (10012,)
6018 0.6 3994 0.4

Config pipelines


In [30]:
verbose = True  # passed to GridSearchCV
cv_thr = 0.3    # validation fraction within each CV iteration
cv_folds = 5    # number of StratifiedShuffleSplit iterations per CV

# Experiment grid dimensions; the trailing comments list the full option sets.
# NOTE(review): "rfe_rf_fs", "lasso_fs" and "nn" require imports (RFE,
# SelectFromModel, MLPClassifier) that are missing from the imports cell —
# confirm before enabling them.
fs_methods = ["none"] #["none","combine_fs","lasso_fs","rfe_rf_fs"]
cls_methods = ["logReg","rf","svmRBF"] #["rf","svmRBF","logReg","knn","nn"]
lms = ["roc_auc","recall"] #["f1_weighted","precision_weighted"]
sm_types = ["none","after"] #["none","after"]
sm_method = "sm_smote"  # step name used for the SMOTE stage and its param grid
tr_size = tr_thr        # recorded in the results table

In [31]:
# Shared preprocessing applied in every experiment: type-aware imputation
# (project-local TypeFeatImputer), standardization, and removal of
# zero-variance features.
basePipeline = Pipeline([
        ("Imputer", TypeFeatImputer(catCols, reducedCols)),
        ("Scaler", StandardScaler()),
        ("Variance", VarianceThreshold(threshold=0.0))
    ])

In [32]:
pipeline = []

# Build one candidate configuration per (feature selection, sampling,
# classifier, metric) combination; each entry pairs an imblearn Pipeline
# with the GridSearchCV parameter grid for its steps.
for fs_method in fs_methods:
    for sm_type in sm_types:
        for cls_method in cls_methods:
            for lm in lms:
                # Skip RF-driven RFE feeding an RF classifier (redundant pairing)
                if not (fs_method == "rfe_rf_fs" and cls_method == "rf"):
                    params = {}
                    # Fresh copy of the shared preprocessing steps
                    pipe = Pipeline(list(basePipeline.steps))

                    # --- Feature selection (optional) ---
                    if fs_method == "combine_fs":
                        pipe.steps.insert(1,(fs_method, UnivCombineFilter(catCols,np.array(reducedCols))))
                        params.update({fs_method + '__percentile':[5,10,20,30,40,50]})

                    # NOTE(review): RFE, SelectFromModel and MLPClassifier are
                    # not imported in this notebook; selecting "rfe_rf_fs",
                    # "lasso_fs" or "nn" would raise NameError — confirm.
                    if fs_method == "rfe_rf_fs":
                        pipe.steps.append((fs_method, RFE(estimator=RandomForestClassifier(class_weight='balanced',
                                                                                           n_estimators=100,
                                                                                           random_state=33))))
                        params.update({fs_method + '__step':[0.1]})
                        params.update({fs_method + '__n_features_to_select':[
                                                        int(len(reducedCols)*0.4), 
                                                        int(len(reducedCols)*0.6), 
                                                        int(len(reducedCols)*0.8)]})

                    if fs_method == 'lasso_fs':
                        pipe.steps.append((fs_method, SelectFromModel(
                                    LogisticRegression(n_jobs=-1, penalty="l1", dual=False, random_state=42))))
                        params.update({fs_method + '__estimator__C': [0.001,0.01,0.1,1]})

                    #Add classifiers
                    if cls_method == "knn":
                        pipe.steps.append((cls_method, KNeighborsClassifier()))
                        params.update({'knn__n_neighbors':[1,3,5,7,9,11], 'knn__weights':['uniform', 'distance']})

                    if cls_method == "logReg":
                        pipe.steps.append((cls_method, LogisticRegression(random_state=42)))
                        params.update({'logReg__C': [0.00001,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5,1,5,10,15,30]})
                        params.update({'logReg__class_weight': [None, 'balanced']})
                        params.update({'logReg__penalty': ['l1', 'l2']})

                    if cls_method == "svmRBF":
                        # probability=True is needed for roc_auc scoring
                        pipe.steps.append((cls_method, SVC(random_state=42,probability=True)))
                        params.update({'svmRBF__C': [0.01,0.1,0.5,1,5,10,30,50,100], 
                         'svmRBF__gamma' : [0.0001,0.001,0.01, 0.1,1,5]})
                        params.update({'svmRBF__class_weight': [None, 'balanced']})

                    if cls_method == "rf":
                        pipe.steps.append((cls_method, RandomForestClassifier(n_jobs=-1,random_state=42)))
                        params.update({'rf__n_estimators': [100,150,200,250,500], 
                                       'rf__criterion': ['entropy','gini'],
                                       'rf__max_depth' : [None,4,6]})
                        params.update({'rf__class_weight': [None, 'balanced']})

                    if cls_method == "nn":
                        pipe.steps.append((cls_method, MLPClassifier(
                                    activation='logistic',
                                    solver='lbfgs', 
                                    hidden_layer_sizes=(5, 2), 
                                    random_state=13)))
                        params.update({
                                'nn__alpha': [1e-5,0.00001,0.0001,0.001,0.01,0.1,1,3,5,10],
                                'nn__hidden_layer_sizes':[(30,),(50,),(70,),(100,),(150,),
                                                          (30,30),(50,50),(70,70),(100,100),
                                                          (30,30,30),(50,50,50),(70,70,70)
                                                         ]
                                      })

                    #Add sampling
                    # Rebuild as an imblearn Pipeline (required so a SMOTE step
                    # can participate): make_pipeline produces the right class
                    # with auto-generated step names, then its steps are swapped
                    # for the original named steps so the param grids above
                    # still address them by name.
                    pipe_imb = make_pipeline(*[p[1] for p in pipe.steps])
                    stps = len(pipe_imb.steps)        
                    for s in range(stps):
                        pipe_imb.steps.remove(pipe_imb.steps[0])
                    for s in range(stps):
                        pipe_imb.steps.append(pipe.steps[s])

                    # Insert SMOTE oversampling immediately before the classifier
                    if sm_type == "after":                    
                        pipe_imb.steps.insert(stps - 1, 
                                              (sm_method, SMOTE(ratio='auto', kind='regular', random_state=32)))
                        params.update({sm_method + "__k_neighbors":[3,4,5]})


                    pipeline.append([fs_method,sm_type,cls_method,lm,pipe_imb,params])

In [33]:
# Collect the pipeline definitions into a DataFrame for bookkeeping.
# NOTE(review): this rebinds `pipeline` from a list to a DataFrame, so this
# cell only works once per run of the construction cell above.
pipeline = pd.DataFrame(pipeline, columns=["fs","sm","cls","metric","pipe","pipe_params"])
pipeline.sort_values("fs", inplace=True)
print pipeline.shape
pipeline


(12, 6)
Out[33]:
fs sm cls metric pipe pipe_params
0 none none logReg roc_auc Pipeline(memory=None,\n steps=[('Imputer',... {u'logReg__class_weight': [None, u'balanced'],...
1 none none logReg recall Pipeline(memory=None,\n steps=[('Imputer',... {u'logReg__class_weight': [None, u'balanced'],...
2 none none rf roc_auc Pipeline(memory=None,\n steps=[('Imputer',... {u'rf__criterion': [u'entropy', u'gini'], u'rf...
3 none none rf recall Pipeline(memory=None,\n steps=[('Imputer',... {u'rf__criterion': [u'entropy', u'gini'], u'rf...
4 none none svmRBF roc_auc Pipeline(memory=None,\n steps=[('Imputer',... {u'svmRBF__gamma': [0.0001, 0.001, 0.01, 0.1, ...
5 none none svmRBF recall Pipeline(memory=None,\n steps=[('Imputer',... {u'svmRBF__gamma': [0.0001, 0.001, 0.01, 0.1, ...
6 none after logReg roc_auc Pipeline(memory=None,\n steps=[('Imputer',... {u'logReg__class_weight': [None, u'balanced'],...
7 none after logReg recall Pipeline(memory=None,\n steps=[('Imputer',... {u'logReg__class_weight': [None, u'balanced'],...
8 none after rf roc_auc Pipeline(memory=None,\n steps=[('Imputer',... {u'rf__criterion': [u'entropy', u'gini'], u'rf...
9 none after rf recall Pipeline(memory=None,\n steps=[('Imputer',... {u'rf__criterion': [u'entropy', u'gini'], u'rf...
10 none after svmRBF roc_auc Pipeline(memory=None,\n steps=[('Imputer',... {u'svmRBF__gamma': [0.0001, 0.001, 0.01, 0.1, ...
11 none after svmRBF recall Pipeline(memory=None,\n steps=[('Imputer',... {u'svmRBF__gamma': [0.0001, 0.001, 0.01, 0.1, ...

Run experiments


In [ ]:
print "ALL TRAIN:", X_train.shape
print "TRAIN:", "[0's:", np.sum(y_train==0), "1's:", np.sum(y_train==1), "]"
print "ALL TEST:", X_test.shape
print "TEST:", "[0's:", np.sum(y_test==0), "1's:", np.sum(y_test==1), "]"

results = []
#folder = os.path.join('resources','results', datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
#os.makedirs(folder)

for num_exp in range(pipeline.shape[0]):

    # Run experiment
    start = time.time()

    #Prepare pipe_cls      
    pipeline_cls = pipeline["pipe"].iloc[num_exp]
    pipeline_params = pipeline["pipe_params"].iloc[num_exp]
    fs = pipeline["fs"].iloc[num_exp]
    sm = pipeline["sm"].iloc[num_exp]
    cls = pipeline["cls"].iloc[num_exp]
    lm = pipeline["metric"].iloc[num_exp]

    print "\n Num experiment:", num_exp
    print "****************"
    print "FS:",fs
    print "SM:",sm
    print "CLS:",cls
    print "METRIC:",lm

    #Prepare cv
    cv_inner = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=24)
    cv_outer = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=42)

    #Fit pipeline with CV                        
    grid_pipeline = GridSearchCV(pipeline_cls, param_grid=pipeline_params, verbose=verbose, 
                                 n_jobs=-1, cv=cv_inner, scoring= lm, error_score = 0) 
    grid_pipeline.fit(X_train, y_train)

    # Compute pipeline evaluation with CVmetric
    print "\nCV INNER metric: {}".format(lm)
    print "CV INNER selected params {}".format(grid_pipeline.best_params_.values())
    print "CV INNER score: {}".format(grid_pipeline.best_score_)

    cv_f1 = cross_val_score(grid_pipeline.best_estimator_, X_train, y_train, 
                                             cv=cv_outer, scoring='f1_weighted', n_jobs=-1)

    cv_prec = cross_val_score(grid_pipeline.best_estimator_, X_train, y_train, 
                                             cv=cv_outer, scoring='precision_weighted', n_jobs=-1)

    cv_rec = cross_val_score(grid_pipeline.best_estimator_, X_train, y_train, 
                                             cv=cv_outer, scoring='recall_weighted', n_jobs=-1)

    print "\nCV OUTER f1 score: %0.3f  (+/-%0.03f)" % (np.mean(cv_f1), np.std(cv_f1))
    print "CV OUTER prec score: %0.3f  (+/-%0.03f)" % (np.mean(cv_prec), np.std(cv_prec))
    print "CV OUTER rec score: %0.3f  (+/-%0.03f)" % (np.mean(cv_rec), np.std(cv_rec))
    print "Selected params (bests from CV) {}".format(grid_pipeline.best_params_.values())

    # Computel Train score (with best CV params)
    y_pred = grid_pipeline.best_estimator_.predict(X_train)
    train_prec_scores = metrics.precision_score(y_train, y_pred, average='weighted', pos_label=None)
    train_rec_scores = metrics.recall_score(y_train, y_pred, average='weighted', pos_label=None)    
    train_f1_scores = metrics.f1_score(y_train, y_pred, average='weighted', pos_label=None)

    print "\nTR F1 score:", train_f1_scores
    print "TR Prec score:", train_prec_scores
    print "TR Rec score:", train_rec_scores

    #Compute test score
    y_pred = grid_pipeline.best_estimator_.predict(X_test)
    test_f1 = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)
    test_prec = metrics.recall_score(y_test, y_pred, average='weighted', pos_label=None)
    test_rec = metrics.precision_score(y_test, y_pred, average='weighted', pos_label=None)
    test_auc = metrics.roc_auc_score(y_test, y_pred, average='weighted')

    print "\nTest f1: %0.3f" % (test_f1)
    print "Test Precision: %0.3f" % (test_prec)
    print "Test Recall: %0.3f" % (test_rec)
    print "Test AUC: %0.3f" % (test_auc)

    print "with following performance in test:"
    print metrics.classification_report(y_test, y_pred)
    print metrics.confusion_matrix(y_test, y_pred)

    end = time.time()
    print "\nTotal time:", end - start
    results = [num_exp,
                   typeDisease,
                   typeEncounter,
                   typeHypothesis,
                   typeDataFeatures, 
                   tr_size,
                   fs,
                   sm,
                   cls,
                   lm,
                   grid_pipeline.best_params_.values(),
                   train_f1_scores,
                   train_prec_scores,
                   train_rec_scores,
                   np.mean(cv_f1), 
                   np.std(cv_f1),
                   np.mean(cv_prec), 
                   np.std(cv_prec),
                   np.mean(cv_rec), 
                   np.std(cv_rec),                    
                   test_f1,
                   test_prec,
                   test_rec,
                   test_auc,                    
                   end - start,
                   grid_pipeline.best_estimator_
                  ]

    #Save results
    df = pd.DataFrame(results, columns=
              ["exp", "typeDisease","typeEncounter","typeHypothesis","typeDataFeatures",
               "size_tr","fs","sm","cls","metric","params",
               "tr_f1","tr_prec","tr_rec",
               "cv_f1_mean","cv_f1_std","cv_prec_mean","cv_prec_std","cv_rec_mean","cv_rec_std",
               "test_f1","test_prec","test_rec","test_auc",
               "time","pipeline"])

    df.to_pickle(os.path.join("resources", "results",
                              'results_pipe_' + 
                              "test_" + str(ts_thr) + "_" +
                              "train_" + str(tr_thr) + "_" +
                              str(typeDisease) + '_' +
                              str(typeEncounter) + '_' +
                              str(typeHypothesis) + '_' +
                              str(typeDataFeatures) + '_' +
                              str(num_exp) + '.pkl'))


ALL TRAIN: (10012, 29)
TRAIN: [0's: 6018 1's: 3994 ]
ALL TEST: (21456, 29)
TEST: [0's: 12896 1's: 8560 ]

 Num experiment: 0
****************
FS: none
SM: none
CLS: logReg
METRIC: roc_auc
Fitting 5 folds for each of 56 candidates, totalling 280 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:   21.1s finished
CV INNER metric: roc_auc
CV INNER selected params ['balanced', 0.1, 'l1']
CV INNER score: 0.640156952248

CV OUTER f1 score: 0.611  (+/-0.002)
CV OUTER prec score: 0.615  (+/-0.002)
CV OUTER rec score: 0.609  (+/-0.002)
Selected params (bests from CV) ['balanced', 0.1, 'l1']

TR F1 score: 0.613508981989
TR Prec score: 0.617074862762
TR Rec score: 0.611266480224

Test f1: 0.604
Test Precision: 0.602
Test Recall: 0.608
Test AUC: 0.592
with following performance in test:
             precision    recall  f1-score   support

          0       0.68      0.64      0.66     12896
          1       0.50      0.54      0.52      8560

avg / total       0.61      0.60      0.60     21456

[[8291 4605]
 [3933 4627]]

Total time: 24.8265180588

 Num experiment: 1
****************
FS: none
SM: none
CLS: logReg
METRIC: recall
Fitting 5 folds for each of 56 candidates, totalling 280 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:   21.1s finished
CV INNER metric: recall
CV INNER selected params ['balanced', 1e-05, 'l2']
CV INNER score: 0.590150250417

CV OUTER f1 score: 0.585  (+/-0.005)
CV OUTER prec score: 0.599  (+/-0.004)
CV OUTER rec score: 0.581  (+/-0.005)
Selected params (bests from CV) ['balanced', 1e-05, 'l2']

TR F1 score: 0.589388469718
TR Prec score: 0.601976951578
TR Rec score: 0.585197762685

Test f1: 0.583
Test Precision: 0.579
Test Recall: 0.595
Test AUC: 0.578
with following performance in test:
             precision    recall  f1-score   support

          0       0.67      0.58      0.62     12896
          1       0.48      0.57      0.52      8560

avg / total       0.60      0.58      0.58     21456

[[7514 5382]
 [3645 4915]]

Total time: 24.6503970623

 Num experiment: 2
****************
FS: none
SM: none
CLS: rf
METRIC: roc_auc
Fitting 5 folds for each of 60 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.6min finished
CV INNER metric: roc_auc
CV INNER selected params ['entropy', 6, None, 500]
CV INNER score: 0.644915436765

CV OUTER f1 score: 0.559  (+/-0.005)
CV OUTER prec score: 0.635  (+/-0.010)
CV OUTER rec score: 0.631  (+/-0.004)
Selected params (bests from CV) ['entropy', 6, None, 500]

TR F1 score: 0.589288279436
TR Prec score: 0.68434205514
TR Rec score: 0.654514582501

Test f1: 0.557
Test Precision: 0.629
Test Recall: 0.629
Test AUC: 0.551
with following performance in test:
             precision    recall  f1-score   support

          0       0.63      0.94      0.75     12896
          1       0.63      0.17      0.26      8560

avg / total       0.63      0.63      0.56     21456

[[12058   838]
 [ 7130  1430]]

Total time: 112.443551064

 Num experiment: 3
****************
FS: none
SM: none
CLS: rf
METRIC: recall
Fitting 5 folds for each of 60 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.6min finished
CV INNER metric: recall
CV INNER selected params ['gini', 6, 'balanced', 150]
CV INNER score: 0.510350584307

CV OUTER f1 score: 0.619  (+/-0.005)
CV OUTER prec score: 0.619  (+/-0.005)
CV OUTER rec score: 0.619  (+/-0.006)
Selected params (bests from CV) ['gini', 6, 'balanced', 150]

TR F1 score: 0.649100513047
TR Prec score: 0.648640283421
TR Rec score: 0.649620455453

Test f1: 0.613
Test Precision: 0.614
Test Recall: 0.613
Test AUC: 0.595
with following performance in test:
             precision    recall  f1-score   support

          0       0.68      0.69      0.68     12896
          1       0.52      0.50      0.51      8560

avg / total       0.61      0.61      0.61     21456

[[8891 4005]
 [4267 4293]]

Total time: 103.269453049

 Num experiment: 4
****************
FS: none
SM: none
CLS: svmRBF
METRIC: roc_auc
Fitting 5 folds for each of 108 candidates, totalling 540 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 27.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 36.4min finished
CV INNER metric: roc_auc
CV INNER selected params [0.001, 'balanced', 50]
CV INNER score: 0.644158869434

CV OUTER f1 score: 0.617  (+/-0.005)
CV OUTER prec score: 0.617  (+/-0.005)
CV OUTER rec score: 0.618  (+/-0.006)
Selected params (bests from CV) [0.001, 'balanced', 50]

TR F1 score: 0.638062162305
TR Prec score: 0.637273859223
TR Rec score: 0.639033160208

Test f1: 0.617
Test Precision: 0.619
Test Recall: 0.616
Test AUC: 0.598
with following performance in test:
             precision    recall  f1-score   support

          0       0.68      0.70      0.69     12896
          1       0.52      0.50      0.51      8560

avg / total       0.62      0.62      0.62     21456

[[9018 3878]
 [4303 4257]]

Total time: 2360.15692616

 Num experiment: 5
****************
FS: none
SM: none
CLS: svmRBF
METRIC: recall
Fitting 5 folds for each of 108 candidates, totalling 540 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 27.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 36.3min finished
CV INNER metric: recall
CV INNER selected params [0.0001, 'balanced', 0.01]
CV INNER score: 1.0

CV OUTER f1 score: 0.227  (+/-0.000)
CV OUTER prec score: 0.159  (+/-0.000)
CV OUTER rec score: 0.399  (+/-0.000)
Selected params (bests from CV) [0.0001, 'balanced', 0.01]

TR F1 score: 0.22751558618
TR Prec score: 0.159138199163
TR Rec score: 0.398921294447

Test f1: 0.228
Test Precision: 0.399
Test Recall: 0.159
Test AUC: 0.500
with following performance in test:
             precision    recall  f1-score   support

          0       0.00      0.00      0.00     12896
          1       0.40      1.00      0.57      8560

avg / total       0.16      0.40      0.23     21456

[[    0 12896]
 [    0  8560]]

Total time: 2360.26619506

 Num experiment: 6
****************
FS: none
SM: after
CLS: logReg
METRIC: roc_auc
Fitting 5 folds for each of 168 candidates, totalling 840 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   57.3s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 840 out of 840 | elapsed:  1.9min finished
CV INNER metric: roc_auc
CV INNER selected params [None, 0.5, 4, 'l1']
CV INNER score: 0.638445951817

CV OUTER f1 score: 0.603  (+/-0.003)
CV OUTER prec score: 0.610  (+/-0.003)
CV OUTER rec score: 0.599  (+/-0.004)
Selected params (bests from CV) [None, 0.5, 4, 'l1']

TR F1 score: 0.608264569528
TR Prec score: 0.614794413084
TR Rec score: 0.605073911306

Test f1: 0.599
Test Precision: 0.596
Test Recall: 0.606
Test AUC: 0.590
with following performance in test:
             precision    recall  f1-score   support

          0       0.68      0.62      0.65     12896
          1       0.49      0.56      0.52      8560

avg / total       0.61      0.60      0.60     21456

[[7998 4898]
 [3772 4788]]

Total time: 122.833711147

 Num experiment: 7
****************
FS: none
SM: after
CLS: logReg
METRIC: recall
Fitting 5 folds for each of 168 candidates, totalling 840 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 840 out of 840 | elapsed:  1.9min finished
CV INNER metric: recall
CV INNER selected params [None, 1e-05, 5, 'l2']
CV INNER score: 0.599499165275

CV OUTER f1 score: 0.584  (+/-0.006)
CV OUTER prec score: 0.599  (+/-0.005)
CV OUTER rec score: 0.579  (+/-0.006)
Selected params (bests from CV) [None, 1e-05, 5, 'l2']

TR F1 score: 0.587479901832
TR Prec score: 0.601661411259
TR Rec score: 0.583200159808

Test f1: 0.581
Test Precision: 0.577
Test Recall: 0.595
Test AUC: 0.578
with following performance in test:
             precision    recall  f1-score   support

          0       0.67      0.57      0.62     12896
          1       0.48      0.58      0.52      8560

avg / total       0.59      0.58      0.58     21456

[[7389 5507]
 [3574 4986]]

Total time: 122.540338993

 Num experiment: 8
****************
FS: none
SM: after
CLS: rf
METRIC: roc_auc
Fitting 5 folds for each of 180 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  6.5min finished
CV INNER metric: roc_auc
CV INNER selected params ['gini', 6, None, 500, 5]
CV INNER score: 0.641633481051

CV OUTER f1 score: 0.625  (+/-0.007)
CV OUTER prec score: 0.623  (+/-0.007)
CV OUTER rec score: 0.630  (+/-0.006)
Selected params (bests from CV) ['gini', 6, None, 500, 5]

TR F1 score: 0.643828759299
TR Prec score: 0.642728739473
TR Rec score: 0.649920095885

Test f1: 0.615
Test Precision: 0.622
Test Recall: 0.614
Test AUC: 0.593
with following performance in test:
             precision    recall  f1-score   support

          0       0.67      0.74      0.70     12896
          1       0.53      0.45      0.49      8560

avg / total       0.61      0.62      0.62     21456

[[9516 3380]
 [4726 3834]]

Total time: 408.573647022

 Num experiment: 9
****************
FS: none
SM: after
CLS: rf
METRIC: recall
Fitting 5 folds for each of 180 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  6.5min finished
CV INNER metric: recall
CV INNER selected params ['gini', 4, None, 150, 4]
CV INNER score: 0.480801335559

CV OUTER f1 score: 0.622  (+/-0.007)
CV OUTER prec score: 0.620  (+/-0.007)
CV OUTER rec score: 0.625  (+/-0.007)
Selected params (bests from CV) ['gini', 4, None, 150, 4]

TR F1 score: 0.628260753208
TR Prec score: 0.626500003519
TR Rec score: 0.63204155014

Test f1: 0.612
Test Precision: 0.616
Test Recall: 0.610
Test AUC: 0.591
with following performance in test:
             precision    recall  f1-score   support

          0       0.67      0.72      0.69     12896
          1       0.52      0.46      0.49      8560

avg / total       0.61      0.62      0.61     21456

[[9259 3637]
 [4594 3966]]

Total time: 398.770281076

 Num experiment: 10
****************
FS: none
SM: after
CLS: svmRBF
METRIC: roc_auc
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 18.6min

In [ ]:
# Build the summary DataFrame from the rows collected in `results` during the
# experiment loop above.
# NOTE(review): this expects `results` to hold one row (list) per experiment;
# the loop above as originally written rebound `results` to a single flat row
# each iteration — confirm it appends before relying on this summary.
df = pd.DataFrame(results, columns=
                  ["exp",
                   "typeDisease","typeEncounter","typeHypothesis","typeDataFeatures",
                   "size_tr","fs","sm","cls","metric","params",
                   "tr_f1","tr_prec","tr_rec",
                   "cv_f1_mean","cv_f1_std","cv_prec_mean","cv_prec_std","cv_rec_mean","cv_rec_std",
                   "test_f1","test_prec","test_rec","test_auc",
                   "time","pipeline"])

In [ ]:
df.sort_values("cv_f1_mean", ascending=False).head(10).iloc[:,:-2]

In [ ]: