In [172]:
%reset -f

In [173]:
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

from operator import truediv
from datetime import datetime
import pandas as pd
import time
import os

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt


np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')

%matplotlib inline

import sys
sys.path.insert(1, "../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
import MLpipeline as MLpipeline

In [174]:
#Local methods

def load_data(typeEncounter, typeDiagnosis, typeDataFeatures):

    if typeDataFeatures == "non_extended":
        df_all=pd.read_pickle(os.path.join('resources','prepared_clean_data_' + typeEncounter + "_" +  typeDiagnosis + '.pkl'))
    else:
        df_all=pd.read_pickle(os.path.join('resources','prepared_clean_data_' + typeEncounter + "_" +  typeDiagnosis + '_' + typeDataFeatures + '.pkl'))


    return df_all

def get_columns(df_all, typeDiagnosis):

    colsDiseases = []
    if typeDiagnosis == "diag_1":
        colsDiseases = [u'Diabetis_1', u'Circulatory_1', u'Digestive_1', u'Genitourinary_1', u'Poisoning_1', u'Muscoskeletal_1',
               u'Neoplasms_1', u'Respiratory_1']

    if typeDiagnosis == "diag_3":
        colsDiseases = [u'Diabetis_3', u'Circulatory_3', u'Digestive_3', u'Genitourinary_3', u'Poisoning_3', u'Muscoskeletal_3',
               u'Neoplasms_3', u'Respiratory_3']
    
    colsNonDiseases = [c for c in df_all.columns if c not in colsDiseases]
    
    return colsDiseases, colsNonDiseases

def filter_data_by_class(df_all, typeHypothesis):
    
    # Readmitted none vs readmitted
    if typeHypothesis == "all_readmisssion_vs_none":
        df_all["readmitted"][df_all["readmitted"].values > 0] = 1

    # Readmitted none vs early readmitted            
    if typeHypothesis == "early_readmission_vs_none":
        df_all= df_all[df_all["readmitted"].isin([0,1])]
        
    return df_all

def compute_type_features(df_all, typeDataFeatures):

    numCols = ['time_in_hospital','num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 
                'number_emergency', 'number_inpatient', 'number_diagnoses',
                'add_in_out', 'add_procs_meds', 'div_visits_time', 'div_em_time', 'div_visit_med', 'div_em_med',
                'number_treatment','number_treatment_0','number_treatment_1','number_treatment_2','number_treatment_3']

    catCols = []
    cols = df_all.columns
    reducedCols = cols[:-1]

    for i in range(len(cols)-1):
        if cols[i] not in numCols:
            catCols.append(1)
        else:
            catCols.append(0)
    catCols = np.array(catCols)
    
    return catCols, reducedCols

def get_diseases(colsDiseases, typeDisease):
    if typeDisease == "subset":
        return ["subset"]
    else:
        if typeDisease in colsDiseases:
            return [typeDisease]
        else:
            return colsDiseases

def filter_data_by_diseases(df_all, disease, typeDataExperiment, colsNonDiseases):
    if disease == "subset":
        df_all_filtered = df_all.copy()
    else:
        cols_filtered = colsNonDiseases[:]
        cols_filtered.insert(-1, disease)
        df_all_filtered = df_all[cols_filtered].copy()    
    
    if typeDataExperiment == "disease" and disease != "subset":
        df_all_filtered = df_all_filtered[df_all_filtered[disease] == 1]
        df_all_filtered = df_all_filtered[[c for c in df_all_filtered.columns if c != disease]]
    
    return df_all_filtered

In [197]:
#Generic methods
def train_test_partition(df_all, ts_thr=0.30):
    y = df_all.readmitted
    y = y.values

    X = df_all.iloc[:,:-1].values
    sss = StratifiedShuffleSplit(y, 1, test_size=ts_thr, random_state=32) #random_state=42
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
    return X_train, X_test, y_train, y_test

def train_partition(X_train, y_train, tr_thr=0.10):
    X_train_aux = []
    y_train_aux = []
    if tr_thr >= 1.0:
            X_train_aux = X_train
            y_train_aux = y_train
    else:
        sss = StratifiedShuffleSplit(y_train, 1, test_size=1-tr_thr, random_state=32) #random_state=42
        for train_index, test_index in sss:
            X_train_aux = X_train[train_index]
            y_train_aux = y_train[train_index]

    return X_train_aux, y_train_aux

def create_pipelines(catCols,reducedCols, hyperparams, fs_methods, sm_method, sm_types, cls_methods, lms):
    
    basePipeline = Pipeline([
            ("Imputer", TypeFeatImputer(catCols, reducedCols)),
            ("Scaler", StandardScaler()),
            ("Variance", VarianceThreshold(threshold=0.0))
        ])

    pipeline = [] 

    for fs_method in fs_methods:
        for sm_type in sm_types:
            for cls_method in cls_methods:
                for lm in lms:
                    if not (fs_method == "rfe_rf_fs" and cls_method == "rf") and not(fs_method == "lasso_fs" and cls_method == "logReg"):
                        params = {}   
                        pipe = Pipeline(list(basePipeline.steps))

                        if fs_method == "combine_fs":
                            pipe.steps.insert(1,(fs_method, UnivCombineFilter(catCols,np.array(reducedCols))))
                            pm = hyperparams[hyperparams[:,1] == fs_method,2][0]
                            params.update(pm)                            


                        if fs_method == "rfe_rf_fs":
                            pipe.steps.append((fs_method, RFE(estimator=RandomForestClassifier(class_weight='balanced',
                                                                                               n_estimators=100,
                                                                                               random_state=33))))
                            pm = hyperparams[hyperparams[:,1] == fs_method,2][0]
                            params.update(pm) 
                            
                        if fs_method == 'lasso_fs':
                            pipe.steps.append((fs_method, SelectFromModel(
                                        LogisticRegression(n_jobs=-1, penalty="l1", dual=False, random_state=42))))
                            pm = hyperparams[hyperparams[:,1] == fs_method,2][0]
                            params.update(pm) 
                            
                        #Add classifiers
                        if cls_method == "knn":
                            pipe.steps.append((cls_method, KNeighborsClassifier()))
                            pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
                            params.update(pm) 
                            
                        if cls_method == "logReg":
                            pipe.steps.append((cls_method, LogisticRegression(random_state=42)))
                            pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
                            params.update(pm) 
                            
                        if cls_method == "svmRBF":
                            pipe.steps.append((cls_method, SVC(random_state=42,probability=True)))
                            pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
                            params.update(pm) 

                        if cls_method == "rf":
                            pipe.steps.append((cls_method, RandomForestClassifier(n_jobs=-1,class_weight='balanced',random_state=42)))
                            pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
                            params.update(pm) 
                            
                        if cls_method == "gbt":
                            pipe.steps.append((cls_method, GradientBoostingClassifier(random_state=42,subsample=0.1,loss="deviance")))
                            pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
                            params.update(pm) 

                        if cls_method == "nb":
                            pipe.steps.append((cls_method, GaussianNB()))
                            params.update({}) 
                            
                        if cls_method == "nn":
                            pipe.steps.append((cls_method, MLPClassifier(
                                        activation='logistic',
                                        solver='lbfgs', 
                                        hidden_layer_sizes=(5, 2), 
                                        random_state=13)))
                            pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
                            params.update(pm) 
                            

                        #Add sampling
                        pipe_imb = make_pipeline(*[p[1] for p in pipe.steps])
                        stps = len(pipe_imb.steps)        
                        for s in range(stps):
                            pipe_imb.steps.remove(pipe_imb.steps[0])
                        for s in range(stps):
                            pipe_imb.steps.append(pipe.steps[s])

                        if sm_type == "after":                    
                            pipe_imb.steps.insert(stps - 1, 
                                                  (sm_method, SMOTE(ratio='auto', kind='regular', random_state=32)))
                            pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
                            params.update(pm) 

                        pipeline.append([fs_method,sm_type,cls_method,lm,pipe_imb,params])

    pipelines = pd.DataFrame(pipeline, columns=["fs","sm","cls","metric","pipe","pipe_params"])
    pipelines.sort_values("fs", inplace=True)

    return pipelines

def precision_0(ground_truth, predictions):
    prec = metrics.precision_score(ground_truth, predictions,pos_label=0)
    return prec

def recall_0(ground_truth, predictions):
    rec = metrics.recall_score(ground_truth, predictions,pos_label=0)
    return rec

def specificity(ground_truth, predictions):
    cm_train = metrics.confusion_matrix(ground_truth, predictions)
    tn = cm_train[0,0]
    fp = cm_train[0,1]
    fn = cm_train[1,0]
    tp = cm_train[1,1]
    train_spec = tn / float(tn+fp)
    return train_spec

# One Experiment One file 
def run(name,df_all, catCols, reducedCols, hyperparams, 
        ts_thr, tr_thrs, fs_methods, sm_method, sm_types, cls_methods, lms, cv_folds, cv_thr, 
        verbose=True, save=False):

    results = []
    for tr_thr in tr_thrs:

            X_train, X_test, y_train, y_test = train_test_partition(df_all, ts_thr)
            X_train, y_train = train_partition(X_train, y_train, tr_thr)
            
            pipeline = create_pipelines(catCols, reducedCols, hyperparams, fs_methods, sm_method, sm_types, cls_methods, lms)

            print "\nDataSet:"
            print "**********"
            print "**********"
            print "SIZE:", tr_thr
            print "NAME:", name

            print df_all.shape
            print "ALL TRAIN:", X_train.shape
            print "TRAIN:", "[0's:", np.sum(y_train==0), "1's:", np.sum(y_train==1), "]"
            print "ALL TEST:", X_test.shape
            print "TEST:", "[0's:", np.sum(y_test==0), "1's:", np.sum(y_test==1), "]"

            for num_exp in range(pipeline.shape[0]):

                # Run experiment
                start = time.time()

                #Prepare pipe_cls      
                pipeline_cls = pipeline["pipe"].iloc[num_exp]
                pipeline_params = pipeline["pipe_params"].iloc[num_exp]
                fs = pipeline["fs"].iloc[num_exp]
                sm = pipeline["sm"].iloc[num_exp]
                cls = pipeline["cls"].iloc[num_exp]
                lm = pipeline["metric"].iloc[num_exp]

                print "\nNum experiment:", str(num_exp), "/", str(pipeline.shape[0] - 1)
                print "****************"

                print "FS:",fs
                print "SM:",sm
                print "CLS:",cls
                print "METRIC:",lm

                #Prepare cv
                cv_inner = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=24)
                cv_outer = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=42)

                #Fit pipeline with CV                        
                grid_pipeline = GridSearchCV(pipeline_cls, param_grid=pipeline_params, verbose=verbose, 
                                             n_jobs=-1, cv=cv_inner, scoring= lm, error_score = 0) 
                grid_pipeline.fit(X_train, y_train)
                
                
                # Compute Train scores (with best CV params)
                y_pred = grid_pipeline.best_estimator_.predict(X_train)  
                train_f1_w = metrics.f1_score(y_train, y_pred, average='weighted', pos_label=None)
                train_p, train_r, train_f1, train_s = metrics.precision_recall_fscore_support(y_train, y_pred,labels=None,average=None, sample_weight=None)
                fpr, tpr, _ = metrics.roc_curve(y_train, y_pred)
                train_auc = metrics.auc(fpr, tpr)                
                cm_train = metrics.confusion_matrix(y_train, y_pred)
                tn = cm_train[0,0]
                fp = cm_train[0,1]
                fn = cm_train[1,0]
                tp = cm_train[1,1]
                train_sens = train_r[1]
                train_spec = tn / float(tn+fp)                
                print "\nTRAIN f1 (weighted): %0.3f" % (train_f1_w)
                print "TRAIN Precision [c=0,1]:", train_p
                print "TRAIN Recall [c=0,1]:", train_r
                print "TRAIN AUC: %0.3f" % (train_auc)                 
                print "TRAIN Sensibility:", train_sens
                print "TRAIN Specificity: ", train_spec
                
                # Compute evaluation scores
                print "\nCV INNER metric: {}".format(lm)
                print "CV INNER selected params {}".format(grid_pipeline.best_params_.values())
                print "CV INNER score: {}".format(grid_pipeline.best_score_)

                scorings = {'roc_auc': 'roc_auc',
                            'f1_weighted':'f1_weighted',
                            'f1':'f1',
                            'precision_1':'precision',
                            'recall_1':'recall',
                            'precision_0' : metrics.make_scorer(precision_0),
                            'recall_0' : metrics.make_scorer(recall_0),
                            'spec': metrics.make_scorer(specificity)
                           } 
                
                cv_scores = cross_validate(grid_pipeline.best_estimator_, X_train, y_train, 
                                           cv=cv_outer, scoring=scorings, n_jobs=-1, 
                                           return_train_score = False)

                cv_f1_w_mean = np.mean(cv_scores["test_f1_weighted"])
                cv_f1_w_std = np.std(cv_scores["test_f1_weighted"])
                cv_p1_mean = np.mean(cv_scores["test_precision_1"])
                cv_p1_std = np.std(cv_scores["test_precision_1"])
                cv_r1_mean = np.mean(cv_scores["test_recall_1"])
                cv_r1_std = np.std(cv_scores["test_recall_1"])                
                cv_p0_mean = np.mean(cv_scores["test_precision_0"])
                cv_p0_std = np.std(cv_scores["test_precision_0"])
                cv_r0_mean = np.mean(cv_scores["test_recall_0"])
                cv_r0_std = np.std(cv_scores["test_recall_0"])
                
                cv_auc_mean = np.mean(cv_scores["test_roc_auc"])
                cv_auc_std = np.std(cv_scores["test_roc_auc"])                
                cv_spec_mean = np.mean(cv_scores["test_spec"])
                cv_spec_std = np.std(cv_scores["test_spec"])
                cv_sens_mean = cv_r1_mean
                cv_sens_std = cv_r1_std
                
                print "\nCV OUTER f1-weighted score: %0.3f  (+/-%0.03f)" % (cv_f1_w_mean,cv_f1_w_std)               
                print "CV OUTER prec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f}  (+/- {:.3f})".format(cv_p0_mean,cv_p0_std,cv_p1_mean,cv_p1_std)                
                print "CV OUTER rec  score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f}  (+/- {:.3f})".format(cv_r0_mean,cv_r0_std,cv_r1_mean,cv_r1_std)
                print "CV OUTER AUC score: %0.3f  (+/-%0.03f)" % (cv_auc_mean,cv_auc_std) 
                print "CV OUTER Sensibility score: %0.3f  (+/-%0.03f)" % (cv_sens_mean,cv_sens_std) 
                print "CV OUTER Specificity score: %0.3f  (+/-%0.03f)" % (cv_spec_mean,cv_spec_std)
                print "Selected params (bests from CV) {}".format(grid_pipeline.best_params_.values())
               

                #Compute test scores
                y_pred = grid_pipeline.best_estimator_.predict(X_test)
                test_f1_w = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)
                test_p, test_r, test_f1, test_s = metrics.precision_recall_fscore_support(y_test, y_pred,labels=None,average=None, sample_weight=None)
                fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
                test_auc = metrics.auc(fpr, tpr)                
                cm_test = metrics.confusion_matrix(y_test, y_pred)
                tn = cm_test[0,0]
                fp = cm_test[0,1]
                fn = cm_test[1,0]
                tp = cm_test[1,1]
                test_sens = test_r[1]
                test_spec = tn / float(tn+fp)
                
                print "\nTEST f1 (weighted): %0.3f" % (test_f1_w)
                print "TEST Precision [c=0,1]:", test_p
                print "TEST Recall [c=0,1]:", test_r                
                print "TEST AUC: %0.3f" % (test_auc)                
                print "TEST Sensibility:", test_sens
                print "TEST Specificity:", test_spec
                print "Confussion matrix:"
                print "         | PRED"
                print "REAL-->  v "
                print cm_test

                end = time.time()
                print "\nTotal time:", end - start
                
                res = [num_exp,
                       name,
                       tr_thr,
                       fs,
                       sm,
                       cls,
                       lm,
                       grid_pipeline.best_params_.values(),
                       train_sens,
                       train_spec,
                       train_auc,
                       train_r,
                       train_p,
                       train_f1_w,
                       cv_sens_mean,
                       cv_sens_std,
                       cv_spec_mean,
                       cv_spec_std,
                       cv_auc_mean,
                       cv_auc_std,
                       [cv_p0_mean,cv_p1_mean],
                       [cv_p0_std,cv_p1_std],
                       [cv_r0_mean,cv_r1_mean],
                       [cv_r1_std,cv_r0_std],
                       cv_f1_w_mean,
                       cv_f1_w_std,
                       test_sens,
                       test_spec,
                       test_auc,
                       test_r,
                       test_p,
                       test_f1_w,
                       cm_test,              
                       end - start,
                       grid_pipeline.best_estimator_
                      ]
                results.append(res)

                #Save results
                if save:
                    df = pd.DataFrame(np.array(res).reshape(1,35), columns=
                          ["exp", "name",
                           "size_tr","fs","sm","cls","metric","params",
                           "tr_sens","tr_spec","tr_auc",
                           "tr_prec","tr_rec","tr_f1",
                           "cv_sens_mean","cv_sens_std","cv_spec_mean","cv_spec_std","cv_auc_mean","cv_auc_std",
                           "cv_prec_mean","cv_prec_std","cv_rec_mean","cv_rec_std",
                           "cv_f1_mean","cv_f1_std",
                           "test_sens","test_spec","test_auc",
                           "test_rec","test_prec","test_f1",
                           "cm_test",
                           "time","pipeline"])

                    df.to_pickle(os.path.join("resources", "results",
                                          'results_pipe_' + 
                                          "test_" + str(ts_thr) + "_" +
                                          "train_" + str(tr_thr) + "_" +
                                          str(name) + '_' +
                                          str(fs) + '_' +
                                          str(sm) + '_' +
                                          str(lm) + '_' +
                                          str(cls) + '_' +
                                          time.strftime("%Y%m%d-%H%M%S") +
                                          '.pkl'))
    return results

Run experimentation


In [198]:
typeEncounter = "last" # ['first','last']
typeHypothesis = "early_readmission_vs_none" # ['all_readmisssion_vs_none','early_readmission_vs_none']
typeDataFeatures = "extended_extra" # ["non_extended","extended','extended_extra']
    #Extended -> Subset of columns
    #Minimum -> minimum set of columns 
typeDiagnosis = "none"  #["none","diag_1", "diag_3"]    
typeDisease = "subset" # ["subset","any",["Respiratory",...]]
    #subset -> Return subset of predefined disease features
    #any -> Return all disease features    
    #disease -> Return diseases feature
typeDataExperiment = "disease" #["all", "disease"] 
    #all -> Include all diagnosis as columns
    #disease -> Remove diagnosis as column and keep only rows with diagnosis == 1

In [199]:
verbose = True
cv_thr = 0.3
cv_folds = 5

tr_thrs = [0.01] # [0.1,0.2,0.4,0.6,1.0]
ts_thr = 0.30

In [200]:
fs_methods = ["none",] #["none","combine_fs","lasso_fs","rfe_rf_fs"]
cls_methods = ["logReg"] #["rf","svmRBF","logReg","knn","nn","gbt"]
lms = ["recall"] #["f1_weighted","precision_weighted","roc_auc","recall"]
sm_types = ["after"] #["none","after"]
sm_method = "sm_smote"

In [201]:
#Load default params
hyperparams = np.load("../src/default_hyperparams.npy")
print hyperparams


[['fs' 'combine_fs' {'combine_fs__percentile': [5, 10, 20, 30, 40, 50]}]
 ['fs' 'rfe_rf_fs'
  {'rfe_rf_fs__n_features_to_select': [5, 10, 15, 20], 'rfe_rf_fs__step': [0.1]}]
 ['fs' 'lasso_fs' {'lasso_fs__estimator__C': [0.001, 0.01, 0.1, 1]}]
 ['cls' 'knn'
  {'knn__weights': ['uniform', 'distance'], 'knn__n_neighbors': [1, 3, 5, 7, 9, 11]}]
 ['cls' 'logReg'
  {'logReg__class_weight': [None, 'balanced'], 'logReg__C': [1e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 15, 30], 'logReg__penalty': ['l1', 'l2']}]
 ['cls' 'svmRBF'
  {'svmRBF__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5], 'svmRBF__class_weight': [None, 'balanced'], 'svmRBF__C': [0.01, 0.1, 0.5, 1, 5, 10, 30, 50, 100]}]
 ['cls' 'rf'
  {'rf__criterion': ['entropy', 'gini'], 'rf__max_depth': [None, 4, 8, 12], 'rf__n_estimators': [200, 250, 300, 350, 400, 500]}]
 ['cls' 'nn'
  {'nn__hidden_layer_sizes': [(30,), (50,), (70,), (100,), (150,), (30, 30), (50, 50), (70, 70), (100, 100), (30, 30, 30), (50, 50, 50), (70, 70, 70)], 'nn__alpha': [1e-05, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 3, 5, 10]}]
 ['cls' 'gbt'
  {'gbt__max_depth': [None, 8, 10, 12], 'gbt__learning_rate': [0.1, 0.01, 0.001], 'gbt__n_estimators': [300, 400, 500]}]
 ['after' 'sm_smote' {'sm_smote__k_neighbors': [3, 4, 5]}]]

In [202]:
#Load data
df_all = load_data(typeEncounter, typeDiagnosis, typeDataFeatures)
print "\nSHAPE:"
print df_all.shape
print "\nInitial columns:"
print df_all.columns

#Filter data by class
df_all = filter_data_by_class(df_all, typeHypothesis)
print "\nRows by class type:"
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)
    
#Get columns
colsDiseases, colsNonDiseases = MLpipeline.get_columns(df_all,typeDiagnosis)
print "\nDiseases:", colsDiseases
print "\nNon-diseases:", colsNonDiseases
    
#Load diseases
diseases = get_diseases(colsDiseases, typeDisease)
print "\nTotal data:", df_all.shape
print diseases


SHAPE:
(67182, 61)

Initial columns:
Index([u'gender', u'age', u'race_AfricanAmerican', u'race_Caucasian',
       u'race_Other', u'HbA1c', u'Change', u'time_in_hospital', u'diabetesMed',
       u'diss_home', u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
       u'medSpec_InternalMedicine', u'medSpec_surgery', u'adm_src_1',
       u'adm_src_2', u'adm_src_3', u'adm_src_4', u'adm_src_5', u'adm_src_6',
       u'adm_src_7', u'adm_src_8', u'adm_src_10', u'adm_src_11', u'adm_src_13',
       u'adm_src_14', u'adm_src_22', u'adm_src_25', u'adm_1', u'adm_2',
       u'adm_3', u'adm_4', u'adm_7', u'number_treatment',
       u'num_lab_procedures', u'num_procedures', u'num_medications',
       u'number_outpatient', u'number_emergency', u'number_inpatient',
       u'number_diagnoses', u'insulin', u'metformin', u'pioglitazone',
       u'glimepiride', u'glipizide', u'repaglinide', u'nateglinide',
       u'ComplexHbA1c', u'add_in_out', u'add_procs_meds', u'div_visits_time',
       u'div_em_time', u'div_visit_med', u'div_em_med', u'sum_ch_med',
       u'number_treatment_0', u'number_treatment_1', u'number_treatment_2',
       u'number_treatment_3', u'readmitted'],
      dtype='object')

Rows by class type:
[0 1] 39785 5994

Diseases: []

Non-diseases: ['gender', 'age', 'race_AfricanAmerican', 'race_Caucasian', 'race_Other', 'HbA1c', 'Change', 'time_in_hospital', 'diabetesMed', 'diss_home', 'medSpec_cardio', 'medSpec_Family/GeneralPractice', 'medSpec_InternalMedicine', 'medSpec_surgery', 'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8', 'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25', 'adm_1', 'adm_2', 'adm_3', 'adm_4', 'adm_7', 'number_treatment', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'insulin', 'metformin', 'pioglitazone', 'glimepiride', 'glipizide', 'repaglinide', 'nateglinide', 'ComplexHbA1c', 'add_in_out', 'add_procs_meds', 'div_visits_time', 'div_em_time', 'div_visit_med', 'div_em_med', 'sum_ch_med', 'number_treatment_0', 'number_treatment_1', 'number_treatment_2', 'number_treatment_3', 'readmitted']

Total data: (45779, 61)
['subset']

In [203]:
from IPython.display import display, HTML
for disease in diseases:
    
    df_all_filtered = filter_data_by_diseases(df_all, disease, typeDataExperiment, colsNonDiseases)
    catCols, reducedCols = compute_type_features(df_all_filtered, typeDataFeatures)
    
    #Apply hyperparams changes
    hyperparams = np.load("../src/default_hyperparams.npy")
    hyperparams[hyperparams[:,1] == 'rfe_rf_fs',2] =  [{'rfe_rf_fs__n_features_to_select': [int(len(reducedCols) * 0.2),
                                                                                         int(len(reducedCols) * 0.4),
                                                                                         int(len(reducedCols) * 0.6)], 
                                                                                         'rfe_rf_fs__step': [0.1]}]    

    p = create_pipelines(catCols,reducedCols, hyperparams, fs_methods, sm_method, sm_types, cls_methods, lms)    

    name = disease + "_" + typeDataFeatures + "_" +  typeDataExperiment + "_" + typeEncounter + "_" + \
           typeHypothesis + "_" + typeDiagnosis
        
    res = run(name, df_all_filtered, catCols, reducedCols, hyperparams, ts_thr, tr_thrs, 
                   fs_methods, sm_method, sm_types, 
                   cls_methods, lms, cv_folds, cv_thr, True, False)


DataSet:
**********
**********
SIZE: 0.01
NAME: subset_extended_extra_disease_last_early_readmission_vs_none_none
(45779, 61)
ALL TRAIN: (320, 60)
TRAIN: [0's: 278 1's: 42 ]
ALL TEST: (13734, 60)
TEST: [0's: 11936 1's: 1798 ]

Num experiment: 0 / 0
****************
FS: none
SM: after
CLS: logReg
METRIC: recall
Fitting 5 folds for each of 56 candidates, totalling 280 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:    6.3s finished
Train f1 (weighted): 0.686
Train Precision [c=0,1]: [ 0.92934783  0.21323529]
Train Recall [c=0,1]: [ 0.61510791  0.69047619]
Train AUC: 0.653
Train Sensibility: 0.690476190476
Train Specificity:  0.615107913669

CV INNER metric: recall
CV INNER selected params [None, 0.05, 'l1']
CV INNER score: 0.646153846154

CV OUTER f1-weighted score: 0.593  (+/-0.033)
CV OUTER prec score [c=0,1]: 0.887 (+/- 0.012), 0.158  (+/- 0.016)
CV OUTER rec  score [c=0,1]: 0.511 (+/- 0.041), 0.585  (+/- 0.038)
CV OUTER AUC score: 0.581  (+/-0.049)
CV OUTER Sensibility score: 0.585  (+/-0.038)
CV OUTER Specificity score: 0.511  (+/-0.041)
Selected params (bests from CV) [None, 0.05, 'l1']

Test f1 (weighted): 0.651
Test Precision [c=0,1]: [ 0.89353083  0.16401028]
Test Recall [c=0,1]: [ 0.59132038  0.53225806]
Test AUC: 0.562
Test Sensibility: 0.532258064516
Test Specificity: 0.591320375335
Confussion matrix:
         | PRED
REAL-->  v 
[[7058 4878]
 [ 841  957]]

Total time: 8.54507684708

In [196]:
res


Out[196]:
[[0,
  'subset_extended_extra_disease_last_early_readmission_vs_none_none',
  0.01,
  'none',
  'after',
  'logReg',
  'recall',
  [None, 0.05, 'l1'],
  0.69047619047619047,
  0,
  0.65279205207262758,
  array([ 0.61510791,  0.69047619]),
  array([ 0.92934783,  0.21323529]),
  0.68586750328323354,
  0.58461538461538454,
  0.037684457581279689,
  0.51084337349397591,
  0.041456989238759651,
  0.58109360518999087,
  0.049441996916956957,
  [0.88651270925178971, 0.15848582540404987],
  [0.012284391197954908, 0.015626190050576267],
  [0.51084337349397591, 0.58461538461538454],
  [0.037684457581279689, 0.041456989238759651],
  0.5934727755533229,
  0.033073194838440151,
  0.532258064516129,
  0.59132037533512061,
  0.56178921992562481,
  array([ 0.59132038,  0.53225806]),
  array([ 0.89353083,  0.16401028]),
  0.65132976138611676,
  array([[7058, 4878],
         [ 841,  957]]),
  8.549112796783447,
  Pipeline(memory=None,
       steps=[('Imputer', TypeFeatImputer(allNameCols=Index([u'gender', u'age', u'race_AfricanAmerican', u'race_Caucasian',
         u'race_Other', u'HbA1c', u'Change', u'time_in_hospital', u'diabetesMed',
         u'diss_home', u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
         u'medSpec_InternalMed...alty='l1', random_state=42, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))])]]

In [ ]: