In [1]:
    
#TO RE-RUN
%reset -f
    
In [2]:
    
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV, ParameterGrid,cross_validate
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn.externals.joblib import Memory
from tempfile import mkdtemp
from mlxtend.classifier import StackingCVClassifier, StackingClassifier
from IPython.display import display, HTML
from operator import truediv
from datetime import datetime
import pandas as pd
import sklearn
import time
import os
from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.4f}'.format
plt.style.use('classic')
%matplotlib inline
print('The scikit-learn version is {}.'.format(sklearn.__version__))
    
    
    
In [3]:
    
import sys
sys.path.insert(1, "../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
import MLpipeline as MLpipeline
from FeatFilter import FeatFilter
import readmision_methods as rm
    
In [4]:
    
typeEncounter = "last" # ['first','last']
typeHypothesis = "early_readmission_vs_none" # ['all_readmission_vs_none','early_readmission_vs_none']
typeDataFeatures = "extended_extra_diag_3" # ['reduced','extended','extended_extra','extended_extra_diag_1','extended_extra_diag_3']
typeDataExperiment = "all" #["all", "disease"]
    
In [25]:
    
verbose = True
cv_thr = 0.3
cv_folds = 5
tr_thrs = 0.7 # [0.1,0.2,0.4,0.6,1.0]
ts_thr = 1 - tr_thrs
selected_filters = [3,6,7,8,9]
fs_methods = ["none"] #["none","combine_fs","lasso_fs","rfe_rf_fs"]
cls_methods = ["logReg"] #["rf","svmRBF","logReg","knn","nn","gbt"]
lms = ["recall"] #["f1_weighted","average_precision","roc_auc","recall"]
sm_types = ["none"] #["none","after"]
sm_method = "sm_smote"
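# sm_types is "none" for this run, so no oversampling step is added anywhere in
# this notebook; sm_method is only recorded in the results table. If oversampling
# were wanted, one possible way (a sketch, not part of the original code) would be
# to insert a SMOTE step before the classifier in each inner imblearn pipeline:
#     pipe.steps.insert(-1, ("sm_smote", SMOTE(random_state=42)))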
    
In [26]:
    
#Load data
df_all = rm.load_data(typeEncounter, typeDataFeatures)
print "\nSHAPE:"
print df_all.shape
#Filter data by class
df_all = rm.filter_data_by_class(df_all, typeHypothesis)
print "\nSHAPE FILTERED:"
print df_all.shape
print "\nRows by class type:"
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)
#Train/test split (hold out ts_thr fraction of rows as the test set)
X_train, X_test, y_train, y_test = MLpipeline.train_test_partition(df_all, ts_thr)
columns = df_all.columns
print "\nTrain:", X_train.shape, "Test:",  X_test.shape
#Create filters
featFilters = rm.create_filters(df_all)
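# featFilters is apparently a list of (filter name, binary feature mask) pairs;
# print each filter's name and how many features it keeps.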
print [[f[0],np.sum(f[1])] for f in featFilters]
    
    
In [27]:
    
def create_inner_pipeline(param_prefix, catCols,reducedCols, hyperparams, cls_method, featsToFilter=None):
    
    # Create a temporary folder to store the transformers of the pipeline
    cachedir = mkdtemp()
    memory = Memory(cachedir=cachedir, verbose=0)                        
    
    basePipeline = Pipeline([
            ("FeatFilter", FeatFilter(featsToFilter)),
            ("Imputer", TypeFeatImputer(catCols, reducedCols)),
            ("Scaler", StandardScaler()),
            ("Variance", VarianceThreshold(threshold=0.0))
        ], memory = memory)
    params = {}
    pipe = Pipeline(list(basePipeline.steps))
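    # Note: rebuilding the Pipeline from basePipeline.steps reuses only the step
    # list; the Memory cache configured above is not carried over.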
    if cls_method == "knn":
        pipe.steps.append((cls_method, KNeighborsClassifier()))
                        
    if cls_method == "rf":
        pipe.steps.append((cls_method, RandomForestClassifier(n_jobs=-1,random_state=42)))
    if cls_method == "nb":
        pipe.steps.append((cls_method, GaussianNB()))
    if cls_method == "logReg":
        pipe.steps.append((cls_method, LogisticRegression(random_state=42)))
    #Add param prefix
    pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
    new_pm = {}
    for key, values in pm.iteritems():
        key = param_prefix + key
        new_pm[key] = values
    params.update(new_pm) 
                    
    return pipe,params
def evaluate(name, y_test, y_pred):
    test_f1_w = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)
    test_p, test_r, test_f1, test_s = metrics.precision_recall_fscore_support(y_test, y_pred,labels=None,average=None, sample_weight=None)
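    # roc_curve on hard class predictions gives a two-point ROC, so this AUC
    # reduces to (sensitivity + specificity) / 2; predicted probabilities would
    # be needed for a full curve.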
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    test_auc = metrics.auc(fpr, tpr)                
    cm_test = metrics.confusion_matrix(y_test, y_pred)
    tn = cm_test[0,0]
    fp = cm_test[0,1]
    fn = cm_test[1,0]
    tp = cm_test[1,1]
    test_sens = test_r[1]
    test_spec = tn / float(tn+fp)
    print "\nEvaluation:", name
    print "*******************"
    print          
    print "TEST AUC: %0.3f" % (test_auc)                
    print "TEST sensitivity:", test_sens
    print "TEST Specificity:", test_spec
    print
    print "TEST f1 (weighted): %0.3f" % (test_f1_w)
    print "TEST f1 [c=0,1]", test_f1
    print "TEST Precision [c=0,1]:", test_p
    print "TEST Recall [c=0,1]:", test_r  
    
    print "Confussion matrix:"
    print "         | PRED"
    print "REAL-->  v "
    print cm_test
    
In [28]:
    
pipes = []
params = []
num_pipeline = 1
for sel in selected_filters:
    for cls_method  in cls_methods:
        
        n,f = featFilters[sel]
        #Get categoric/numeric
        f_cols = columns[:-1][f==1].values.tolist()
        f_cols.append("readmitted")       
        catCols, reducedCols = rm.compute_type_features(f_cols)
        #Get hyperparams
        hyperparams = np.load("../src/default_hyperparams.npy")
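        # default_hyperparams.npy appears to hold one row per classifier, with
        # the method name in column 1 and a dict of grid values in column 2.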
        #Create pipeline
        prefix = "pipeline-" + str(num_pipeline) + "__"
        pipe, param = create_inner_pipeline(prefix, catCols, reducedCols, hyperparams,cls_method, f)
        #Add results
        pipes.append(pipe)
        params.append(param)
        
        num_pipeline += 1
#Add meta hyperparams
param_prefix = "meta-"
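# In the mlxtend version used here, the meta-classifier's parameters are exposed
# under the "meta-<estimator name>__" prefix, hence the rename of "rf" to
# "randomforestclassifier" below.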
meta_params = {}
new_pm = {}
pm = hyperparams[hyperparams[:,1] == "rf",2][0]
for key, values in pm.iteritems():
    key = param_prefix + key.replace("rf","randomforestclassifier")
    new_pm[key] = values
meta_params.update(new_pm)
params.append(meta_params)
print "Num pipelines:", num_pipeline
print "Selected filters:",selected_filters
print "Params:", params
    
    
In [ ]:
    
cv_inner = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=24)
sclf = StackingClassifier(classifiers=pipes, verbose=0, use_probas=True, average_probas=False,
                            meta_classifier=RandomForestClassifier())    
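# Note: a list of dicts as param_grid is treated by GridSearchCV as alternative
# grids, so each pipeline's hyperparameters (and the meta-classifier's) are
# searched one group at a time rather than jointly.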
grid = GridSearchCV(estimator=sclf, param_grid=params, n_jobs=-1,cv=cv_inner, 
                    scoring="recall",refit=True, verbose=1,error_score = 0)
grid.fit(X_train, y_train)
    
    
In [10]:
    
cv_outer = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=42)
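# The built-in 'precision'/'recall' scorers report class 1 (the default pos_label);
# the MLpipeline helpers presumably report the same metrics for class 0, plus specificity.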
scorings = {'roc_auc': 'roc_auc',
            'f1_weighted':'f1_weighted',
            'precision_1':'precision',
            'recall_1':'recall',
            'precision_0' : metrics.make_scorer(MLpipeline.precision_0),
            'recall_0' : metrics.make_scorer(MLpipeline.recall_0),
            'spec': metrics.make_scorer(MLpipeline.specificity)
           }
    
In [17]:
    
for i, c in enumerate(grid.best_estimator_.clfs_):
    
    
    #Performance
    cv_scores = cross_validate(c, X_train, y_train, 
                               cv=cv_outer, scoring=scorings, n_jobs=-1, return_train_score = False)
    
    for pre in ["test"]:
        cv_f1_w_mean = np.mean(cv_scores[pre + "_f1_weighted"])
        cv_f1_w_std = np.std(cv_scores[pre + "_f1_weighted"])
        cv_p1_mean = np.mean(cv_scores[pre + "_precision_1"])
        cv_p1_std = np.std(cv_scores[pre + "_precision_1"])
        cv_r1_mean = np.mean(cv_scores[pre + "_recall_1"])
        cv_r1_std = np.std(cv_scores[pre + "_recall_1"])                
        cv_p0_mean = np.mean(cv_scores[pre + "_precision_0"])
        cv_p0_std = np.std(cv_scores[pre + "_precision_0"])
        cv_r0_mean = np.mean(cv_scores[pre + "_recall_0"])
        cv_r0_std = np.std(cv_scores[pre + "_recall_0"])
        cv_auc_mean = np.mean(cv_scores[pre + "_roc_auc"])
        cv_auc_std = np.std(cv_scores[pre + "_roc_auc"])                
        cv_spec_mean = np.mean(cv_scores[pre + "_spec"])
        cv_spec_std = np.std(cv_scores[pre + "_spec"])
        cv_sens_mean = cv_r1_mean
        cv_sens_std = cv_r1_std
        print "\nEvaluation: ", i
        print "*******************"
        print "\nf1-weighted score: %0.3f  (+/-%0.03f)" % (cv_f1_w_mean,cv_f1_w_std)               
        print "Prec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f}  (+/- {:.3f})".format(cv_p0_mean,cv_p0_std,cv_p1_mean,cv_p1_std)                
        print "Rec  score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f}  (+/- {:.3f})".format(cv_r0_mean,cv_r0_std,cv_r1_mean,cv_r1_std)
        print "AUC score: %0.3f  (+/-%0.03f)" % (cv_auc_mean,cv_auc_std) 
        print "Sensitivity score: %0.3f  (+/-%0.03f)" % (cv_sens_mean,cv_sens_std) 
        print "Specificity score: %0.3f  (+/-%0.03f)" % (cv_spec_mean,cv_spec_std) 
        
    #Feature importances
    sel = selected_filters[i]
    n,f = featFilters[sel]
    f_cols = columns[:-1][f==1].values    
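    # Keep only the feature names that survive the VarianceThreshold step
    # (the second-to-last step of the fitted pipeline).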
    f_cols = f_cols[c.steps[-2][1].get_support()]
    
    print i, sel, n, len(f_cols), len(c.steps[-1][1].coef_[0])
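    # Pair the remaining feature names with the logistic regression coefficients
    # and list those with |coef| > 0.1.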
    
    res = np.hstack((f_cols.reshape(-1,1),c.steps[-1][1].coef_.reshape(-1,1)))
    print
    print res[abs(res[:,1])>0.1,:]
    
    
In [18]:
    
cv_scores = cross_validate(grid.best_estimator_, X_train, y_train, 
                           cv=cv_outer, scoring=scorings, n_jobs=-1, return_train_score = True)
res = [ts_thr,sm_types[0],fs_methods[0],str(selected_filters),lms[0],cls_methods[0]]
for pre in ["train","test"]:
    cv_f1_w_mean = np.mean(cv_scores[pre + "_f1_weighted"])
    cv_f1_w_std = np.std(cv_scores[pre + "_f1_weighted"])
    cv_p1_mean = np.mean(cv_scores[pre + "_precision_1"])
    cv_p1_std = np.std(cv_scores[pre + "_precision_1"])
    cv_r1_mean = np.mean(cv_scores[pre + "_recall_1"])
    cv_r1_std = np.std(cv_scores[pre + "_recall_1"])                
    cv_p0_mean = np.mean(cv_scores[pre + "_precision_0"])
    cv_p0_std = np.std(cv_scores[pre + "_precision_0"])
    cv_r0_mean = np.mean(cv_scores[pre + "_recall_0"])
    cv_r0_std = np.std(cv_scores[pre + "_recall_0"])
    cv_auc_mean = np.mean(cv_scores[pre + "_roc_auc"])
    cv_auc_std = np.std(cv_scores[pre + "_roc_auc"])                
    cv_spec_mean = np.mean(cv_scores[pre + "_spec"])
    cv_spec_std = np.std(cv_scores[pre + "_spec"])
    cv_sens_mean = cv_r1_mean
    cv_sens_std = cv_r1_std
    print "\nEvaluation: ", "CV" if pre == "test" else "Train"
    print "*******************"
    print "\nf1-weighted score: %0.3f  (+/-%0.03f)" % (cv_f1_w_mean,cv_f1_w_std)               
    print "Prec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f}  (+/- {:.3f})".format(cv_p0_mean,cv_p0_std,cv_p1_mean,cv_p1_std)                
    print "Rec  score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f}  (+/- {:.3f})".format(cv_r0_mean,cv_r0_std,cv_r1_mean,cv_r1_std)
    print "AUC score: %0.3f  (+/-%0.03f)" % (cv_auc_mean,cv_auc_std) 
    print "Sensitivity score: %0.3f  (+/-%0.03f)" % (cv_sens_mean,cv_sens_std) 
    print "Specificity score: %0.3f  (+/-%0.03f)" % (cv_spec_mean,cv_spec_std)
    
    res.extend([cv_f1_w_mean,cv_f1_w_std,
                cv_p0_mean,cv_p0_std,cv_p1_mean,cv_p1_std,
                cv_r0_mean,cv_r0_std,cv_r1_mean,cv_r1_std,
                cv_auc_mean,cv_auc_std,
                cv_sens_mean,cv_sens_std,
                cv_spec_mean,cv_spec_std
               ])
print "\nSummary:"
print "*********"
print grid.best_estimator_
print grid.best_params_
print grid.best_score_
print res
    
    
In [19]:
    
#Predict test data    
y_pred = grid.predict(X_test)
#Evaluate results
evaluate("test", y_test, y_pred)
    
    
In [23]:
    
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
df_aux = pd.DataFrame(np.array(res).reshape(1,38),columns=["ts_thr","sm","fs","pipes","metric","cls",
"tr_f1_w_mean","tr_f1_w_std","tr_p0_mean","tr_p0_std","tr_p1_mean","tr_p1_std",
"tr_r0_mean","tr_r0_std","tr_r1_mean","tr_r1_std","tr_auc_mean","tr_auc_std",
"tr_sens_mean","tr_sens_std","tr_spec_mean","tr_spec_std",
"cv_f1_w_mean","cv_f1_w_std","cv_p0_mean","cv_p0_std","cv_p1_mean","cv_p1_std",
"cv_r0_mean","cv_r0_std","cv_r1_mean","cv_r1_std","cv_auc_mean","cv_auc_std",
"cv_sens_mean","cv_sens_std","cv_spec_mean","cv_spec_std"                                                       
 ])
df_aux[["ts_thr","sm","fs","pipes","metric","cls","tr_auc_mean",
        "tr_sens_mean","tr_spec_mean","cv_auc_mean","cv_sens_mean","cv_spec_mean"]].round(2)
    
    Out[23]:
In [24]:
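# df presumably holds previously accumulated results (e.g. loaded from the
# results pickle in a later cell); append the new row to it.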
    
df = pd.concat([df,df_aux])
df[["ts_thr","sm","fs","pipes","metric","cls",
    "tr_auc_mean","tr_sens_mean","tr_spec_mean","cv_auc_mean","cv_sens_mean","cv_spec_mean"]]
    
    Out[24]:
In [68]:
    
from sklearn.externals import joblib
joblib.dump(grid.best_estimator_, os.path.join('resources','results','ensemble_train_0_2_inner_logReg_outer_rf_3_pipes.pkl'))
    
In [22]:
    
filename = "res_ensemble_20171031-124424"
#filename = 'res_ensemble_' + time.strftime("%Y%m%d-%H%M%S") + '.pkl'
df.to_pickle(os.path.join('resources','results', filename + ".pkl"))
    
In [ ]:
    
df = pd.read_pickle(os.path.join('resources','results',"res_ensemble_20171031-124424.pkl"))
    
In [ ]:
    
df[["tr_auc_mean","tr_sens_mean","tr_spec_mean","cv_auc_mean","cv_sens_mean","cv_spec_mean"]]
    
In [ ]: