In [1]:
# Reset the workspace to re-run the notebook from scratch
%reset -f
In [2]:
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV, ParameterGrid,cross_validate
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn.externals.joblib import Memory
from tempfile import mkdtemp
from mlxtend.classifier import StackingCVClassifier, StackingClassifier
from IPython.display import display, HTML
from operator import truediv
from datetime import datetime
import pandas as pd
import sklearn
import time
import os
from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.4f}'.format
plt.style.use('classic')
%matplotlib inline
print('The scikit-learn version is {}.'.format(sklearn.__version__))
In [3]:
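# Project-specific helpers (imputation, feature filtering, pipeline and data-loading utilities) live in ../src/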
import sys
sys.path.insert(1, "../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
import MLpipeline as MLpipeline
from FeatFilter import FeatFilter
import readmision_methods as rm
In [4]:
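# Experiment configuration: keep the last encounter per patient, compare early readmission vs. no
# readmission, and use the "extended_extra_diag_3" feature set over all patients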
typeEncounter = "last" # ['first','last']
typeHypothesis = "early_readmission_vs_none" # ['all_readmisssion_vs_none','early_readmission_vs_none']
typeDataFeatures = "extended_extra_diag_3" # ['reduced','extended','extended_extra','extended_extra_diag_1','extended_extra_diag_3']
typeDataExperiment = "all" #["all", "disease"]
In [25]:
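# Evaluation settings: 5 stratified shuffle splits with a 30% validation fraction, a 70/30 train/test
# partition, feature filters 3 and 6-9 as base learners, logistic regression, recall as the tuning
# metric, and no feature selection or oversampling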
verbose = True
cv_thr = 0.3
cv_folds = 5
tr_thrs = 0.7 # [0.1,0.2,0.4,0.6,1.0]
ts_thr = 1 - tr_thrs
selected_filters = [3,6,7,8,9]
fs_methods = ["none"] #["none","combine_fs","lasso_fs","rfe_rf_fs"]
cls_methods = ["logReg"] #["rf","svmRBF","logReg","knn","nn","gbt"]
lms = ["recall"] #["f1_weighted","average_precision","roc_auc","recall"]
sm_types = ["none"] #["none","after"]
sm_method = "sm_smote"
In [26]:
#Load data
df_all = rm.load_data(typeEncounter, typeDataFeatures)
print "\nSHAPE:"
print df_all.shape
#Filter data by class
df_all = rm.filter_data_by_class(df_all, typeHypothesis)
print "\nSHAPE FILTERED:"
print df_all.shape
print "\nRows by class type:"
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)
#Split into train (tr_thrs fraction) and test (ts_thr fraction)
X_train, X_test, y_train, y_test = MLpipeline.train_test_partition(df_all, ts_thr)
columns = df_all.columns
print "\nTrain:", X_train.shape, "Test:", X_test.shape
#Create filters
featFilters = rm.create_filters(df_all)
print [[f[0],np.sum(f[1])] for f in featFilters]
In [27]:
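# Helpers: create_inner_pipeline builds one base-learner pipeline (feature filter, imputation, scaling,
# variance threshold, classifier) and prefixes its hyperparameter grid so it can be tuned inside the
# stacking ensemble; evaluate prints test-set metrics and the confusion matrix for a set of predictions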
def create_inner_pipeline(param_prefix, catCols,reducedCols, hyperparams, cls_method, featsToFilter=None):
# Create a temporary folder to store the transformers of the pipeline
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=0)
basePipeline = Pipeline([
("FeatFilter", FeatFilter(featsToFilter)),
("Imputer", TypeFeatImputer(catCols, reducedCols)),
("Scaler", StandardScaler()),
("Variance", VarianceThreshold(threshold=0.0))
], memory = memory)
params = {}
pipe = Pipeline(list(basePipeline.steps))
if cls_method == "knn":
pipe.steps.append((cls_method, KNeighborsClassifier()))
if cls_method == "rf":
pipe.steps.append((cls_method, RandomForestClassifier(n_jobs=-1,random_state=42)))
if cls_method == "nb":
pipe.steps.append((cls_method, GaussianNB()))
if cls_method == "logReg":
pipe.steps.append((cls_method, LogisticRegression(random_state=42)))
#Add param prefix
pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
new_pm = {}
for key, values in pm.iteritems():
key = param_prefix + key
new_pm[key] = values
params.update(new_pm)
return pipe,params
def evaluate(name, y_test, y_pred):
test_f1_w = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)
test_p, test_r, test_f1, test_s = metrics.precision_recall_fscore_support(y_test, y_pred,labels=None,average=None, sample_weight=None)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
test_auc = metrics.auc(fpr, tpr)
cm_test = metrics.confusion_matrix(y_test, y_pred)
tn = cm_test[0,0]
fp = cm_test[0,1]
fn = cm_test[1,0]
tp = cm_test[1,1]
test_sens = test_r[1]
test_spec = tn / float(tn+fp)
print "\nEvaluation:", name
print "*******************"
print
print "TEST AUC: %0.3f" % (test_auc)
print "TEST sensitivity:", test_sens
print "TEST Specificity:", test_spec
print
print "TEST f1 (weighted): %0.3f" % (test_f1_w)
print "TEST f1 [c=0,1]", test_f1
print "TEST Precision [c=0,1]:", test_p
print "TEST Recall [c=0,1]:", test_r
print "Confussion matrix:"
print " | PRED"
print "REAL--> v "
print cm_test
In [28]:
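# Build one inner pipeline per selected feature filter, collect their prefixed hyperparameter grids,
# and append the grid for the random forest meta-classifier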
pipes = []
params = []
num_pipeline = 1
for sel in selected_filters:
for cls_method in cls_methods:
n,f = featFilters[sel]
#Get categoric/numeric
f_cols = columns[:-1][f==1].values.tolist()
f_cols.append("readmitted")
catCols, reducedCols = rm.compute_type_features(f_cols)
#Get hyperparams
hyperparams = np.load("../src/default_hyperparams.npy")
#Create pipeline
prefix = "pipeline-" + str(num_pipeline) + "__"
pipe, param = create_inner_pipeline(prefix, catCols, reducedCols, hyperparams,cls_method, f)
#Add results
pipes.append(pipe)
params.append(param)
num_pipeline += 1
#Add meta hyperparams
param_prefix = "meta-"
meta_params = {}
new_pm = {}
pm = hyperparams[hyperparams[:,1] == "rf",2][0]
for key, values in pm.iteritems():
key = param_prefix + key.replace("rf","randomforestclassifier")
new_pm[key] = values
meta_params.update(new_pm)
params.append(meta_params)
print "Num pipelines:", num_pipeline
print "Selected filters:",selected_filters
print "Params:", params
In [ ]:
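# Stack the per-filter pipelines: each base pipeline contributes class probabilities, a random forest
# combines them, and GridSearchCV tunes the whole ensemble for recall with stratified shuffle splits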
cv_inner = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=24)
sclf = StackingClassifier(classifiers=pipes, verbose=0, use_probas=True, average_probas=False,
meta_classifier=RandomForestClassifier())
grid = GridSearchCV(estimator=sclf, param_grid=params, n_jobs=-1,cv=cv_inner,
scoring="recall",refit=True, verbose=1,error_score = 0)
grid.fit(X_train, y_train)
In [10]:
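# Outer cross-validation splits and scorers; precision_0, recall_0 and specificity are custom scorers
# defined in MLpipeline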
cv_outer = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=42)
scorings = {'roc_auc': 'roc_auc',
'f1_weighted':'f1_weighted',
'precision_1':'precision',
'recall_1':'recall',
'precision_0' : metrics.make_scorer(MLpipeline.precision_0),
'recall_0' : metrics.make_scorer(MLpipeline.recall_0),
'spec': metrics.make_scorer(MLpipeline.specificity)
}
In [17]:
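# Cross-validate each fitted base pipeline separately, then list the logistic regression coefficients
# (|coef| > 0.1) of the features that survive the variance filter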
for i, c in enumerate(grid.best_estimator_.clfs_):
#Performance
cv_scores = cross_validate(c, X_train, y_train,
cv=cv_outer, scoring=scorings, n_jobs=-1, return_train_score = False)
for pre in ["test"]:
cv_f1_w_mean = np.mean(cv_scores[pre + "_f1_weighted"])
cv_f1_w_std = np.std(cv_scores[pre + "_f1_weighted"])
cv_p1_mean = np.mean(cv_scores[pre + "_precision_1"])
cv_p1_std = np.std(cv_scores[pre + "_precision_1"])
cv_r1_mean = np.mean(cv_scores[pre + "_recall_1"])
cv_r1_std = np.std(cv_scores[pre + "_recall_1"])
cv_p0_mean = np.mean(cv_scores[pre + "_precision_0"])
cv_p0_std = np.std(cv_scores[pre + "_precision_0"])
cv_r0_mean = np.mean(cv_scores[pre + "_recall_0"])
cv_r0_std = np.std(cv_scores[pre + "_recall_0"])
cv_auc_mean = np.mean(cv_scores[pre + "_roc_auc"])
cv_auc_std = np.std(cv_scores[pre + "_roc_auc"])
cv_spec_mean = np.mean(cv_scores[pre + "_spec"])
cv_spec_std = np.std(cv_scores[pre + "_spec"])
cv_sens_mean = cv_r1_mean
cv_sens_std = cv_r1_std
print "\nEvaluation: ", i
print "*******************"
print "\nf1-weighted score: %0.3f (+/-%0.03f)" % (cv_f1_w_mean,cv_f1_w_std)
print "Prec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f} (+/- {:.3f})".format(cv_p0_mean,cv_p0_std,cv_p1_mean,cv_p1_std)
print "Rec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f} (+/- {:.3f})".format(cv_r0_mean,cv_r0_std,cv_r1_mean,cv_r1_std)
print "AUC score: %0.3f (+/-%0.03f)" % (cv_auc_mean,cv_auc_std)
print "Sensitivity score: %0.3f (+/-%0.03f)" % (cv_sens_mean,cv_sens_std)
print "Specificity score: %0.3f (+/-%0.03f)" % (cv_spec_mean,cv_spec_std)
#Feature importances
sel = selected_filters[i]
n,f = featFilters[sel]
f_cols = columns[:-1][f==1].values
f_cols = f_cols[c.steps[-2][1].get_support()]
print i, sel, n, len(f_cols), len(c.steps[-1][1].coef_[0])
res = np.hstack((f_cols.reshape(-1,1),c.steps[-1][1].coef_.reshape(-1,1)))
print
print res[abs(res[:,1])>0.1,:]
In [18]:
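# Cross-validate the tuned stacking ensemble on the training data and collect train/CV means and
# standard deviations for the summary row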
cv_scores = cross_validate(grid.best_estimator_, X_train, y_train,
cv=cv_outer, scoring=scorings, n_jobs=-1, return_train_score = True)
res = [ts_thr, sm_types[0], fs_methods[0], str(selected_filters), lms[0], cls_methods[0]]
for pre in ["train","test"]:
cv_f1_w_mean = np.mean(cv_scores[pre + "_f1_weighted"])
cv_f1_w_std = np.std(cv_scores[pre + "_f1_weighted"])
cv_p1_mean = np.mean(cv_scores[pre + "_precision_1"])
cv_p1_std = np.std(cv_scores[pre + "_precision_1"])
cv_r1_mean = np.mean(cv_scores[pre + "_recall_1"])
cv_r1_std = np.std(cv_scores[pre + "_recall_1"])
cv_p0_mean = np.mean(cv_scores[pre + "_precision_0"])
cv_p0_std = np.std(cv_scores[pre + "_precision_0"])
cv_r0_mean = np.mean(cv_scores[pre + "_recall_0"])
cv_r0_std = np.std(cv_scores[pre + "_recall_0"])
cv_auc_mean = np.mean(cv_scores[pre + "_roc_auc"])
cv_auc_std = np.std(cv_scores[pre + "_roc_auc"])
cv_spec_mean = np.mean(cv_scores[pre + "_spec"])
cv_spec_std = np.std(cv_scores[pre + "_spec"])
cv_sens_mean = cv_r1_mean
cv_sens_std = cv_r1_std
print "\nEvaluation: ", "CV" if pre == "test" else "Train"
print "*******************"
print "\nf1-weighted score: %0.3f (+/-%0.03f)" % (cv_f1_w_mean,cv_f1_w_std)
print "Prec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f} (+/- {:.3f})".format(cv_p0_mean,cv_p0_std,cv_p1_mean,cv_p1_std)
print "Rec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f} (+/- {:.3f})".format(cv_r0_mean,cv_r0_std,cv_r1_mean,cv_r1_std)
print "AUC score: %0.3f (+/-%0.03f)" % (cv_auc_mean,cv_auc_std)
print "Sensitivity score: %0.3f (+/-%0.03f)" % (cv_sens_mean,cv_sens_std)
print "Specificity score: %0.3f (+/-%0.03f)" % (cv_spec_mean,cv_spec_std)
res.extend([cv_f1_w_mean,cv_f1_w_std,
cv_p0_mean,cv_p0_std,cv_p1_mean,cv_p1_std,
cv_r0_mean,cv_r0_std,cv_r1_mean,cv_r1_std,
cv_auc_mean,cv_auc_std,
cv_sens_mean,cv_sens_std,
cv_spec_mean,cv_spec_std
])
print "\nSummary:"
print "*********"
print grid.best_estimator_
print grid.best_params_
print grid.best_score_
print res
In [19]:
#Predict test data
y_pred = grid.predict(X_test)
#Evaluate results
evaluate("test", y_test, y_pred)
In [23]:
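# Pack the configuration and the train/CV metrics into a single-row DataFrame
# (6 configuration fields + 16 train metrics + 16 CV metrics = 38 columns)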
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
df_aux = pd.DataFrame(np.array(res).reshape(1,38),columns=["ts_thr","sm","fs","pipes","metric","cls",
"tr_f1_w_mean","tr_f1_w_std","tr_p0_mean","tr_p0_std","tr_p1_mean","tr_p1_std",
"tr_r0_mean","tr_r0_std","tr_r1_mean","tr_r1_std","tr_auc_mean","tr_auc_std",
"tr_sens_mean","tr_sens_std","tr_spec_mean","tr_spec_std",
"cv_f1_w_mean","cv_f1_w_std","cv_p0_mean","cv_p0_std","cv_p1_mean","cv_p1_std",
"cv_r0_mean","cv_r0_std","cv_r1_mean","cv_r1_std","cv_auc_mean","cv_auc_std",
"cv_sens_mean","cv_sens_std","cv_spec_mean","cv_spec_std"
])
df_aux[["ts_thr","sm","fs","pipes","metric","cls","tr_auc_mean",
"tr_sens_mean","tr_spec_mean","cv_auc_mean","cv_sens_mean","cv_spec_mean"]].round(2)
Out[23]:
In [24]:
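# Append this run to the accumulated results table; df must already exist here
# (e.g. loaded from the pickle a few cells below)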
df = pd.concat([df,df_aux])
df[["ts_thr","sm","fs","pipes","metric","cls",
"tr_auc_mean","tr_sens_mean","tr_spec_mean","cv_auc_mean","cv_sens_mean","cv_spec_mean"]]
Out[24]:
In [68]:
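# Persist the fitted stacking ensemble for later reuse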
from sklearn.externals import joblib
joblib.dump(grid.best_estimator_, os.path.join('resources','results','ensemble_train_0_2_inner_logReg_outer_rf_3_pipes.pkl'))
In [22]:
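# Save the accumulated results table; the commented line generates a timestamped filename for new runs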
filename = "res_ensemble_20171031-124424"
#filename = 'res_ensemble_' + time.strftime("%Y%m%d-%H%M%S") + '.pkl'
df.to_pickle(os.path.join('resources','results', filename + ".pkl"))
In [ ]:
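# Reload a previously saved results table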
df = pd.read_pickle(os.path.join('resources','results',"res_ensemble_20171031-124424.pkl"))
In [ ]:
df[["tr_auc_mean","tr_sens_mean","tr_spec_mean","cv_auc_mean","cv_sens_mean","cv_spec_mean"]]
In [ ]: