In [172]:
%reset -f
In [173]:
from sklearn import preprocessing
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold, RFE, SelectFromModel
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score  # legacy API (sklearn < 0.20)
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.grid_search import GridSearchCV, ParameterGrid  # legacy API (sklearn < 0.20)
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from operator import truediv
from datetime import datetime
import pandas as pd
import time
import os
from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')
%matplotlib inline
import sys
sys.path.insert(1, "../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
import MLpipeline as MLpipeline
In [174]:
#Local methods
def load_data(typeEncounter, typeDiagnosis, typeDataFeatures):
    if typeDataFeatures == "non_extended":
        df_all = pd.read_pickle(os.path.join('resources', 'prepared_clean_data_' + typeEncounter + "_" + typeDiagnosis + '.pkl'))
    else:
        df_all = pd.read_pickle(os.path.join('resources', 'prepared_clean_data_' + typeEncounter + "_" + typeDiagnosis + '_' + typeDataFeatures + '.pkl'))
return df_all
def get_columns(df_all, typeDiagnosis):
    # (disease column names below keep the dataset's original spellings)
    colsDiseases = []
if typeDiagnosis == "diag_1":
colsDiseases = [u'Diabetis_1', u'Circulatory_1', u'Digestive_1', u'Genitourinary_1', u'Poisoning_1', u'Muscoskeletal_1',
u'Neoplasms_1', u'Respiratory_1']
if typeDiagnosis == "diag_3":
colsDiseases = [u'Diabetis_3', u'Circulatory_3', u'Digestive_3', u'Genitourinary_3', u'Poisoning_3', u'Muscoskeletal_3',
u'Neoplasms_3', u'Respiratory_3']
colsNonDiseases = [c for c in df_all.columns if c not in colsDiseases]
return colsDiseases, colsNonDiseases
def filter_data_by_class(df_all, typeHypothesis):
    # 'readmitted' is assumed coded as 0 = none, 1 = early (<30 days) and
    # higher values = later readmissions (from the data preparation step).
    # Readmitted (any) vs none: collapse all readmissions into one positive class
    if typeHypothesis == "all_readmission_vs_none":
        df_all.loc[df_all["readmitted"] > 0, "readmitted"] = 1
    # Readmitted none vs early readmitted: drop late readmissions
    if typeHypothesis == "early_readmission_vs_none":
        df_all = df_all[df_all["readmitted"].isin([0, 1])]
    return df_all
def compute_type_features(df_all, typeDataFeatures):
    # Build a 0/1 mask over the feature columns (the trailing label column is
    # excluded): 1 marks a categorical feature, 0 a numeric one.
    # typeDataFeatures is currently unused; kept for call-site compatibility.
    numCols = ['time_in_hospital','num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
               'number_emergency', 'number_inpatient', 'number_diagnoses',
               'add_in_out', 'add_procs_meds', 'div_visits_time', 'div_em_time', 'div_visit_med', 'div_em_med',
               'number_treatment','number_treatment_0','number_treatment_1','number_treatment_2','number_treatment_3']
    reducedCols = df_all.columns[:-1]
    catCols = np.array([0 if c in numCols else 1 for c in reducedCols])
    return catCols, reducedCols
def get_diseases(colsDiseases, typeDisease):
if typeDisease == "subset":
return ["subset"]
else:
if typeDisease in colsDiseases:
return [typeDisease]
else:
return colsDiseases
def filter_data_by_diseases(df_all, disease, typeDataExperiment, colsNonDiseases):
if disease == "subset":
df_all_filtered = df_all.copy()
else:
cols_filtered = colsNonDiseases[:]
cols_filtered.insert(-1, disease)
df_all_filtered = df_all[cols_filtered].copy()
if typeDataExperiment == "disease" and disease != "subset":
df_all_filtered = df_all_filtered[df_all_filtered[disease] == 1]
df_all_filtered = df_all_filtered[[c for c in df_all_filtered.columns if c != disease]]
return df_all_filtered
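In [ ]:
# Illustrative sanity check on toy data (hypothetical columns): compute_type_features
# returns a 0/1 mask over every column except the trailing label, with 1 marking a
# categorical feature and 0 a numeric one.
toy = pd.DataFrame([[1, 0, 0], [5, 1, 1]],
                   columns=["time_in_hospital", "race_Caucasian", "readmitted"])
mask, feat_cols = compute_type_features(toy, "non_extended")
print mask             # expected: [0 1]
print list(feat_cols)  # expected: ['time_in_hospital', 'race_Caucasian']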
In [197]:
#Generic methods
def train_test_partition(df_all, ts_thr=0.30):
    # Single stratified shuffle split; the last column ('readmitted') is the label
    y = df_all.readmitted.values
    X = df_all.iloc[:, :-1].values
    sss = StratifiedShuffleSplit(y, 1, test_size=ts_thr, random_state=32) #random_state=42
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return X_train, X_test, y_train, y_test
def train_partition(X_train, y_train, tr_thr=0.10):
X_train_aux = []
y_train_aux = []
if tr_thr >= 1.0:
X_train_aux = X_train
y_train_aux = y_train
else:
sss = StratifiedShuffleSplit(y_train, 1, test_size=1-tr_thr, random_state=32) #random_state=42
for train_index, test_index in sss:
X_train_aux = X_train[train_index]
y_train_aux = y_train[train_index]
return X_train_aux, y_train_aux
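
# Illustrative check with toy arrays (safe to run at definition time):
# train_partition keeps a stratified tr_thr fraction of the training rows,
# preserving the class balance.
_X_toy = np.arange(16).reshape(8, 2)
_y_toy = np.array([0, 0, 0, 0, 1, 1, 1, 1])
_X_sub, _y_sub = train_partition(_X_toy, _y_toy, tr_thr=0.5)
print _X_sub.shape, np.bincount(_y_sub)  # expected: (4, 2) [2 2]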
def create_pipelines(catCols,reducedCols, hyperparams, fs_methods, sm_method, sm_types, cls_methods, lms):
basePipeline = Pipeline([
("Imputer", TypeFeatImputer(catCols, reducedCols)),
("Scaler", StandardScaler()),
("Variance", VarianceThreshold(threshold=0.0))
])
pipeline = []
for fs_method in fs_methods:
for sm_type in sm_types:
for cls_method in cls_methods:
for lm in lms:
if not (fs_method == "rfe_rf_fs" and cls_method == "rf") and not(fs_method == "lasso_fs" and cls_method == "logReg"):
params = {}
pipe = Pipeline(list(basePipeline.steps))
if fs_method == "combine_fs":
pipe.steps.insert(1,(fs_method, UnivCombineFilter(catCols,np.array(reducedCols))))
pm = hyperparams[hyperparams[:,1] == fs_method,2][0]
params.update(pm)
if fs_method == "rfe_rf_fs":
pipe.steps.append((fs_method, RFE(estimator=RandomForestClassifier(class_weight='balanced',
n_estimators=100,
random_state=33))))
pm = hyperparams[hyperparams[:,1] == fs_method,2][0]
params.update(pm)
if fs_method == 'lasso_fs':
pipe.steps.append((fs_method, SelectFromModel(
LogisticRegression(n_jobs=-1, penalty="l1", dual=False, random_state=42))))
pm = hyperparams[hyperparams[:,1] == fs_method,2][0]
params.update(pm)
#Add classifiers
if cls_method == "knn":
pipe.steps.append((cls_method, KNeighborsClassifier()))
pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
params.update(pm)
if cls_method == "logReg":
pipe.steps.append((cls_method, LogisticRegression(random_state=42)))
pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
params.update(pm)
if cls_method == "svmRBF":
pipe.steps.append((cls_method, SVC(random_state=42,probability=True)))
pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
params.update(pm)
if cls_method == "rf":
pipe.steps.append((cls_method, RandomForestClassifier(n_jobs=-1,class_weight='balanced',random_state=42)))
pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
params.update(pm)
if cls_method == "gbt":
pipe.steps.append((cls_method, GradientBoostingClassifier(random_state=42,subsample=0.1,loss="deviance")))
pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
params.update(pm)
if cls_method == "nb":
pipe.steps.append((cls_method, GaussianNB()))
params.update({})
if cls_method == "nn":
pipe.steps.append((cls_method, MLPClassifier(
activation='logistic',
solver='lbfgs',
hidden_layer_sizes=(5, 2),
random_state=13)))
pm = hyperparams[hyperparams[:,1] == cls_method,2][0]
params.update(pm)
                        #Add sampling: rebuild as a fresh imblearn Pipeline so each grid row
                        #owns its own named steps (Pipeline here is imblearn's, which accepts samplers)
                        pipe_imb = Pipeline(list(pipe.steps))
                        if sm_type == "after":
                            #Oversample with SMOTE just before the classifier; the SMOTE step
                            #itself is not tuned, only the classifier grid is searched
                            pipe_imb.steps.insert(len(pipe_imb.steps) - 1,
                                                  (sm_method, SMOTE(ratio='auto', kind='regular', random_state=32)))
pipeline.append([fs_method,sm_type,cls_method,lm,pipe_imb,params])
pipelines = pd.DataFrame(pipeline, columns=["fs","sm","cls","metric","pipe","pipe_params"])
pipelines.sort_values("fs", inplace=True)
return pipelines
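
# Note: with the defaults used below (fs="none", sm_type="after", cls="logReg",
# metric="recall"), create_pipelines yields a single row whose pipe is the
# imblearn Pipeline Imputer -> Scaler -> Variance -> SMOTE -> logReg and whose
# pipe_params holds the hyperparameter grid searched by GridSearchCV.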
def precision_0(ground_truth, predictions):
    # Precision of the negative (non-readmitted) class
    return metrics.precision_score(ground_truth, predictions, pos_label=0)

def recall_0(ground_truth, predictions):
    # Recall of the negative class (equals specificity for binary labels)
    return metrics.recall_score(ground_truth, predictions, pos_label=0)

def specificity(ground_truth, predictions):
    # Specificity = TN / (TN + FP)
    cm = metrics.confusion_matrix(ground_truth, predictions)
    tn, fp = cm[0, 0], cm[0, 1]
    return tn / float(tn + fp)
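
# Quick illustrative check of the custom scorers with toy labels (runs at
# definition time; touches no experiment state). recall_0 and specificity
# coincide by definition for binary labels.
_yt = np.array([0, 0, 1, 1])
_yp = np.array([0, 1, 1, 1])
print specificity(_yt, _yp), recall_0(_yt, _yp)  # expected: 0.5 0.5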
# Run all experiments (one results file per experiment when save=True)
def run(name,df_all, catCols, reducedCols, hyperparams,
ts_thr, tr_thrs, fs_methods, sm_method, sm_types, cls_methods, lms, cv_folds, cv_thr,
verbose=True, save=False):
results = []
for tr_thr in tr_thrs:
X_train, X_test, y_train, y_test = train_test_partition(df_all, ts_thr)
X_train, y_train = train_partition(X_train, y_train, tr_thr)
pipeline = create_pipelines(catCols, reducedCols, hyperparams, fs_methods, sm_method, sm_types, cls_methods, lms)
print "\nDataSet:"
print "**********"
print "**********"
print "SIZE:", tr_thr
print "NAME:", name
print df_all.shape
print "ALL TRAIN:", X_train.shape
print "TRAIN:", "[0's:", np.sum(y_train==0), "1's:", np.sum(y_train==1), "]"
print "ALL TEST:", X_test.shape
print "TEST:", "[0's:", np.sum(y_test==0), "1's:", np.sum(y_test==1), "]"
for num_exp in range(pipeline.shape[0]):
# Run experiment
start = time.time()
#Prepare pipe_cls
pipeline_cls = pipeline["pipe"].iloc[num_exp]
pipeline_params = pipeline["pipe_params"].iloc[num_exp]
fs = pipeline["fs"].iloc[num_exp]
sm = pipeline["sm"].iloc[num_exp]
cls = pipeline["cls"].iloc[num_exp]
lm = pipeline["metric"].iloc[num_exp]
print "\nNum experiment:", str(num_exp), "/", str(pipeline.shape[0] - 1)
print "****************"
print "FS:",fs
print "SM:",sm
print "CLS:",cls
print "METRIC:",lm
#Prepare cv
cv_inner = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=24)
cv_outer = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=42)
#Fit pipeline with CV
grid_pipeline = GridSearchCV(pipeline_cls, param_grid=pipeline_params, verbose=verbose,
n_jobs=-1, cv=cv_inner, scoring= lm, error_score = 0)
grid_pipeline.fit(X_train, y_train)
# Compute Train scores (with best CV params)
y_pred = grid_pipeline.best_estimator_.predict(X_train)
train_f1_w = metrics.f1_score(y_train, y_pred, average='weighted', pos_label=None)
train_p, train_r, train_f1, train_s = metrics.precision_recall_fscore_support(y_train, y_pred,labels=None,average=None, sample_weight=None)
fpr, tpr, _ = metrics.roc_curve(y_train, y_pred)
train_auc = metrics.auc(fpr, tpr)
cm_train = metrics.confusion_matrix(y_train, y_pred)
tn = cm_train[0,0]
fp = cm_train[0,1]
fn = cm_train[1,0]
tp = cm_train[1,1]
train_sens = train_r[1]
train_spec = tn / float(tn+fp)
print "\nTRAIN f1 (weighted): %0.3f" % (train_f1_w)
print "TRAIN Precision [c=0,1]:", train_p
print "TRAIN Recall [c=0,1]:", train_r
print "TRAIN AUC: %0.3f" % (train_auc)
print "TRAIN Sensibility:", train_sens
print "TRAIN Specificity: ", train_spec
# Compute evaluation scores
print "\nCV INNER metric: {}".format(lm)
print "CV INNER selected params {}".format(grid_pipeline.best_params_.values())
print "CV INNER score: {}".format(grid_pipeline.best_score_)
scorings = {'roc_auc': 'roc_auc',
'f1_weighted':'f1_weighted',
'f1':'f1',
'precision_1':'precision',
'recall_1':'recall',
'precision_0' : metrics.make_scorer(precision_0),
'recall_0' : metrics.make_scorer(recall_0),
'spec': metrics.make_scorer(specificity)
}
cv_scores = cross_validate(grid_pipeline.best_estimator_, X_train, y_train,
cv=cv_outer, scoring=scorings, n_jobs=-1,
return_train_score = False)
cv_f1_w_mean = np.mean(cv_scores["test_f1_weighted"])
cv_f1_w_std = np.std(cv_scores["test_f1_weighted"])
cv_p1_mean = np.mean(cv_scores["test_precision_1"])
cv_p1_std = np.std(cv_scores["test_precision_1"])
cv_r1_mean = np.mean(cv_scores["test_recall_1"])
cv_r1_std = np.std(cv_scores["test_recall_1"])
cv_p0_mean = np.mean(cv_scores["test_precision_0"])
cv_p0_std = np.std(cv_scores["test_precision_0"])
cv_r0_mean = np.mean(cv_scores["test_recall_0"])
cv_r0_std = np.std(cv_scores["test_recall_0"])
cv_auc_mean = np.mean(cv_scores["test_roc_auc"])
cv_auc_std = np.std(cv_scores["test_roc_auc"])
cv_spec_mean = np.mean(cv_scores["test_spec"])
cv_spec_std = np.std(cv_scores["test_spec"])
cv_sens_mean = cv_r1_mean
cv_sens_std = cv_r1_std
print "\nCV OUTER f1-weighted score: %0.3f (+/-%0.03f)" % (cv_f1_w_mean,cv_f1_w_std)
print "CV OUTER prec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f} (+/- {:.3f})".format(cv_p0_mean,cv_p0_std,cv_p1_mean,cv_p1_std)
print "CV OUTER rec score [c=0,1]: {:.3f} (+/- {:.3f}), {:.3f} (+/- {:.3f})".format(cv_r0_mean,cv_r0_std,cv_r1_mean,cv_r1_std)
print "CV OUTER AUC score: %0.3f (+/-%0.03f)" % (cv_auc_mean,cv_auc_std)
print "CV OUTER Sensibility score: %0.3f (+/-%0.03f)" % (cv_sens_mean,cv_sens_std)
print "CV OUTER Specificity score: %0.3f (+/-%0.03f)" % (cv_spec_mean,cv_spec_std)
print "Selected params (bests from CV) {}".format(grid_pipeline.best_params_.values())
#Compute test scores
y_pred = grid_pipeline.best_estimator_.predict(X_test)
test_f1_w = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)
test_p, test_r, test_f1, test_s = metrics.precision_recall_fscore_support(y_test, y_pred,labels=None,average=None, sample_weight=None)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
test_auc = metrics.auc(fpr, tpr)
cm_test = metrics.confusion_matrix(y_test, y_pred)
tn = cm_test[0,0]
fp = cm_test[0,1]
fn = cm_test[1,0]
tp = cm_test[1,1]
test_sens = test_r[1]
test_spec = tn / float(tn+fp)
print "\nTEST f1 (weighted): %0.3f" % (test_f1_w)
print "TEST Precision [c=0,1]:", test_p
print "TEST Recall [c=0,1]:", test_r
print "TEST AUC: %0.3f" % (test_auc)
print "TEST Sensibility:", test_sens
print "TEST Specificity:", test_spec
print "Confussion matrix:"
print " | PRED"
print "REAL--> v "
print cm_test
end = time.time()
print "\nTotal time:", end - start
res = [num_exp,
name,
tr_thr,
fs,
sm,
cls,
lm,
grid_pipeline.best_params_.values(),
train_sens,
train_spec,
train_auc,
train_r,
train_p,
train_f1_w,
cv_sens_mean,
cv_sens_std,
cv_spec_mean,
cv_spec_std,
cv_auc_mean,
cv_auc_std,
[cv_p0_mean,cv_p1_mean],
[cv_p0_std,cv_p1_std],
[cv_r0_mean,cv_r1_mean],
                   [cv_r0_std,cv_r1_std],
cv_f1_w_mean,
cv_f1_w_std,
test_sens,
test_spec,
test_auc,
test_r,
test_p,
test_f1_w,
cm_test,
end - start,
grid_pipeline.best_estimator_
]
results.append(res)
#Save results
if save:
df = pd.DataFrame(np.array(res).reshape(1,35), columns=
["exp", "name",
"size_tr","fs","sm","cls","metric","params",
"tr_sens","tr_spec","tr_auc",
"tr_prec","tr_rec","tr_f1",
"cv_sens_mean","cv_sens_std","cv_spec_mean","cv_spec_std","cv_auc_mean","cv_auc_std",
"cv_prec_mean","cv_prec_std","cv_rec_mean","cv_rec_std",
"cv_f1_mean","cv_f1_std",
"test_sens","test_spec","test_auc",
"test_rec","test_prec","test_f1",
"cm_test",
"time","pipeline"])
df.to_pickle(os.path.join("resources", "results",
'results_pipe_' +
"test_" + str(ts_thr) + "_" +
"train_" + str(tr_thr) + "_" +
str(name) + '_' +
str(fs) + '_' +
str(sm) + '_' +
str(lm) + '_' +
str(cls) + '_' +
time.strftime("%Y%m%d-%H%M%S") +
'.pkl'))
return results
In [198]:
typeEncounter = "last" # ['first','last']
typeHypothesis = "early_readmission_vs_none" # ['all_readmisssion_vs_none','early_readmission_vs_none']
typeDataFeatures = "extended_extra" # ["non_extended","extended','extended_extra']
#Extended -> Subset of columns
#Minimum -> minimum set of columns
typeDiagnosis = "none" #["none","diag_1", "diag_3"]
typeDisease = "subset" # ["subset","any",["Respiratory",...]]
#subset -> Return the subset of predefined disease features
#any -> Return all disease features
#<disease name> -> Return only that disease feature
typeDataExperiment = "disease" #["all", "disease"]
#all -> Include all diagnosis as columns
#disease -> Remove diagnosis as column and keep only rows with diagnosis == 1
In [199]:
verbose = True
cv_thr = 0.3
cv_folds = 5
tr_thrs = [0.01] # [0.1,0.2,0.4,0.6,1.0]
ts_thr = 0.30
In [200]:
fs_methods = ["none",] #["none","combine_fs","lasso_fs","rfe_rf_fs"]
cls_methods = ["logReg"] #["rf","svmRBF","logReg","knn","nn","gbt"]
lms = ["recall"] #["f1_weighted","precision_weighted","roc_auc","recall"]
sm_types = ["after"] #["none","after"]
sm_method = "sm_smote"
In [201]:
#Load default params
hyperparams = np.load("../src/default_hyperparams.npy")
print hyperparams
In [202]:
#Load data
df_all = load_data(typeEncounter, typeDiagnosis, typeDataFeatures)
print "\nSHAPE:"
print df_all.shape
print "\nInitial columns:"
print df_all.columns
#Filter data by class
df_all = filter_data_by_class(df_all, typeHypothesis)
print "\nRows by class type:"
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)
#Get columns
colsDiseases, colsNonDiseases = MLpipeline.get_columns(df_all,typeDiagnosis)
print "\nDiseases:", colsDiseases
print "\nNon-diseases:", colsNonDiseases
#Load diseases
diseases = get_diseases(colsDiseases, typeDisease)
print "\nTotal data:", df_all.shape
print diseases
In [203]:
from IPython.display import display, HTML
for disease in diseases:
df_all_filtered = filter_data_by_diseases(df_all, disease, typeDataExperiment, colsNonDiseases)
catCols, reducedCols = compute_type_features(df_all_filtered, typeDataFeatures)
#Apply hyperparams changes
hyperparams = np.load("../src/default_hyperparams.npy")
hyperparams[hyperparams[:,1] == 'rfe_rf_fs',2] = [{'rfe_rf_fs__n_features_to_select': [int(len(reducedCols) * 0.2),
int(len(reducedCols) * 0.4),
int(len(reducedCols) * 0.6)],
'rfe_rf_fs__step': [0.1]}]
p = create_pipelines(catCols,reducedCols, hyperparams, fs_methods, sm_method, sm_types, cls_methods, lms)
name = disease + "_" + typeDataFeatures + "_" + typeDataExperiment + "_" + typeEncounter + "_" + \
typeHypothesis + "_" + typeDiagnosis
res = run(name, df_all_filtered, catCols, reducedCols, hyperparams, ts_thr, tr_thrs,
fs_methods, sm_method, sm_types,
cls_methods, lms, cv_folds, cv_thr, True, False)
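In [ ]:
# If run(..., save=True) is used, each experiment lands in its own pickle under
# resources/results/. A minimal sketch (assumed paths) to gather them back:
# import glob
# files = glob.glob(os.path.join("resources", "results", "results_pipe_*.pkl"))
# df_res = pd.concat([pd.read_pickle(f) for f in files], ignore_index=True)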
In [196]:
res
Out[196]:
In [ ]: