In [1]:
# Clear all state to re-run the notebook from scratch
%reset -f
In [6]:
from sklearn import preprocessing
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from operator import truediv
import pandas as pd
import time  # used below as time.time(); do not also do "from time import time"
import os
import seaborn as sns
import matplotlib.pyplot as plt

np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')
%matplotlib inline

import sys
sys.path.insert(1, "../../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
In [7]:
#df_all=pd.read_csv(os.path.join('resources','diabetic_data_processed_withweight.csv'),';')
df_all=pd.read_pickle(os.path.join('resources','clean_data.pkl'))
print df_all.shape
print df_all.columns
print df_all.readmitted.value_counts()
print df_all.readmitted.value_counts()/float(df_all.shape[0])
In [8]:
# Binarize the target: collapse the two readmission classes (1, 2) into a single positive class
print df_all.loc[:,"readmitted"].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1), np.sum(df_all["readmitted"] == 2)
df_all.loc[df_all["readmitted"] > 0, "readmitted"] = 1  # .loc avoids chained-assignment pitfalls
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)
In [98]:
numCols = [
    "time_in_hospital","num_lab_procedures","num_procedures","num_medications",
    "number_outpatient","number_emergency","number_inpatient","number_diagnoses"
]

# Build a 0/1 mask over the feature columns: 1 = categorical, 0 = numerical
catCols = []
cols = df_all.columns
reducedCols = cols[:-1]  # all columns except the target
for i in range(len(cols) - 1):
    if cols[i] not in numCols:
        catCols.append(1)
    else:
        catCols.append(0)
catCols = np.array(catCols)

print "Cat cols:", np.sum(catCols == 1), "\n", reducedCols[catCols == 1]
print "Num cols:", np.sum(catCols == 0), "\n", reducedCols[catCols == 0]
print len(reducedCols)
In [99]:
y = df_all.readmitted
print y.unique()
print y.value_counts()

y = y.values
X = df_all.iloc[:,:-1].values

sss = StratifiedShuffleSplit(y, 1, test_size=0.30, random_state=32)
for train_index, test_index in sss:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

print
print X_train.shape, y_train.shape
print np.sum(y_train == 0), round(np.sum(y_train == 0)/float(y_train.shape[0]),2), \
    np.sum(y_train > 0), round(np.sum(y_train > 0)/float(y_train.shape[0]),2)

print X_test.shape, y_test.shape
print np.sum(y_test == 0), round(np.sum(y_test == 0)/float(y_test.shape[0]),2), \
    np.sum(y_test > 0), round(np.sum(y_test > 0)/float(y_test.shape[0]),2)
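In [ ]:
# Side note (assumption, not part of the original run): sklearn.cross_validation
# was deprecated in sklearn 0.18 and removed in 0.20. With a newer sklearn, an
# equivalent stratified 70/30 split would be:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.30, stratify=y, random_state=32)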
In [146]:
basePipeline = Pipeline([
    ("Imputer", TypeFeatImputer(catCols, reducedCols)),
    ("Variance", VarianceThreshold(threshold=(.995 * (1 - .995)))),
    ("Scaler", StandardScaler())
])

params = {}
pipeline = []
pipe = Pipeline(list(basePipeline.steps))
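In [ ]:
# Why threshold = .995 * (1 - .995): a Bernoulli feature with frequency p has
# variance p * (1 - p), so this cut drops (near-)binary features where a single
# value covers more than ~99.5% of samples. A toy check of that behaviour:
X_toy = np.hstack([np.ones((1000, 1)), np.random.RandomState(0).randint(0, 2, (1000, 1))])
X_toy[0, 0] = 0  # first column is now 99.9% ones -> variance ~0.001 < 0.00497
vt = VarianceThreshold(threshold=(.995 * (1 - .995)))
print vt.fit_transform(X_toy).shape  # (1000, 1): the near-constant column is dropped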
In [131]:
fs_method = "combine_fs"
pipe.steps.insert(1,(fs_method, UnivCombineFilter(catCols,np.array(reducedCols))))
params.update({fs_method + '__percentile':[30,50,70,80]})
In [139]:
cls_method = "logReg"
pipe.steps.append((cls_method, LogisticRegression(random_state=42)))
params.update({cls_method + '__C': [1e-5,0.001,0.01,0.1,1,5,10]})
params.update({cls_method + '__class_weight': [None, 'balanced']})
params.update({cls_method + '__penalty': ["l1","l2"]})
In [111]:
cls_method = "rf"
pipe.steps.append((cls_method, RandomForestClassifier(n_jobs=-1,random_state=42,class_weight="balanced")))
params.update({cls_method + '__n_estimators': [150,300,500,700],
cls_method + '__criterion': ['gini'],
cls_method + '__max_depth' : [None,4,6,8,10]})
In [302]:
cls_method = "knn"
pipe.steps.append((cls_method, KNeighborsClassifier(n_jobs=-1)))
params.update({cls_method + '__n_neighbors': [3,5,7,9],
cls_method + '__weights': ['uniform', 'distance']})
In [419]:
cls_method = "svm"
pipe.steps.append((cls_method, SVC(kernel = "rbf", random_state=42,probability=True)))
params.update({cls_method + '__C': [0.01,0.1,0.5,1,5,10,15,30,50],
cls_method + '__gamma' : [0.0001,0.001,0.01, 0.1,1,5],
cls_method + '__class_weight': [None, 'balanced']})
In [47]:
cls_method = "nb"
pipe.steps.append((cls_method, GaussianNB()))
#params.update({cls_method + '__alpha': [1e-3,0.001,0.01,0.1,0.5,1,5]})
In [140]:
# Post-process pipeline: wrap the steps in an imblearn pipeline, then restore
# the original step names (make_pipeline auto-generates its own names)
pipe_imb = make_pipeline(*[p[1] for p in pipe.steps])
stps = len(pipe_imb.steps)
for s in range(stps):
    pipe_imb.steps.remove(pipe_imb.steps[0])
for s in range(stps):
    pipe_imb.steps.append(pipe.steps[s])
In [141]:
# Add SMOTE oversampling just before the classifier step
sm_method = "smote"
pipe_imb.steps.insert(stps - 1,
                      (sm_method, SMOTE(ratio='auto', kind='regular', random_state=32)))
params.update({sm_method + "__k_neighbors": [3,4,5]})
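In [ ]:
# Side note (standalone sketch, not needed for the pipeline fit): inside the
# imblearn pipeline, SMOTE resamples only the training folds during fit; the
# CV validation folds and the test set stay untouched. The same call outside
# the pipeline, using the older imbalanced-learn API from above (newer releases
# rename ratio -> sampling_strategy and fit_sample -> fit_resample):
sm = SMOTE(ratio='auto', kind='regular', random_state=32)
X_res, y_res = sm.fit_sample(X_train, y_train)
print "Class counts before:", np.bincount(y_train.astype(int))
print "Class counts after: ", np.bincount(y_res.astype(int))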
In [142]:
verbose = False
mtrs = ["f1_weighted"] #"f1","recall","precision"
cv_thr = 0.3
cv_folds = 5
print pipe.steps
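In [ ]:
# Optional sanity check before launching the (long) grid search: the number of
# candidate parameter settings; GridSearchCV fits each one cv_folds times.
print "Grid candidates:", len(ParameterGrid(params))
print "Total fits:", len(ParameterGrid(params)) * cv_folds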
In [143]:
print "ALL TRAIN:", X_train.shape
print "TRAIN:", "[0's:", np.sum(y_train==0), "1's:", np.sum(y_train==1), "]"
print "ALL TEST:", X_test.shape
print "TEST:", "[0's:", np.sum(y_test==0), "1's:", np.sum(y_test==1), "]"
print "TEST:", "[0's:", np.sum(y_test==0)/float(y_test.shape[0]), "1's:", np.sum(y_test==1)/float(y_test.shape[0]), "]"
# Run experiment
start = time.time()
#Prepare pipe_cls
pipeline_cls = pipe_imb
pipeline_params = params
if verbose:
print "\n",pipeline_cls.steps
#Prepare cv
cv_inner = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr,random_state=24)
print "\nCV TRAIN:", cv_inner.n_train
print "CV_TEST:", cv_inner.n_test
#Fit pipeline with CV
grid_pipelines = []
for m in mtrs:
grid_pipeline = GridSearchCV(pipeline_cls, param_grid=pipeline_params, verbose=1,
n_jobs=-1, cv=cv_inner, scoring= m, error_score = 0,
refit=True)
grid_pipeline.fit(X_train, y_train)
grid_pipelines.append([m,grid_pipeline])
end = time.time()
print "Total time:", end - start
In [144]:
for gp in grid_pipelines:
    print
    print gp[0]
    print "*****\n"
    print gp[1].best_estimator_.steps[-1]
    print

    if gp[1].best_estimator_.steps[1][0] == "combine_fs":
        varFilter = gp[1].best_estimator_.steps[1][1]
        print "Selected percentile:", varFilter.percentile
        feats = reducedCols[varFilter.ixCols].tolist()
        print "Num useful features:", len(feats)
        print "Selected columns:", feats

    if gp[1].best_estimator_.steps[1][0] == "Variance":
        varFilter = gp[1].best_estimator_.steps[1][1]
        feats = reducedCols[varFilter.get_support() == False]
        print "Discarded feats:", len(feats), feats
In [145]:
# Compute train score (with best CV params)
for gp in grid_pipelines:
    cls = gp[1]
    y_pred = cls.predict(X_train)

    train_prec_scores = metrics.precision_score(y_train, y_pred, average='weighted', pos_label=None)
    train_rec_scores = metrics.recall_score(y_train, y_pred, average='weighted', pos_label=None)
    train_f1_scores = metrics.f1_score(y_train, y_pred, average='weighted', pos_label=None)

    print "\nTRAIN: "
    print "**********\n"
    print "Metric:", gp[0]
    print "TR Prec score:", train_prec_scores
    print "TR Rec score:", train_rec_scores
    print "TR F1 score:", train_f1_scores
In [125]:
# Compute pipeline evaluation with CV
for gp in grid_pipelines:
    cls = gp[1]
    print "\nCV: "
    print "******\n"
    print "Metric:", gp[0]
    print "CV selected params {}".format(cls.best_params_.values())

    cv_inner_f1 = cross_val_score(cls.best_estimator_, X_train, y_train,
                                  cv=cv_inner, scoring='f1_weighted', n_jobs=-1)
    print "CV {} score: {}".format(gp[0], cls.best_score_)
    print "CV f1 score: %0.3f (+/-%0.03f)" % (np.mean(cv_inner_f1), np.std(cv_inner_f1))
In [108]:
# Summarize grid-search results (params, mean score, std, per-fold scores)
for gp in grid_pipelines:
    cls = gp[1]
    # Use a local name here; reusing "params" would clobber the grid-search dict
    param_strs = np.array([str(d.values()) for d in np.array(cls.grid_scores_[:])[:,0]])
    mean = np.array(cls.grid_scores_)[:,1]
    values = np.array(cls.grid_scores_)[:,2]
    std = np.array([np.std(v) for v in values])

    dd = np.hstack((param_strs.reshape(-1,1), mean.reshape(-1,1), std.reshape(-1,1), values.reshape(-1,1)))
    res = pd.DataFrame(dd, columns=["params","score_mean","score_std","scores"])
    print gp[0]
    print res
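In [ ]:
# Optional: rank the grid configurations by mean CV score (uses the summary
# DataFrame built in the previous cell).
print res.sort_values("score_mean", ascending=False).head(10)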
In [109]:
# Compute test score
for gp in grid_pipelines:
    cls = gp[1]
    y_pred = cls.predict(X_test)
    test_f1 = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)

    print "\n", gp[0], ":"
    print "**********\n"
    print "Test f1: %0.3f" % (test_f1)
    print "with following performance in test:"
    print metrics.classification_report(y_test, y_pred)

    cm = metrics.confusion_matrix(y_test, y_pred)
    print "\nConfusion matrix:"
    print cm
    print "\nAccuracy:", (cm[0,0] + cm[1,1]) / float(cm[0,0] + cm[1,1] + cm[0,1] + cm[1,0])
    print "Sensitivity:", cm[1,1] / float(cm[1,1] + cm[1,0])  # recall: reduce FN
    print "Specificity:", cm[0,0] / float(cm[0,0] + cm[0,1])  # reduce FP

    y_probs = cls.best_estimator_.predict_proba(X_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_probs[:,1], pos_label=1)
    print "AUC:", metrics.auc(fpr, tpr)
In [149]:
# Select the (single) fitted grid search for the learning-curve plot below
cls = grid_pipelines[0][1]  # mtrs holds one metric, so index 0 (index 1 would raise IndexError)
print cls
In [145]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure(figsize=(8,6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("% Training set")
    plt.ylabel("F1-score")

    train_sizes_lc, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring="f1_weighted")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot against the requested fractions (x-axis shows % of the training set)
    plt.grid(True)
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="b",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.axhline(0.5, color='r', ls='--', label="random")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    return plt
In [148]:
# Learning curves using the inner CV splitter defined above (5 stratified
# shuffle iterations, 30% of the training data held out each time).
title = ""
plot_learning_curve(cls.best_estimator_, title, X_train, y_train, ylim=(0.4, 1.01),
                    cv=cv_inner,
                    train_sizes=[0.05,0.10,0.15,0.25,0.50,0.75,0.80,1.0],
                    n_jobs=-1)
plt.show()