In [64]:
%reset -f

In [75]:
from IPython.display import display, HTML
from sklearn import metrics
from sklearn import cross_validation
from scipy import interp
import pandas as pd
import numpy as np
import os

from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score
from sklearn.model_selection import cross_validate

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt


np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.4f}'.format
plt.style.use('classic')

%matplotlib inline

Load domain data & methods


In [66]:
import sys
sys.path.insert(1, "../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
import MLpipeline as MLpipeline
import readmision_methods as rm

In [77]:
verbose = True
cv_thr = 0.3
cv_folds = 5

tr_thrs = [1.0] # [0.1,0.2,0.4,0.6,1.0]
ts_thr = 0.30

typeEncounter = "last" # ['first','last']
typeHypothesis = "all_readmisssion_vs_none" # ['all_readmisssion_vs_none','early_readmission_vs_none']
typeDataFeatures = "reduced" # ["reduced","extended','extended_extra','extended_extra_diag_1','extended_extra_diag_3']
    #Extended_Extra_diag_1 -> Extended extra columns with disease in diag_3
    #Extended_Extra_diag_1 -> Extended extra columns with disease in diag_1
    #Extended_Extra -> Extended extra of columns
    #Extended -> Extended columns
    #Reduced -> minimum set of columns 
typeDataExperiment = "disease" #["all", "disease"] 
    #all -> No filter disease column
    #disease -> Filter/Remove disease column and keep only rows with diagnosis column == 1  

#name = typeDataExperiment + "_" + typeDataFeatures + "_" +  typeEncounter + "_" + typeHypothesis
name = typeDataFeatures + "_" +  typeEncounter + "_" + typeHypothesis

print name


reduced_last_all_readmisssion_vs_none

In [81]:
#Load data
df_all = rm.load_data(typeEncounter, typeDataFeatures)
print "\nSHAPE:"
print df_all.shape

#Filter data by class
df_all = rm.filter_data_by_class(df_all, typeHypothesis)
print "\nSHAPE AFTER FILTERING:"
print df_all.shape

print "\nRows by class type:"
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)

#Train & Test
X_train, X_test, y_train, y_test = MLpipeline.train_test_partition(df_all)
print "\nTrain:", X_train.shape, "Test:",  X_test.shape

#CV data
cv_outer = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr, random_state=42)
print "\n CV:", "Train", cv_outer.n_train, "test:", cv_outer.n_test


SHAPE:
(67182, 42)

SHAPE AFTER FILTERING:
(67182, 42)

Rows by class type:
[0 1] 39785 27397

Train: (47027, 41) Test: (20155, 41)

 CV: 32918 14109

Load pipeline results


In [93]:
fname = "summary_reduced_last_all_readmisssion_vs_none_20171025-115456.pkl"
class_names = ["Non-readmission","Readmission"]


pathDir = os.path.join('resources','results') 


dd = pd.read_pickle(os.path.join(pathDir,fname))
dd.index = range(0,dd.shape[0])
print dd.columns
print dd.shape
dd[["sm","fs","metric","cls","params",
    "tr_auc","tr_sens","tr_spec",
    "cv_auc_mean","cv_f1_mean","cv_sens_mean","cv_spec_mean",
    "test_auc","test_f1","test_sens","test_spec"]]


Index([u'exp', u'name', u'size_tr', u'fs', u'sm', u'cls', u'metric', u'params',
       u'tr_sens', u'tr_spec', u'tr_auc', u'tr_prec', u'tr_rec', u'tr_f1',
       u'cv_sens_mean', u'cv_sens_std', u'cv_spec_mean', u'cv_spec_std',
       u'cv_auc_mean', u'cv_auc_std', u'cv_prec_mean', u'cv_prec_std',
       u'cv_rec_mean', u'cv_rec_std', u'cv_f1_mean', u'cv_f1_std',
       u'test_sens', u'test_spec', u'test_auc', u'test_rec', u'test_prec',
       u'test_f1', u'cm_test', u'time', u'pipeline', u'cv_auc', u'cv_f1',
       u'cv_sens', u'cv_spec', u'disease'],
      dtype='object')
(6, 40)
Out[93]:
sm fs metric cls params tr_auc tr_sens tr_spec cv_auc_mean cv_f1_mean cv_sens_mean cv_spec_mean test_auc test_f1 test_sens test_spec
0 none none f1_weighted rf [gini, 12, 400] 0.6936 0.6732 0.7140 0.6536 0.6153 0.5659 0.6462 0.6101 0.6175 0.5833 0.6369
1 none none f1 rf [gini, 8, 350] 0.6234 0.6062 0.6407 0.6493 0.6121 0.5836 0.6271 0.6077 0.6131 0.5947 0.6207
2 none none recall rf [gini, 8, 350] 0.6236 0.6064 0.6409 0.6493 0.6121 0.5836 0.6271 0.6078 0.6131 0.5947 0.6208
3 none none f1_weighted logReg [balanced, 0.005, l2] 0.5978 0.5673 0.6282 0.6382 0.6027 0.5607 0.6276 0.5971 0.6050 0.5678 0.6263
4 none none f1 logReg [balanced, 0.0001, l2] 0.5940 0.5807 0.6073 0.6307 0.5941 0.5772 0.6003 0.5930 0.5984 0.5806 0.6053
5 none none recall logReg [balanced, 1e-05, l2] 0.5858 0.5911 0.5805 0.6207 0.5854 0.5881 0.5776 0.5884 0.5906 0.5957 0.5811

Plot SELECTED pipelines


In [73]:
plt.figure(figsize=(8,6))
ax = plt.subplot(1,1,1)
dd[["test_auc"]].plot(ax=ax,y="test_auc", color="g")
dd[["cv_auc_mean", "cv_auc_std"]].plot(ax=ax, y="cv_auc_mean", yerr="cv_auc_std",color="b")
ax.fill_between(dd.index,dd.cv_auc_mean - dd.cv_auc_std, dd.cv_auc_mean + dd.cv_auc_std, alpha=0.1,color="b")


txt = [dd["cls"].iloc[i] + "_" + str(dd.iloc[i].metric) + "_"+ str(dd.iloc[i].params) for i in range(dd.shape[0])]
plt.xticks(dd.index,txt,rotation=90)
plt.xlim(-0.05,5.05)


Out[73]:
(-0.05, 5.05)

Plot best pipeline


In [84]:
cls_name = dd["cls"].iloc[0] + "_" + str(dd.iloc[0].metric) + "_"+ str(dd.iloc[0].params)
cls = dd.iloc[0].pipeline


Pipeline(memory=None,
     steps=[('Imputer', TypeFeatImputer(allNameCols=Index([u'gender', u'age', u'race_AfricanAmerican', u'race_Caucasian',
       u'race_Other', u'HbA1c', u'Change', u'time_in_hospital', u'diabetesMed',
       u'diss_home', u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
       u'medSpec_InternalMed...imators=400, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))])

ROC curve


In [142]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

#y_score = cls.decision_function(X_test) #For D0 & D1
y_pred = cls.predict(X_test)
y_prob = cls.predict_proba(X_test)
    
fpr_test, tpr_test, _ = metrics.roc_curve(y_test, y_prob[:,1])
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
#fpr_test, tpr_test, _ = metrics.roc_curve(y_test, y_score, pos_label=1)

test_auc_w = metrics.auc(fpr, tpr) 
#test_auc_w = metrics.roc_auc_score(y_test, y_score, average='weighted')

print y_test
print y_pred
print
print test_auc_w
print tpr_test
print fpr_test

plt.figure(figsize=(12,6))

ax = plt.subplot(1,2,1)
ax.plot(fpr_test, tpr_test, color="g",
         label='AUC Test = %0.2f' % (test_auc_w),lw=2, alpha=.8)

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='random', alpha=.3)
ax.legend(loc='lower right')
ax.set_xlabel("fpr")
ax.set_ylabel("tpr")
ax.set_xlim(-0.05,1.05)
ax.set_ylim(0,1.05)
plt.show()


[0 0 1 ..., 1 0 0]
[1 0 0 ..., 0 1 0]

0.610028875665
[ 0.00012167  0.00024334  0.00024334 ...,  0.99987833  0.99987833  1.        ]
[ 0.          0.          0.00016756 ...,  0.9991622   1.          1.        ]

In [82]:
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 9)

i = 0

plt.figure(figsize=(8,6))
ax = plt.subplot(1,1,1)
for exp in dd.exp.values:
    
    cls_name = dd[dd.exp == exp].index.values
    cls = dd[dd.exp == exp].pipeline.iloc[0]

    for train, test in cv_outer:

        y_pred = cls.fit(X_train[train],y_train[train]).predict(X_train[test])

        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = metrics.roc_curve(y_train[test], y_pred)

        tprs.append(interp(mean_fpr, fpr, tpr))  
        tprs[-1][0] = 0.0
        roc_auc = metrics.auc(fpr, tpr)    

        aucs.append(roc_auc)
        i += 1

    #Plot auc mean line and shadow mean area
    mean_auc = np.mean(aucs)
    std_auc = np.std(aucs)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    display(mean_tpr)

    ax.plot(mean_fpr, mean_tpr,
             label=r'Mean ROC %s (AUC = %0.2f $\pm$ %0.2f)' % (cls_name,mean_auc, std_auc),
             lw=2, alpha=.8)
    
#Plot guess line
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='', alpha=.8)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for 10-fold CV')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()


/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/sklearn/utils/validation.py:444: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
array([ 0.        ,  0.19993264,  0.39986528,  0.58000838,  0.66400671,
        0.74800503,  0.83200335,  0.91600168,  1.        ])
array([ 0.        ,  0.19779388,  0.39558775,  0.58176802,  0.66598912,
        0.74949184,  0.83299456,  0.91649728,  1.        ])
array([ 0.        ,  0.19707312,  0.39414624,  0.58233158,  0.66663154,
        0.74997366,  0.83331577,  0.91665789,  1.        ])
array([ 0.        ,  0.19486268,  0.38972536,  0.57730406,  0.66248184,
        0.74686138,  0.83124092,  0.91562046,  1.        ])
array([ 0.        ,  0.19199937,  0.38399873,  0.57017091,  0.65955426,
        0.74466569,  0.82977713,  0.91488856,  1.        ])
array([ 0.        ,  0.18900741,  0.37801482,  0.56216624,  0.65686229,
        0.74264672,  0.82843115,  0.91421557,  1.        ])

Classification probabilities


In [92]:
# Plot the two-class decision scores
plot_colors = "br"
plot_step = 0.02

twoclass_output = cls.predict_proba(X_test)[:,1]
plot_range = (twoclass_output.min(), twoclass_output.max())
print cls
print twoclass_output


plt.figure(figsize=(10, 5))
for i, n, c in zip(range(2), class_names, plot_colors):
    plt.hist(twoclass_output[y_test == i],
             bins=20,
             range=(0,1.0),
             facecolor=c,
             label='Class %s' % n,
             alpha=.5,
             edgecolor='k')
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Score')
plt.title('Predicted probabilities Scores')
plt.axvline(0.5,label="Default threshold",c="r",ls="--")
plt.xticks(np.arange(0,1.05,0.05), np.arange(0,1.05,0.05))
plt.xlim(0,1)
plt.tight_layout()


Pipeline(memory=None,
     steps=[('Imputer', TypeFeatImputer(allNameCols=Index([u'gender', u'age', u'race_AfricanAmerican', u'race_Caucasian',
       u'race_Other', u'HbA1c', u'Change', u'time_in_hospital', u'diabetesMed',
       u'diss_home', u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
       u'medSpec_InternalMed...imators=400, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))])
[ 0.52728229  0.45320818  0.37108323 ...,  0.44520537  0.53490864
  0.45934331]

In [96]:
if "rf" in cls_name:    
    y_prob = cls.predict_proba(X_test)
    average_precision = metrics.average_precision_score(y_test, y_prob[:,1], average="macro")
    precision, recall, _ = metrics.precision_recall_curve(y_test, y_prob[:,1])
else: 
    y_score = cls.decision_function(X_test)
    average_precision = metrics.average_precision_score(y_test, y_score, average="macro")
    precision, recall, _ = metrics.precision_recall_curve(y_test, y_score)
    

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
                 color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Average precision-recall score: {0:0.2f}'.format(average_precision))
plt.show()



In [97]:
### Classification boundaries

In [101]:
#pca
from sklearn.decomposition import PCA

print X_test.shape

X_test_trans = X_test.copy()
for s in range(len(cls.steps) - 1):
    if cls.steps[s][0] != "sm_smote":
        X_test_trans = cls.steps[s][1].transform(X_test_trans)  

X_train_trans = X_train.copy()
for s in range(len(cls.steps) - 1):
    if cls.steps[s][0] != "sm_smote":
        X_train_trans = cls.steps[s][1].transform(X_train_trans)  

pca = PCA(n_components=2)
X_r = pca.fit(X_train_trans).transform(X_test_trans)


print('explained variance ratio (first components): %s'
      % str(pca.explained_variance_ratio_))
print "explained variance ratio of 2 pcas:", np.sum(pca.explained_variance_ratio_[:2])


(20155, 41)
explained variance ratio (first components): [ 0.08093311  0.06435579]
explained variance ratio of 2 pcas: 0.145288900178

In [102]:
#Make meshgrid

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .1     # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = X_r[:, 0].min() - 2, X_r[:, 0].max() + 2
y_min, y_max = X_r[:, 1].min() - 2, X_r[:, 1].max() + 2
print x_min, x_max
print y_min, y_max

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
print xx.shape, np.min(xx), np.max(xx)
print yy.shape, np.min(yy), np.max(yy)

X_extra = np.c_[xx.ravel(), yy.ravel()]
x_min_extra, x_max_extra = X_extra[:, 0].min(), X_extra[:, 0].max()
y_min_extra, y_max_extra = X_extra[:, 1].min(), X_extra[:, 1].max()
print x_min_extra, x_max_extra
print y_min_extra, y_max_extra
print X_extra.shape

X_extra_trans = pca.inverse_transform(X_extra)
print X_extra_trans.shape


-6.2695459338 5.34219449514
-6.54248087982 8.23930802014
(148, 117) -6.2695459338 5.3304540662
(148, 117) -6.54248087982 8.15751912018
-6.2695459338 5.3304540662
-6.54248087982 8.15751912018
(17316, 2)
(17316, 41)

In [103]:
# Obtain labels for each point in mesh. Use last trained model
Z = cls.steps[-1][1].predict(X_extra_trans)
print Z.shape
print np.sum(Z==0), np.sum(Z==1)

y_pred_ts = cls.predict(X_test)
print np.sum(y_pred_ts==0), np.sum(y_pred_ts==1)


y_pred_cs = []
for d in range(len(y_pred_ts)):
    if y_pred_ts[d] == y_test[d]:
        if y_pred_ts[d] == 0:
            y_pred_cs.append("blue")
        else:
            y_pred_cs.append("g")
    else:
        y_pred_cs.append("r")

y_pred_cs = np.array(y_pred_cs)
print y_pred_cs
print "Pred errors:", np.sum(y_pred_cs=="r")


(17316,)
9191 8125
11028 9127
['r' 'blue' 'r' ..., 'r' 'r' 'blue']
Pred errors: 7758

In [104]:
plt.figure(figsize=(15,5))
ax = plt.subplot(1,3,1)

#Predicted
#ax.scatter(X_r[:, 0], X_r[:, 1], c=y_pred_cs)
ax.scatter(X_r[y_pred_cs=="g", 0], X_r[y_pred_cs=="g", 1], c=["g" for y in y_pred_ts[y_pred_cs=="g"]],label="pred. compliant", alpha=0.5, s=48)
ax.scatter(X_r[y_pred_cs=="blue", 0], X_r[y_pred_cs=="blue", 1], c=["blue" for y in y_pred_ts[y_pred_cs=="blue"]],label="pred. non-compliant", alpha=0.5, s=48)
ax.scatter(X_r[y_pred_cs=="r", 0], X_r[y_pred_cs=="r", 1], c=["r" for y in y_pred_ts[y_pred_cs=="r"]],label="pred. error", s=48)

ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_title("Combine Predicted vs Truth")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),fancybox=True, shadow=True, ncol=1)

ax = plt.subplot(1,3,2)
ax.scatter(X_r[y_pred_ts==1, 0], X_r[y_pred_ts==1, 1], c=["g" for y in y_pred_ts[y_pred_ts==1]],label="pred. compliant", alpha=0.5, s=48)
ax.scatter(X_r[y_pred_ts==0, 0], X_r[y_pred_ts==0, 1], c=["blue" for y in y_pred_ts[y_pred_ts==0]],label="pred. non-compliant", alpha=0.5, s=48)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_title("Predicted")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),fancybox=True, shadow=True, ncol=1)

ax = plt.subplot(1,3,3)
#ax.scatter(X_r[:, 0], X_r[:, 1], c=["g" if y else "blue" for y in y_test], alpha=0.3)
ax.scatter(X_r[y_test==1, 0], X_r[y_test==1, 1], c=["g" for y in y_pred_ts[y_test==1]],label="real compliant", alpha=0.5, s=48)
ax.scatter(X_r[y_test==0, 0], X_r[y_test==0, 1], c=["blue" for y in y_pred_ts[y_test==0]],label="real non-compliant", alpha=0.5, s=48)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_title("Truth")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),fancybox=True, shadow=True, ncol=1)

plt.show()



In [105]:
#### Interpretability

In [126]:
display(dd[["tr_auc","tr_sens","tr_spec",
    "cv_auc_mean","cv_f1_mean","cv_sens_mean","cv_spec_mean",
    "test_auc","test_f1","test_sens","test_spec"]].iloc[0,:])


tr_auc         0.6936
tr_sens        0.6732
tr_spec        0.7140
cv_auc_mean    0.6536
cv_f1_mean     0.6153
cv_sens_mean   0.5659
cv_spec_mean   0.6462
test_auc       0.6101
test_f1        0.6175
test_sens      0.5833
test_spec      0.6369
Name: 0, dtype: object

In [117]:
# Plot feature importance
cols = df_all.columns[:-1]

cls_int = dd.iloc[0].pipeline
clf = cls_int.steps[-1][1]
var = cls_int.steps[2][1]
var_cols = cols[var.get_support()]

print cols[var.get_support() == 1]
print len(cols)
print len(var_cols)

fs_cols = var_cols
fs_name = cls_int.steps[3][0]
if fs_name in ["lasso_fs","rfe_fs","combined_fs"]:
    fs = cls.steps[3][1]
    fs_cols = var_cols[fs.get_support()]
    print fs_cols, np.sum(fs.get_support()), len(fs_cols)
    
plt.figure(figsize=(15,5))    
if "rf" in cls_name:
    feature_importance = clf.feature_importances_
    feature_importance = (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)[::-1]
    pos = np.arange(sorted_idx.shape[0]) + .5
    
    plt.bar(np.arange(len(feature_importance)), feature_importance[sorted_idx])
    plt.xticks(np.arange(len(feature_importance))+0.4, fs_cols[sorted_idx], rotation=90)
    plt.xlabel('')
else:
    #feature_importance = clf.coef_[0,:]
    weights = clf.coef_
    weights /= weights.max()
    print weights.shape
    
    sorted_idx = np.argsort(weights[::])[::-1]
    
    plt.bar(np.arange(weights.shape[0])+0.3, weights[sorted_idx])
    plt.xticks(np.arange(weights.shape[0])+0.3,fs_cols[sorted_idx], rotation=90)

plt.axhline(0.5, c="red", ls="--")    
plt.title('')
plt.show()


Index([u'gender', u'age', u'race_AfricanAmerican', u'race_Caucasian',
       u'race_Other', u'HbA1c', u'Change', u'time_in_hospital', u'diabetesMed',
       u'diss_home', u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
       u'medSpec_InternalMedicine', u'medSpec_surgery', u'adm_src_1',
       u'adm_src_2', u'adm_src_3', u'adm_src_4', u'adm_src_5', u'adm_src_6',
       u'adm_src_7', u'adm_src_8', u'adm_src_10', u'adm_src_11', u'adm_src_13',
       u'adm_src_14', u'adm_src_22', u'adm_src_25', u'adm_1', u'adm_2',
       u'adm_3', u'adm_4', u'adm_7', u'number_treatment',
       u'num_lab_procedures', u'num_procedures', u'num_medications',
       u'number_outpatient', u'number_emergency', u'number_inpatient',
       u'number_diagnoses'],
      dtype='object')
41
41

In [118]:
#### Feature stability

In [137]:
k = 0
stab = 100
stab_thr = 0.4
stabArr = dict()
cv_int = cross_validation.StratifiedShuffleSplit(y_train, n_iter=stab, test_size=0.30, random_state=100)
for train_ix, test_ix in cv_int:    
    cls_int.fit(X_train[train_ix],y_train[train_ix])
    
    clf = cls_int.steps[-1][1]
    var = cls_int.steps[2][1]
    var_cols = cols[var.get_support()]

    fs_cols = var_cols
    fs_name = cls_int.steps[3][0]
    if fs_name in ["lasso_fs","rfe_fs","combined_fs"]:
        fs = cls.steps[3][1]
        fs_cols = var_cols[fs.get_support()]
    
    if "rf" in cls_name:
        feature_importance = np.abs(clf.feature_importances_)
        weights = (feature_importance / float(feature_importance.max()))
    else:
        weights = np.abs(clf.coef_)
        weights /= float(weights.max())
        weights = weights[0,:]

    for i,w in enumerate(weights):
        if w >= stab_thr:
            col = fs_cols[i]
            if col in stabArr.keys():
                stabArr[col].append(w)
            else:
                stabArr[col] = [w]
    if k % 10 == 0:
        print "Num epochs:", k
    k += 1

dfStab = []        
for j in stabArr.keys():
    k = len(stabArr[j])/100.0
    mean = np.mean(stabArr[j])
    std = np.std(stabArr[j])
    max = np.max(stabArr[j])
    min = np.min(stabArr[j])
    dfStab.append([j, k, mean,std,max,min])

dfStab = pd.DataFrame(dfStab, columns = ["column", "kont", "mean", "std", "max", "min"]) 
dfStab.sort_values(["kont","mean"], ascending = False, inplace=True)


Num epochs: 0
Num epochs: 10
Num epochs: 20
Num epochs: 30
Num epochs: 40
Num epochs: 50
Num epochs: 60
Num epochs: 70
Num epochs: 80
Num epochs: 90

In [138]:
#Plot stability
plt.figure(figsize=(15,5))
ax = plt.subplot(1,1,1)
ax.bar(np.arange(dfStab.shape[0]), dfStab["kont"], alpha=0.5, label="stability")
ax.plot(np.arange(dfStab.shape[0])+0.3, dfStab["mean"],c="orange", marker='o', label="weights_mean")
ax.fill_between(np.arange(dfStab.shape[0])+0.3, 
                dfStab["mean"] - dfStab["std"],
                dfStab["mean"] + dfStab["std"],
                alpha=0.6, color="orange")
    
plt.xticks(np.arange(dfStab.shape[0])+0.3,dfStab["column"],rotation=90)
plt.ylim(0.0,1.05)
plt.legend()
plt.show()

#Stability table
print dfStab.shape
print dfStab[dfStab["kont"]>=0.4].shape
display(dfStab[dfStab["kont"]>=0.4])
display(dfStab)


(10, 6)
(10, 6)
column kont mean std max min
5 num_lab_procedures 1.0000 0.9985 0.0060 1.0000 0.9580
8 number_inpatient 1.0000 0.9492 0.0330 1.0000 0.8524
0 num_medications 1.0000 0.8988 0.0171 0.9431 0.8627
3 number_diagnoses 1.0000 0.7378 0.0242 0.8099 0.6835
1 age 1.0000 0.6919 0.0177 0.7341 0.6500
2 time_in_hospital 1.0000 0.6111 0.0110 0.6499 0.5768
7 number_treatment 1.0000 0.4431 0.0083 0.4620 0.4266
9 num_procedures 1.0000 0.4311 0.0080 0.4531 0.4132
4 HbA1c 0.8100 0.4225 0.0127 0.4551 0.4020
6 number_outpatient 0.5100 0.4165 0.0133 0.4563 0.4004
column kont mean std max min
5 num_lab_procedures 1.0000 0.9985 0.0060 1.0000 0.9580
8 number_inpatient 1.0000 0.9492 0.0330 1.0000 0.8524
0 num_medications 1.0000 0.8988 0.0171 0.9431 0.8627
3 number_diagnoses 1.0000 0.7378 0.0242 0.8099 0.6835
1 age 1.0000 0.6919 0.0177 0.7341 0.6500
2 time_in_hospital 1.0000 0.6111 0.0110 0.6499 0.5768
7 number_treatment 1.0000 0.4431 0.0083 0.4620 0.4266
9 num_procedures 1.0000 0.4311 0.0080 0.4531 0.4132
4 HbA1c 0.8100 0.4225 0.0127 0.4551 0.4020
6 number_outpatient 0.5100 0.4165 0.0133 0.4563 0.4004

Learning curve


In [133]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), metric="f1"):

    plt.figure(figsize=(8,6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("% Training set")
    plt.ylabel(metric + "-score")
    train_sizes_lc, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,scoring="recall")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="b")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="b",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")    
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.axhline(0.5,color='r',ls='--', label="random")
    
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    return plt

In [134]:
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
title = ""
plot_learning_curve(cls, title, X_train, y_train, ylim=(0.4, 1.01), 
                    cv=cv_outer,
                    train_sizes=[0.2,0.4,0.6,0.8,1.0], 
                    n_jobs=-1, metric="f1")

plt.show()



In [ ]: