In [153]:
# Reset the namespace before re-running the notebook
%reset -f

In [208]:
from sklearn import preprocessing
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

from operator import truediv
import pandas as pd
import time
import os

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt


np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')

%matplotlib inline

import sys
sys.path.insert(1, "../../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter

In [209]:
typeEncounter = "first" # ['first','last']
typeHypothesis = "all_readmission_vs_none" # ['all_readmission_vs_none','early_readmission_vs_none']
typeDataFeatures = "extended" # ['minimum','extended']

In [210]:
if typeDataFeatures == "minimum":
    df_all=pd.read_pickle(os.path.join('resources','clean_data_' + typeEncounter + '_hyp_1.pkl'))
if typeDataFeatures == "extended":
    df_all=pd.read_pickle(os.path.join('resources','clean_data_' + typeEncounter + '_hyp_1_' + typeDataFeatures + '.pkl'))
    
print df_all.shape
print df_all.columns
print df_all.readmitted.value_counts()
print df_all.readmitted.value_counts()/float(df_all.shape[0])


(71518, 28)
Index([u'diss_1', u'race_AfricanAmerican', u'race_Caucasian', u'race_Other',
       u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
       u'medSpec_InternalMedicine', u'medSpec_surgery', u'age_cat',
       u'Diabetis', u'Circulatory', u'Digestive', u'Genitourinary',
       u'Poisoning', u'Muscoskeletal', u'Neoplasms', u'Respiratory', u'HbA1c',
       u'Change', u'time_in_hospital', u'num_lab_procedures',
       u'num_procedures', u'num_medications', u'number_outpatient',
       u'number_emergency', u'number_inpatient', u'number_diagnoses',
       u'readmitted'],
      dtype='object')
0    54374
2    13920
1     3224
Name: readmitted, dtype: int64
0   0.76
2   0.19
1   0.05
Name: readmitted, dtype: float64

Compute class label


In [182]:
print df_all.loc[:,"readmitted"].sort_values().unique(), \
    np.sum(df_all["readmitted"] == 0), \
    np.sum(df_all["readmitted"] == 1), \
    np.sum(df_all["readmitted"] == 2)


[0 1 2] 54374 3224 13920

In [183]:
# Readmitted: none vs any readmission
if typeHypothesis == "all_readmission_vs_none":
    # Recode with .loc and a boolean mask; the original chained assignment
    # (df_all["readmitted"][mask] = 1) triggers pandas' SettingWithCopyWarning.
    df_all.loc[df_all["readmitted"] > 0, "readmitted"] = 1
    print df_all.iloc[:,-1].sort_values().unique(), \
            np.sum(df_all["readmitted"] == 0), \
            np.sum(df_all["readmitted"] == 1)

# Readmitted: none vs early readmission
if typeHypothesis == "early_readmission_vs_none":
    df_all = df_all[df_all["readmitted"].isin([0,1])]
    print df_all.iloc[:,-1].sort_values().unique(), \
            np.sum(df_all["readmitted"] == 0), \
            np.sum(df_all["readmitted"] == 1)


[0 1] 54374 17144

Compute type fields


In [184]:
if typeDataFeatures == "minimum":
    numCols = ['time_in_hospital']
    
if typeDataFeatures == "extended":
    numCols = ['time_in_hospital','num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 
            'number_emergency', 'number_inpatient', 'number_diagnoses']
    
catCols = []
cols = df_all.columns
reducedCols = cols[:-1]

for i in range(len(cols)-1):
    if cols[i] not in numCols:
        catCols.append(1)
    else:
        catCols.append(0)
catCols = np.array(catCols)

print "Cat cols:", np.sum(catCols==1), "\n", reducedCols[catCols==1]
print "Num cols:", np.sum(catCols==0), "\n", reducedCols[catCols==0]
print len(reducedCols)


Cat cols: 19 
Index([u'diss_1', u'race_AfricanAmerican', u'race_Caucasian', u'race_Other',
       u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
       u'medSpec_InternalMedicine', u'medSpec_surgery', u'age_cat',
       u'Diabetis', u'Circulatory', u'Digestive', u'Genitourinary',
       u'Poisoning', u'Muscoskeletal', u'Neoplasms', u'Respiratory', u'HbA1c',
       u'Change'],
      dtype='object')
Num cols: 8 
Index([u'time_in_hospital', u'num_lab_procedures', u'num_procedures',
       u'num_medications', u'number_outpatient', u'number_emergency',
       u'number_inpatient', u'number_diagnoses'],
      dtype='object')
27

Compute partition (train, test)


In [185]:
y = df_all.readmitted
print y.unique()
print y.value_counts()
y = y.values

X = df_all.iloc[:,:-1].values
sss = StratifiedShuffleSplit(y, 1, test_size=0.30, random_state=32) #random_state=42
for train_index, test_index in sss:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

print
print X_train.shape, y_train.shape
print np.sum(y_train == 0), round(np.sum(y_train == 0)/float(y_train.shape[0]),2), \
      np.sum(y_train > 0), round(np.sum(y_train > 0)/float(y_train.shape[0]),2)
print X_test.shape, y_test.shape
print np.sum(y_test == 0), round(np.sum(y_test == 0)/float(y_test.shape[0]),2), \
      np.sum(y_test > 0), round(np.sum(y_test > 0)/float(y_test.shape[0]),2)


[1 0]
0    54374
1    17144
Name: readmitted, dtype: int64

(50062, 27) (50062,)
38061 0.76 12001 0.24
(21456, 27) (21456,)
16313 0.76 5143 0.24

Simple pipeline


In [200]:
basePipeline = Pipeline([
        ("Imputer", TypeFeatImputer(catCols, reducedCols)),
        ("Variance", VarianceThreshold(threshold=(.995 * (1 - .995)))),
        ("Scaler", StandardScaler())
    ])

params = {}
pipe = Pipeline(list(basePipeline.steps))
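
The `.995 * (1 - .995)` threshold follows the Bernoulli variance rule p(1-p): a binary column in which one value occurs in more than 99.5% of samples falls below it and is removed. A minimal sketch on toy data (values hypothetical) showing which columns survive:

In [ ]:
# Toy illustration of the variance threshold used above: a near-constant
# binary column (99.8% zeros, variance ~0.002) falls below .995*(1-.995)
# and is dropped, while a balanced column (variance 0.25) is kept.
X_toy = np.zeros((1000, 2))
X_toy[:500, 0] = 1   # balanced -> variance 0.25, kept
X_toy[:2, 1] = 1     # near-constant -> variance ~0.002, dropped
vt = VarianceThreshold(threshold=.995 * (1 - .995))
vt.fit(X_toy)
print vt.get_support()   # expected: [ True False]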

In [107]:
fs_method = "combine_fs"
pipe.steps.insert(1,(fs_method, UnivCombineFilter(catCols,np.array(reducedCols))))
params.update({fs_method + '__percentile':[30,50,70,80]})

In [201]:
cls_method = "logReg"
pipe.steps.append((cls_method, LogisticRegression(random_state=42)))
params.update({cls_method + '__C': [1e-10,1e-8,1e-6,1e-5,0.001,0.01,0.1,1,5]})
params.update({cls_method + '__class_weight': [None, 'balanced']})
params.update({cls_method + '__penalty': ["l1","l2"]})

In [194]:
cls_method = "rf"
pipe.steps.append((cls_method, RandomForestClassifier(n_jobs=-1,random_state=42,class_weight="balanced")))
params.update({cls_method + '__n_estimators': [150,300,500,700,1000], 
               cls_method + '__criterion': ['gini','entropy'],
               cls_method + '__max_depth' : [None,4,6]})

In [302]:
cls_method = "knn"
pipe.steps.append((cls_method, KNeighborsClassifier(n_jobs=-1)))

params.update({cls_method + '__n_neighbors': [3,5,7,9], 
               cls_method + '__weights': ['uniform', 'distance']})

In [419]:
cls_method = "svm"
pipe.steps.append((cls_method, SVC(kernel = "rbf", random_state=42,probability=True)))
params.update({cls_method + '__C': [0.01,0.1,0.5,1,5,10,15,30,50], 
               cls_method + '__gamma' : [0.0001,0.001,0.01, 0.1,1,5],
               cls_method + '__class_weight': [None, 'balanced']})

In [90]:
cls_method = "nb"
pipe.steps.append((cls_method, GaussianNB()))
# alpha applies to Bernoulli/Multinomial NB, not GaussianNB, so no grid here
#params.update({cls_method + '__alpha': [1e-3,0.001,0.01,0.1,0.5,1,5]})

In [202]:
# Post-process: rebuild the steps as a fresh imblearn Pipeline so that a
# sampling step can be inserted before the classifier in the next cell.
pipe_imb = Pipeline(list(pipe.steps))
stps = len(pipe_imb.steps)

In [203]:
#Add sampling
sm_method = "smote"                
pipe_imb.steps.insert(stps - 1, 
                      (sm_method, SMOTE(ratio='auto', kind='regular', random_state=32)))
params.update({sm_method + "__k_neighbors":[3,5,7]})
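
As an optional sanity check outside the pipeline, SMOTE can be applied directly to the training set to inspect the resampled class balance. A sketch assuming the older imblearn API (`fit_sample`; later releases renamed it `fit_resample`), with the imputer run first since SMOTE cannot handle missing values:

In [ ]:
# Impute, then oversample the minority class and compare class counts.
X_imp = TypeFeatImputer(catCols, reducedCols).fit(X_train, y_train).transform(X_train)
X_res, y_res = SMOTE(ratio='auto', kind='regular', random_state=32).fit_sample(X_imp, y_train)
print "Before:", np.bincount(y_train.astype(int))
print "After: ", np.bincount(y_res.astype(int))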

Pipeline setup


In [204]:
verbose = False
mtrs = ["roc_auc","recall"] #"f1_weighted","recall","precision","f1_macro"
cv_thr = 0.3
cv_folds = 5

print pipe_imb.steps


[('Imputer', TypeFeatImputer(allNameCols=Index([u'diss_1', u'race_AfricanAmerican', u'race_Caucasian', u'race_Other',
       u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
       u'medSpec_InternalMedicine', u'medSpec_surgery', u'age_cat',
       u'Diabetis', u'Circulatory', u'Digestive', u'Genitourinary',
       u'Poiso...tient',
       u'number_emergency', u'number_inpatient', u'number_diagnoses'],
      dtype='object'),
        dataCatCols=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0]))), ('Variance', VarianceThreshold(threshold=0.004975)), ('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('smote', SMOTE(k=None, k_neighbors=5, kind='regular', m=None, m_neighbors=10, n_jobs=1,
   out_step=0.5, random_state=32, ratio='auto', svm_estimator=None)), ('logReg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]
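
Before launching the search, the number of candidate settings can be verified with ParameterGrid (imported above). Assuming only the logistic-regression cell was run, the grid is 9 C values x 2 class weights x 2 penalties x 3 SMOTE k_neighbors = 108 candidates, matching the fit log below.

In [ ]:
# Sanity check: size of the hyperparameter grid explored by GridSearchCV.
print len(ParameterGrid(params))   # expected: 108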

Run pipeline


In [205]:
print "ALL TRAIN:", X_train.shape
print "TRAIN:", "[0's:", np.sum(y_train==0), "1's:", np.sum(y_train==1), "]"
print "ALL TEST:", X_test.shape
print "TEST:", "[0's:", np.sum(y_test==0), "1's:", np.sum(y_test==1), "]"
print "TEST:", "[0's:", np.sum(y_test==0)/float(y_test.shape[0]), "1's:", np.sum(y_test==1)/float(y_test.shape[0]), "]"

# Run experiment
start = time.time()

#Prepare pipe_cls      
pipeline_cls = pipe_imb
pipeline_params = params

if verbose:
    print "\n",pipeline_cls.steps


#Prepare cv
cv_inner = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr,random_state=24)

print "\nCV TRAIN:", cv_inner.n_train
print "CV_TEST:", cv_inner.n_test

#Fit pipeline with CV                        
grid_pipelines = []

for m in mtrs:
    grid_pipeline = GridSearchCV(pipeline_cls, param_grid=pipeline_params, verbose=1, 
                                 n_jobs=-1, cv=cv_inner, scoring= m, error_score = 0,
                                 refit=True) 
    grid_pipeline.fit(X_train, y_train)
    grid_pipelines.append([m,grid_pipeline])       

end = time.time()
print "Total time:", end - start


ALL TRAIN: (50062, 27)
TRAIN: [0's: 38061 1's: 12001 ]
ALL TEST: (21456, 27)
TEST: [0's: 16313 1's: 5143 ]
TEST: [0's: 0.760300149142 1's: 0.239699850858 ]

CV TRAIN: 35043
CV_TEST: 15019
Fitting 5 folds for each of 108 candidates, totalling 540 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 10.0min finished
Fitting 5 folds for each of 108 candidates, totalling 540 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  9.9min finished
Total time: 1213.21440482

In [206]:
for gp in grid_pipelines:
    print
    print gp[0]
    print "*******\n"
    # Steps 0-2 are Imputer/Variance/Scaler; show the tuned sampler and classifier params
    print [gp[1].best_estimator_.steps[i][1].get_params() for i in range(3,len(gp[1].best_estimator_.steps))]
    print
    
    if gp[1].best_estimator_.steps[1][0] == "combine_fs":
        varFilter = gp[1].best_estimator_.steps[1][1]

        print "Selected thr:", varFilter.percentile
        print "Selected columns:"
        feats = reducedCols[varFilter.ixCols].tolist()
        print feats
        print "Num useful features:", len(feats), feats
    
    if gp[1].best_estimator_.steps[1][0] == "Variance":
        varFilter = gp[1].best_estimator_.steps[1][1]
        feats = reducedCols[varFilter.get_support() == False]
        print "Discarded feats:", len(feats), feats


roc_auc
*******

[{'kind': 'regular', 'n_jobs': 1, 'ratio': 'auto', 'k': None, 'm': None, 'out_step': 0.5, 'svm_estimator': None, 'random_state': 32, 'k_neighbors': 5, 'm_neighbors': 10}, {'warm_start': False, 'C': 0.01, 'n_jobs': 1, 'verbose': 0, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l1', 'multi_class': 'ovr', 'random_state': 42, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}]

Discarded feats: 0 Index([], dtype='object')

recall
*******

[{'kind': 'regular', 'n_jobs': 1, 'ratio': 'auto', 'k': None, 'm': None, 'out_step': 0.5, 'svm_estimator': None, 'random_state': 32, 'k_neighbors': 7, 'm_neighbors': 10}, {'warm_start': False, 'C': 1e-10, 'n_jobs': 1, 'verbose': 0, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': 42, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}]

Discarded feats: 0 Index([], dtype='object')

In [207]:
# Compute train score (with best CV params)
for gp in grid_pipelines:
    
    cls = gp[1]
    y_pred = cls.predict(X_train)                
    train_prec_scores = metrics.precision_score(y_train, y_pred, average='weighted', pos_label=1)
    train_rec_scores = metrics.recall_score(y_train, y_pred, average='weighted', pos_label=1)
    train_f1_scores = metrics.f1_score(y_train, y_pred, average='weighted', pos_label=1)
    train_auc_scores = metrics.roc_auc_score(y_train, y_pred, average='weighted')

    print "\nTRAIN: "
    print "**********\n"

    print "Metric:",gp[0]
    print "TR Prec score:", train_prec_scores
    print "TR Rec score:", train_rec_scores
    print "\nTR F1 score:", train_f1_scores
    print "TR AUC score:", train_auc_scores


TRAIN: 
**********

Metric: roc_auc
TR Prec score: 0.709784701273
TR Rec score: 0.616755223523

TR F1 score: 0.643659768689
TR AUC score: 0.607949714112

TRAIN: 
**********

Metric: recall
TR Prec score: 0.706460983055
TR Rec score: 0.576145579481

TR F1 score: 0.60706440651
TR AUC score: 0.598215786958
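
Note that the AUC above is computed from hard 0/1 predictions, which understates what the classifier achieves when scored on probabilities (as GridSearchCV's roc_auc scorer does). A quick sketch using predict_proba, which all classifiers fitted here expose:

In [ ]:
# Train AUC from predicted probabilities rather than hard labels.
for gp in grid_pipelines:
    y_scores = gp[1].best_estimator_.predict_proba(X_train)[:, 1]
    print gp[0], "TR AUC (probabilities):", metrics.roc_auc_score(y_train, y_scores)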

In [124]:
# Compute pipeline evaluation with CV
for gp in grid_pipelines:
    
    cls = gp[1]
    print "\nCV: "
    print "******\n"
    
    print "Metric:", gp[0]
    print "CV selected params {}".format(cls.best_params_.values())
    cv_inner_f1 = cross_val_score(cls.best_estimator_, X_train, y_train, 
                                             cv=cv_inner, scoring='f1_weighted', n_jobs=-1)
    print "CV {} score: {}".format(gp[0], cls.best_score_)
    print "CV f1 score: %0.3f  (+/-%0.03f)" % (np.mean(cv_inner_f1), np.std(cv_inner_f1))


CV: 
******

Metric: roc_auc
CV selected params ['gini', 6, 700]
CV roc_auc score: 0.617359352435
CV f1 score: 0.671  (+/-0.002)

CV: 
******

Metric: recall
CV selected params ['gini', 4, 150]
CV recall score: 0.576853252648
CV f1 score: 0.654  (+/-0.003)

In [76]:
# Collect CV results: one row (params, mean score, std, fold scores) per setting
for gp in grid_pipelines:
    
    cls = gp[1]   
    params =  np.array([str(d.values()) for d in np.array(cls.grid_scores_[:])[:,0]])
    mean = np.array(cls.grid_scores_)[:,1]
    values = np.array(cls.grid_scores_)[:,2]
    std = np.array([np.std(v) for v in values])
    
    dd = np.hstack((params.reshape(-1,1), mean.reshape(-1,1), std.reshape(-1,1), values.reshape(-1,1)))
    

    res = pd.DataFrame(dd,columns=["params","score_mean","score_std","scores"])
    print gp[0]
    print res


f1_weighted
                 params score_mean score_std  \
0   ['gini', None, 150]       0.74      0.00   
1   ['gini', None, 300]       0.74      0.00   
2   ['gini', None, 500]       0.74      0.00   
3   ['gini', None, 700]       0.74      0.00   
4      ['gini', 4, 150]       0.66      0.00   
5      ['gini', 4, 300]       0.66      0.00   
6      ['gini', 4, 500]       0.66      0.00   
7      ['gini', 4, 700]       0.66      0.00   
8      ['gini', 6, 150]       0.67      0.00   
9      ['gini', 6, 300]       0.67      0.00   
10     ['gini', 6, 500]       0.67      0.00   
11     ['gini', 6, 700]       0.67      0.00   
12     ['gini', 8, 150]       0.68      0.00   
13     ['gini', 8, 300]       0.68      0.00   
14     ['gini', 8, 500]       0.68      0.00   
15     ['gini', 8, 700]       0.68      0.00   
16    ['gini', 10, 150]       0.71      0.00   
17    ['gini', 10, 300]       0.71      0.00   
18    ['gini', 10, 500]       0.71      0.00   
19    ['gini', 10, 700]       0.71      0.00   

                                               scores  
0   [0.744569531767, 0.734633676937, 0.74545493196...  
1   [0.742389739632, 0.739061339686, 0.74776026621...  
2   [0.742426355894, 0.734922506482, 0.74742355402...  
3   [0.742630797509, 0.737628699479, 0.74718252543...  
4   [0.660396030411, 0.657572229078, 0.66152250041...  
5   [0.662239781819, 0.657004963191, 0.66231982013...  
6   [0.659597093711, 0.656432651474, 0.66303601201...  
7   [0.658148368032, 0.654564787537, 0.66023692921...  
8   [0.668355775528, 0.660724935929, 0.66455463563...  
9   [0.665741048386, 0.664586390777, 0.66805081825...  
10  [0.66541671466, 0.663864565302, 0.667265021757...  
11  [0.663911584171, 0.664426561578, 0.66717693106...  
12  [0.680911229647, 0.68573704344, 0.680743502935...  
13  [0.682071114652, 0.684618435987, 0.68240790110...  
14  [0.679368055588, 0.688535279647, 0.68307531330...  
15  [0.68026504623, 0.689396483153, 0.683993905354...  
16  [0.699977849179, 0.708235094923, 0.70761014765...  
17  [0.699922060787, 0.706772011298, 0.70787798292...  
18  [0.699275917016, 0.709079443396, 0.70876920486...  
19  [0.699365822195, 0.708555609394, 0.70899709661...  
recall
                 params score_mean score_std  \
0   ['gini', None, 150]       0.31      0.01   
1   ['gini', None, 300]       0.31      0.01   
2   ['gini', None, 500]       0.31      0.01   
3   ['gini', None, 700]       0.31      0.01   
4      ['gini', 4, 150]       0.56      0.01   
5      ['gini', 4, 300]       0.56      0.01   
6      ['gini', 4, 500]       0.56      0.01   
7      ['gini', 4, 700]       0.56      0.01   
8      ['gini', 6, 150]       0.56      0.01   
9      ['gini', 6, 300]       0.56      0.01   
10     ['gini', 6, 500]       0.56      0.01   
11     ['gini', 6, 700]       0.56      0.01   
12     ['gini', 8, 150]       0.52      0.01   
13     ['gini', 8, 300]       0.52      0.01   
14     ['gini', 8, 500]       0.52      0.00   
15     ['gini', 8, 700]       0.52      0.01   
16    ['gini', 10, 150]       0.45      0.01   
17    ['gini', 10, 300]       0.45      0.01   
18    ['gini', 10, 500]       0.45      0.01   
19    ['gini', 10, 700]       0.45      0.01   

                                               scores  
0   [0.307110438729, 0.309379727685, 0.31013615733...  
1   [0.298033282905, 0.298033282905, 0.30711043872...  
2   [0.301815431165, 0.304841149773, 0.30862329803...  
3   [0.303328290469, 0.301059001513, 0.30635400907...  
4   [0.555219364599, 0.580181543116, 0.55900151285...  
5   [0.555975794251, 0.580181543116, 0.55597579425...  
6   [0.561270801815, 0.583207261725, 0.55370650529...  
7   [0.56278366112, 0.582450832073, 0.556732223903...  
8   [0.548411497731, 0.581694402421, 0.55446293494...  
9   [0.550680786687, 0.575642965204, 0.55219364599...  
10  [0.548411497731, 0.576399394856, 0.55521936459...  
11  [0.552193645991, 0.576399394856, 0.55219364599...  
12  [0.522692889561, 0.525718608169, 0.52118003025...  
13  [0.513615733737, 0.530257186082, 0.51664145234...  
14  [0.514372163389, 0.520423600605, 0.51361573373...  
15  [0.510590015129, 0.521936459909, 0.51285930408...  
16  [0.456883509834, 0.447049924357, 0.43494704992...  
17  [0.462178517398, 0.444024205749, 0.43040847201...  
18  [0.463691376702, 0.441754916793, 0.44099848714...  
19  [0.468986384266, 0.444780635401, 0.44478063540...  
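
To pick out the strongest settings from the table above, the results frame can be sorted by mean score (a quick sketch; after the loop, `res` holds the last metric's table):

In [ ]:
# Top parameter settings for the last metric, by mean CV score.
print res.sort_values("score_mean", ascending=False).head()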

In [125]:
#Compute test score
for gp in grid_pipelines:
    
    cls = gp[1]
    y_pred =cls.predict(X_test)
    test_f1 = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)

    print "\n",gp[0],":"
    print "**********\n"
    print "Test f1: %0.3f" % (test_f1)
    print "with following performance in test:"
    print metrics.classification_report(y_test, y_pred)
    cm = metrics.confusion_matrix(y_test, y_pred)
    print "\nConfusion matrix:"
    print cm

    print "\nAccuracy:", (cm[0,0] + cm[1,1])/ float(cm[0,0] + cm[1,1]+cm[0,1] + cm[1,0])
    print "Sensitivity:", cm[1,1] / float(cm[1,1] + cm[1,0]) #Reduce FN (recall)
    print "Specificity:", cm[0,0] / float(cm[0,0] + cm[0,1]) #Reduce FP

    y_probs = cls.best_estimator_.predict_proba(X_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_probs[:,1], pos_label=1)
    print "AUC:", metrics.auc(fpr, tpr)


roc_auc :
**********

Test f1: 0.673
with following performance in test:
             precision    recall  f1-score   support

          0       0.91      0.61      0.73     12896
          1       0.18      0.57      0.27      1888

avg / total       0.81      0.61      0.67     14784


Confusion matrix:
[[7918 4978]
 [ 811 1077]]

Accuracy: 0.608428030303
Sensitivity: 0.570444915254
Specificity: 0.613988833747
AUC: 0.627638961266

recall :
**********

Test f1: 0.662
with following performance in test:
             precision    recall  f1-score   support

          0       0.91      0.59      0.72     12896
          1       0.18      0.59      0.27      1888

avg / total       0.81      0.59      0.66     14784


Confusion matrix:
[[7673 5223]
 [ 779 1109]]

Accuracy: 0.594020562771
Sensitivity: 0.587394067797
Specificity: 0.594990694789
AUC: 0.622913679383
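
The fpr/tpr arrays computed above can also be plotted. A minimal sketch of the test-set ROC curves for both tuned pipelines:

In [ ]:
# Test-set ROC curves for both tuned models (reusing predict_proba).
plt.figure(figsize=(6, 6))
for gp in grid_pipelines:
    y_probs = gp[1].best_estimator_.predict_proba(X_test)
    fpr, tpr, _ = metrics.roc_curve(y_test, y_probs[:, 1], pos_label=1)
    plt.plot(fpr, tpr, label="%s (AUC=%0.3f)" % (gp[0], metrics.auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], 'r--', label="random")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()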

Learning curve


In [149]:
cls = grid_pipelines[1][1]
print cls


GridSearchCV(cv=StratifiedShuffleSplit(labels=[0 1 ..., 0 0], n_iter=10, test_size=0.2, random_state=24),
       error_score=0,
       estimator=Pipeline(steps=[('Imputer', TypeFeatImputer(allNameCols=Index([u'gender', u'age', u'time_in_hospital', u'num_lab_procedures',
       u'num_procedures', u'num_medications', u'number_outpatient',
       u'number_emergency', u'number_inpatient', u'number_diagnoses',
       u'metformin', u'repaglinide', u'glimep...alty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'logReg__class_weight': [None, 'balanced'], 'logReg__C': [0.001, 1, 10], 'logReg__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='recall', verbose=1)

In [145]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):

    plt.figure(figsize=(8,6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("% Training set")
    plt.ylabel("F1-score")
    # learning_curve also returns the absolute train sizes; plotting below uses
    # the requested fractions to match the "% Training set" axis label.
    train_sizes_abs, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring="f1_weighted")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="b",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.axhline(0.5,color='r',ls='--', label="random")
    
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    return plt

In [148]:
# Learning curve using the inner CV splitter defined above
# (5 stratified iterations, 30% of the training data held out each time).
title = ""
plot_learning_curve(cls.best_estimator_, title, X_train, y_train, ylim=(0.4, 1.01), 
                    cv=cv_inner,
                    train_sizes=[0.05,0.10,0.15,0.25,0.50,0.75,0.80,1.0], 
                    n_jobs=-1)

plt.show()


/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
(warning repeated for several CV folds at the smallest training sizes)

In [ ]: