In [1]:
#TO RE-RUN
%reset -f

In [6]:
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

from operator import truediv
from sklearn import metrics
import pandas as pd
import time
import os

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt


np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')

%matplotlib inline

import sys
sys.path.insert(1, "../../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter

In [7]:
#df_all=pd.read_csv(os.path.join('resources','diabetic_data_processed_withweight.csv'),';')
# Load the pre-cleaned diabetes readmission dataset and report its shape,
# columns and the distribution of the 3-class target `readmitted`
# (classes 1 and 2 are collapsed into a single positive class in a later cell).
df_all=pd.read_pickle(os.path.join('resources','clean_data.pkl'))
print df_all.shape
print df_all.columns
print df_all.readmitted.value_counts()
print df_all.readmitted.value_counts()/float(df_all.shape[0])


(71518, 20)
Index([u'diss_1', u'race_AfricanAmerican', u'race_Caucasian', u'race_Other',
       u'medSpec_cardio', u'medSpec_Family/GeneralPractice',
       u'medSpec_InternalMedicine', u'medSpec_surgery', u'age_cat',
       u'Diabetis', u'Circulatory', u'Digestive', u'Genitourinary',
       u'Poisoning', u'Muscoskeletal', u'Neoplasms', u'Respiratory', u'HbA1c',
       u'Change', u'readmitted'],
      dtype='object')
0    42985
2    22240
1     6293
Name: readmitted, dtype: int64
0   0.60
2   0.31
1   0.09
Name: readmitted, dtype: float64

Compute class label


In [8]:
# Readmitted
print df_all.loc[:,"readmitted"].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1), np.sum(df_all["readmitted"] == 2)
df_all["readmitted"][df_all["readmitted"].values > 0] = 1
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)


[0 1 2] 42985 6293 22240
[0 1] 42985 28533
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until

Compute type fields


In [98]:
numCols = [
    "time_in_hospital","num_lab_procedures","num_procedures","num_medications",
    "number_outpatient","number_emergency","number_inpatient","number_diagnoses"
]

catCols = []
cols = df_all.columns
reducedCols = cols[:-1]

for i in range(len(cols)-1):
    if cols[i] not in numCols:
        catCols.append(1)
    else:
        catCols.append(0)
catCols = np.array(catCols)

print "Cat cols:", np.sum(catCols==1), "\n", reducedCols[catCols==1]
print "Num cols:", np.sum(catCols==0), "\n", reducedCols[catCols==0]
print len(reducedCols)


Cat cols: 66 
Index([u'gender', u'age', u'metformin', u'repaglinide', u'nateglinide',
       u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
       u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
       u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
       u'citoglipton', u'insulin', u'glyburide-metformin',
       u'glipizide-metformin', u'glimepiride-pioglitazone',
       u'metformin-rosiglitazone', u'metformin-pioglitazone', u'Change',
       u'diabetesMed', u'Diabetis', u'Infectious and parasitic diseases',
       u'Neoplasms', u'Endocrine', u'Blood', u'Mental', u'Nervous', u'Organs',
       u'Circulatory', u'Respiratory', u'Digestive', u'Genitourinary',
       u'Pregnancy', u'Skin', u'Muscoskeletal', u'Congenital', u'Perinatal',
       u'Ill-defined', u'race_AfricanAmerican', u'race_Asian',
       u'race_Caucasian', u'race_Hispanic', u'race_Other', u'adm_1', u'adm_2',
       u'adm_3', u'adm_4', u'adm_7', u'adm_src_1', u'adm_src_2', u'adm_src_3',
       u'adm_src_4', u'adm_src_5', u'adm_src_6', u'adm_src_7', u'adm_src_8',
       u'adm_src_10', u'Poisoning', u'External_causes'],
      dtype='object')
Num cols: 8 
Index([u'time_in_hospital', u'num_lab_procedures', u'num_procedures',
       u'num_medications', u'number_outpatient', u'number_emergency',
       u'number_inpatient', u'number_diagnoses'],
      dtype='object')
74

Compute partition (train, test)


In [99]:
# Build the binary target vector and feature matrix, then make a single
# stratified 70/30 train/test split.  Old sklearn.cross_validation API:
# the labels are passed to StratifiedShuffleSplit at construction time.
y = df_all.readmitted
print y.unique()
print y.value_counts()
y = y.values

X = df_all.iloc[:,:-1].values  # every column except the target
sss = StratifiedShuffleSplit(y, 1, test_size=0.30, random_state=32) #random_state=42
for train_index, test_index in sss:  # only one iteration requested above
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Report the size and class balance of each partition.
print
print X_train.shape, y_train.shape
print np.sum(y_train == 0), round(np.sum(y_train == 0)/float(y_train.shape[0]),2), \
      np.sum(y_train > 0), round(np.sum(y_train > 0)/float(y_train.shape[0]),2)
print X_test.shape, y_test.shape
print np.sum(y_test == 0), round(np.sum(y_test == 0)/float(y_test.shape[0]),2), \
      np.sum(y_test > 0), round(np.sum(y_test > 0)/float(y_test.shape[0]),2)


[0 1]
0    17030
1    10775
Name: readmitted, dtype: int64

(19463, 74) (19463,)
11921 0.61 7542 0.39
(8342, 74) (8342,)
5109 0.61 3233 0.39

Simple pipeline


In [146]:
# Base preprocessing pipeline shared by every classifier:
#  - TypeFeatImputer: project-local imputer (see ../../src/); presumably
#    imputes numeric vs categorical features according to catCols -- confirm
#    against its source.
#  - VarianceThreshold: drops near-constant columns (threshold is the
#    variance of a 99.5%/0.5% Bernoulli feature).
#  - StandardScaler: zero-mean / unit-variance scaling.
basePipeline = Pipeline([
        ("Imputer", TypeFeatImputer(catCols, reducedCols)),
        ("Variance", VarianceThreshold(threshold=(.995 * (1 - .995)))),
        ("Scaler", StandardScaler())
    ])

params = {}      # grid-search space; later cells add entries per step
pipeline = []
pipe = Pipeline(list(basePipeline.steps))  # working copy of the base steps

In [131]:
# Optional univariate feature-selection step (project-local combined filter),
# inserted right after the imputer; its percentile is grid-searched.
fs_method = "combine_fs"
pipe.steps.insert(1,(fs_method, UnivCombineFilter(catCols,np.array(reducedCols))))
params.update({fs_method + '__percentile':[30,50,70,80]})

In [139]:
# Classifier option: logistic regression; C, class_weight and penalty are
# grid-searched.  NOTE(review): this and the following classifier cells are
# alternatives -- append exactly one before the post-processing cell, since
# each appends another final step to `pipe`.
cls_method = "logReg"
pipe.steps.append((cls_method, LogisticRegression(random_state=42)))
params.update({cls_method + '__C': [1e-5,0.001,0.01,0.1,1,5,10]})
params.update({cls_method + '__class_weight': [None, 'balanced']})
params.update({cls_method + '__penalty': ["l1","l2"]})

In [111]:
# Classifier option (alternative to the other classifier cells): random
# forest with balanced class weights; size and depth are grid-searched.
cls_method = "rf"
pipe.steps.append((cls_method, RandomForestClassifier(n_jobs=-1,random_state=42,class_weight="balanced")))
params.update({cls_method + '__n_estimators': [150,300,500,700], 
               cls_method + '__criterion': ['gini'],
               cls_method + '__max_depth' : [None,4,6,8,10]})

In [302]:
# Classifier option (alternative to the other classifier cells): k-nearest
# neighbours; neighbourhood size and vote weighting are grid-searched.
cls_method = "knn"
pipe.steps.append((cls_method, KNeighborsClassifier(n_jobs=-1)))

params.update({cls_method + '__n_neighbors': [3,5,7,9], 
               cls_method + '__weights': ['uniform', 'distance']})

In [419]:
# Classifier option (alternative to the other classifier cells): RBF-kernel
# SVM with probability estimates enabled (needed for the ROC/AUC cell);
# C, gamma and class_weight are grid-searched.
cls_method = "svm"
pipe.steps.append((cls_method, SVC(kernel = "rbf", random_state=42,probability=True)))
params.update({cls_method + '__C': [0.01,0.1,0.5,1,5,10,15,30,50], 
               cls_method + '__gamma' : [0.0001,0.001,0.01, 0.1,1,5],
               cls_method + '__class_weight': [None, 'balanced']})

In [47]:
# Classifier option (alternative to the other classifier cells): Gaussian
# naive Bayes -- no hyper-parameters are searched for it.
cls_method = "nb"
pipe.steps.append((cls_method, GaussianNB()))
#params.update({cls_method + '__alpha': [1e-3,0.001,0.01,0.1,0.5,1,5]})

In [140]:
#Post process pipeline
# Rebuild `pipe` as a fresh imblearn Pipeline so a sampler step can be
# inserted next without mutating `pipe` itself.  The original built a
# throwaway pipeline with make_pipeline, emptied its steps one element at a
# time (an O(n^2) remove-from-front loop), then re-appended pipe's named
# steps; constructing the Pipeline directly from those steps is equivalent.
pipe_imb = Pipeline(list(pipe.steps))
stps = len(pipe_imb.steps)  # kept: the sampling cell inserts at stps - 1

In [141]:
#Add sampling
# Insert SMOTE oversampling immediately before the final classifier step,
# so resampling is re-fit inside each CV fold by the imblearn pipeline.
# The number of SMOTE neighbours is grid-searched.
sm_method = "smote"                
pipe_imb.steps.insert(stps - 1, 
                      (sm_method, SMOTE(ratio='auto', kind='regular', random_state=32)))
params.update({sm_method + "__k_neighbors":[3,4,5]})

Pipeline setup


In [142]:
# Experiment configuration: scoring metric(s), inner-CV split fraction
# and number of shuffle-split iterations.
verbose = False
mtrs = ["f1_weighted"] #"f1","recall","precision"
cv_thr = 0.3    # fraction of the training set held out per CV iteration
cv_folds = 5    # number of StratifiedShuffleSplit iterations

print pipe.steps


[('Imputer', TypeFeatImputer(allNameCols=Index([u'gender', u'age', u'time_in_hospital', u'num_lab_procedures',
       u'num_procedures', u'num_medications', u'number_outpatient',
       u'number_emergency', u'number_inpatient', u'number_diagnoses',
       u'metformin', u'repaglinide', u'nateglinide', u'chlorpropamide',
       u...src_7', u'adm_src_8', u'adm_src_10',
       u'Poisoning', u'External_causes'],
      dtype='object'),
        dataCatCols=array([1, 1, ..., 1, 1]))), ('Variance', VarianceThreshold(threshold=0.004975)), ('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logReg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]

Run pipeline


In [143]:
# Report partition sizes, then grid-search the full imblearn pipeline once
# per scoring metric in `mtrs`, refitting the best model each time.
print "ALL TRAIN:", X_train.shape
print "TRAIN:", "[0's:", np.sum(y_train==0), "1's:", np.sum(y_train==1), "]"
print "ALL TEST:", X_test.shape
print "TEST:", "[0's:", np.sum(y_test==0), "1's:", np.sum(y_test==1), "]"
print "TEST:", "[0's:", np.sum(y_test==0)/float(y_test.shape[0]), "1's:", np.sum(y_test==1)/float(y_test.shape[0]), "]"

# Run experiment
# NOTE: `import time` (module) shadows the earlier `from time import time`,
# so time.time() is the correct call here.
start = time.time()

#Prepare pipe_cls      
pipeline_cls = pipe_imb
pipeline_params = params

if verbose:
    print "\n",pipeline_cls.steps


#Prepare cv
# Inner CV: stratified shuffle splits of the training set (old
# sklearn.cross_validation API: labels passed at construction).
cv_inner = StratifiedShuffleSplit(y_train, n_iter=cv_folds, test_size=cv_thr,random_state=24)

print "\nCV TRAIN:", cv_inner.n_train
print "CV_TEST:", cv_inner.n_test

#Fit pipeline with CV                        
# One GridSearchCV per metric; error_score=0 keeps the search alive when a
# parameter combination fails to fit (that combination just scores 0).
grid_pipelines = []

for m in mtrs:
    grid_pipeline = GridSearchCV(pipeline_cls, param_grid=pipeline_params, verbose=1, 
                                 n_jobs=-1, cv=cv_inner, scoring= m, error_score = 0,
                                 refit=True) 
    grid_pipeline.fit(X_train, y_train)
    grid_pipelines.append([m,grid_pipeline])

end = time.time()
print "Total time:", end - start


ALL TRAIN: (19463, 74)
TRAIN: [0's: 11921 1's: 7542 ]
ALL TEST: (8342, 74)
TEST: [0's: 5109 1's: 3233 ]
TEST: [0's: 0.612443059218 1's: 0.387556940782 ]

CV TRAIN: 13624
CV_TEST: 5839
Fitting 5 folds for each of 84 candidates, totalling 420 fits
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:  4.5min finished
Total time: 279.275966167

In [144]:
# Inspect each fitted grid search: print the best final estimator and,
# depending on which step sits at position 1 of the winning pipeline,
# either the univariate filter's selection or the variance filter's
# discarded columns.
for gp in grid_pipelines:
    print
    print gp[0]
    print "*****\n"
    print gp[1].best_estimator_.steps[-1]
    print
    
    if gp[1].best_estimator_.steps[1][0] == "combine_fs":
        varFilter = gp[1].best_estimator_.steps[1][1]

        print "Selected thr:", varFilter.percentile
        print "Selected columns:"
        # ixCols: indices of the retained features (attribute of the
        # project-local UnivCombineFilter) -- map back to column names.
        feats = reducedCols[varFilter.ixCols].tolist()
        print feats
        print "Num useful features:", len(feats), feats
    
    if gp[1].best_estimator_.steps[1][0] == "Variance":
        varFilter = gp[1].best_estimator_.steps[1][1]
        feats = reducedCols[varFilter.get_support() == False]
        print "Discarded feats:", len(feats), feats


f1_weighted
*****

('logReg', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))

Discarded feats: 20 Index([u'acetohexamide', u'tolbutamide', u'miglitol', u'troglitazone',
       u'tolazamide', u'examide', u'citoglipton', u'glipizide-metformin',
       u'glimepiride-pioglitazone', u'metformin-rosiglitazone',
       u'metformin-pioglitazone', u'Diabetis', u'Congenital', u'Perinatal',
       u'adm_4', u'adm_7', u'adm_src_3', u'adm_src_5', u'adm_src_8',
       u'adm_src_10'],
      dtype='object')

In [145]:
# Compute Train score (with best CV params): apply each refitted
# grid-search winner back on the full training set.  This is an optimistic
# estimate, shown to gauge over/under-fitting against the CV scores.
for gp in grid_pipelines:
    
    cls = gp[1]
    y_pred = cls.predict(X_train)                
    train_prec_scores = metrics.precision_score(y_train, y_pred, average='weighted', pos_label=None)
    train_rec_scores = metrics.recall_score(y_train, y_pred, average='weighted', pos_label=None)
    train_f1_scores = metrics.f1_score(y_train, y_pred, average='weighted', pos_label=None)

    print "\nTRAIN: "
    print "**********\n"

    print "Metric:",gp[0]
    print "TR Prec score:", train_prec_scores
    print "TR Rec score:", train_rec_scores
    print "TR F1 score:", train_f1_scores


TRAIN: 
**********

Metric: f1_weighted
TR Prec score: 0.631902812588
TR Rec score: 0.620767610338
TR F1 score: 0.624446683871

In [125]:
# Compute pipeline evaluation with CV: re-score the refit best estimator on
# the same inner CV splits and compare against GridSearchCV's best_score_.
for gp in grid_pipelines:
    
    cls = gp[1]
    print "\nCV: "
    print "******\n"
    
    print "Metric:", gp[0]
    print "CV selected params {}".format(cls.best_params_.values())
    cv_inner_f1 = cross_val_score(cls.best_estimator_, X_train, y_train, 
                                             cv=cv_inner, scoring='f1_weighted', n_jobs=-1)
    print "CV {} score: {}".format(gp[0], cls.best_score_)
    print "CV f1 score: %0.3f  (+/-%0.03f)" % (np.mean(cv_inner_f1), np.std(cv_inner_f1))


CV: 
******

Metric: f1_weighted
CV selected params ['balanced', 0.1, 'l1']
CV f1_weighted score: 0.619332387764
CV f1 score: 0.619  (+/-0.001)

In [108]:
# Tabulate the full grid-search results: one row per parameter combination
# with its mean score, std and per-fold scores (old GridSearchCV
# `grid_scores_` API: (params, mean_score, fold_scores) tuples).
# NOTE(review): this cell rebinds the module-level `params` dict used by
# the pipeline-setup cells; re-running those cells afterwards requires
# re-initializing `params`.
dd = []
for gp in grid_pipelines:
    
    cls = gp[1]   
    params =  np.array([str(d.values()) for d in np.array(cls.grid_scores_[:])[:,0]])
    mean = np.array(cls.grid_scores_)[:,1]
    values = np.array(cls.grid_scores_)[:,2]
    std = np.array([np.std(v) for v in values])
    
    dd = np.hstack((params.reshape(-1,1), mean.reshape(-1,1), std.reshape(-1,1), values.reshape(-1,1)))
    

    res = pd.DataFrame(dd,columns=["params","score_mean","score_std","scores"])
    print gp[0]
    print res


f1_weighted
                  params score_mean score_std  \
0       ['gini', 6, 100]       0.62      0.00   
1       ['gini', 6, 150]       0.62      0.00   
2       ['gini', 6, 300]       0.62      0.00   
3       ['gini', 6, 500]       0.62      0.00   
4       ['gini', 8, 100]       0.62      0.00   
5       ['gini', 8, 150]       0.62      0.00   
6       ['gini', 8, 300]       0.62      0.00   
7       ['gini', 8, 500]       0.62      0.00   
8      ['gini', 10, 100]       0.63      0.00   
9      ['gini', 10, 150]       0.63      0.00   
10     ['gini', 10, 300]       0.63      0.00   
11     ['gini', 10, 500]       0.63      0.00   
12     ['gini', 12, 100]       0.63      0.00   
13     ['gini', 12, 150]       0.63      0.00   
14     ['gini', 12, 300]       0.63      0.00   
15     ['gini', 12, 500]       0.63      0.00   
16   ['entropy', 6, 100]       0.62      0.00   
17   ['entropy', 6, 150]       0.62      0.01   
18   ['entropy', 6, 300]       0.62      0.00   
19   ['entropy', 6, 500]       0.62      0.00   
20   ['entropy', 8, 100]       0.62      0.00   
21   ['entropy', 8, 150]       0.62      0.00   
22   ['entropy', 8, 300]       0.62      0.00   
23   ['entropy', 8, 500]       0.62      0.00   
24  ['entropy', 10, 100]       0.63      0.00   
25  ['entropy', 10, 150]       0.63      0.00   
26  ['entropy', 10, 300]       0.63      0.00   
27  ['entropy', 10, 500]       0.62      0.00   
28  ['entropy', 12, 100]       0.63      0.01   
29  ['entropy', 12, 150]       0.63      0.00   
30  ['entropy', 12, 300]       0.63      0.00   
31  ['entropy', 12, 500]       0.63      0.00   

                                               scores  
0   [0.623972673343, 0.618297324615, 0.62511557122...  
1   [0.622435143864, 0.616422035211, 0.62358733624...  
2   [0.622250824515, 0.614819490672, 0.62502650513...  
3   [0.622616221555, 0.614640558473, 0.62336336672...  
4   [0.62803809614, 0.622052403578, 0.627159125471...  
5   [0.62565658651, 0.623014561766, 0.626968849742...  
6   [0.626117478365, 0.624417239522, 0.62643044871...  
7   [0.626053932455, 0.621116097403, 0.62706477250...  
8   [0.626627870628, 0.626979819568, 0.63237180153...  
9   [0.627978765846, 0.626001397648, 0.63239146506...  
10  [0.626524189268, 0.625859709989, 0.63158101851...  
11  [0.627102979385, 0.622887218451, 0.63089654293...  
12  [0.624905298436, 0.625224065357, 0.63247143925...  
13  [0.629705083018, 0.627375712613, 0.63095510602...  
14  [0.632635886588, 0.625351735038, 0.63054424893...  
15  [0.632884100557, 0.62709127276, 0.631250148937...  
16  [0.622596005494, 0.615874184757, 0.62716149574...  
17  [0.623594361983, 0.614435384974, 0.62970133448...  
18  [0.623789580308, 0.616302478544, 0.62845446854...  
19  [0.623456202853, 0.616017362411, 0.62560803679...  
20  [0.624525109172, 0.61898347693, 0.62561528024,...  
21  [0.62717302492, 0.621898539555, 0.623959536111...  
22  [0.628017133103, 0.620216536904, 0.62441242213...  
23  [0.62667746732, 0.61889614222, 0.626843490646,...  
24  [0.626202649752, 0.621247829191, 0.63118065640...  
25  [0.627373201226, 0.623841415259, 0.63111089642...  
26  [0.626796024166, 0.622957553882, 0.62864511450...  
27  [0.628046036932, 0.622479755224, 0.62658846925...  
28  [0.624942195477, 0.618709623022, 0.63324829271...  
29  [0.626948217226, 0.623151177592, 0.63577496074...  
30  [0.630792465718, 0.623142641741, 0.63240617227...  
31  [0.630212067355, 0.624639006813, 0.63319050698...  
recall
                  params score_mean score_std  \
0       ['gini', 6, 100]       0.56      0.01   
1       ['gini', 6, 150]       0.56      0.00   
2       ['gini', 6, 300]       0.56      0.00   
3       ['gini', 6, 500]       0.56      0.00   
4       ['gini', 8, 100]       0.56      0.01   
5       ['gini', 8, 150]       0.56      0.01   
6       ['gini', 8, 300]       0.56      0.01   
7       ['gini', 8, 500]       0.55      0.01   
8      ['gini', 10, 100]       0.54      0.00   
9      ['gini', 10, 150]       0.54      0.00   
10     ['gini', 10, 300]       0.54      0.01   
11     ['gini', 10, 500]       0.54      0.01   
12     ['gini', 12, 100]       0.52      0.00   
13     ['gini', 12, 150]       0.52      0.01   
14     ['gini', 12, 300]       0.52      0.01   
15     ['gini', 12, 500]       0.52      0.01   
16   ['entropy', 6, 100]       0.56      0.01   
17   ['entropy', 6, 150]       0.56      0.01   
18   ['entropy', 6, 300]       0.56      0.00   
19   ['entropy', 6, 500]       0.56      0.00   
20   ['entropy', 8, 100]       0.56      0.01   
21   ['entropy', 8, 150]       0.56      0.01   
22   ['entropy', 8, 300]       0.56      0.01   
23   ['entropy', 8, 500]       0.56      0.00   
24  ['entropy', 10, 100]       0.55      0.01   
25  ['entropy', 10, 150]       0.55      0.01   
26  ['entropy', 10, 300]       0.55      0.01   
27  ['entropy', 10, 500]       0.54      0.01   
28  ['entropy', 12, 100]       0.53      0.01   
29  ['entropy', 12, 150]       0.53      0.01   
30  ['entropy', 12, 300]       0.53      0.01   
31  ['entropy', 12, 500]       0.52      0.00   

                                               scores  
0   [0.568272205038, 0.567388422448, 0.55103844454...  
1   [0.566062748564, 0.56473707468, 0.552364118427...  
2   [0.561643835616, 0.556783031374, 0.54927087936...  
3   [0.558992487848, 0.55634114008, 0.551038444543...  
4   [0.562085726911, 0.560760053027, 0.54750331418...  
5   [0.562527618206, 0.560760053027, 0.54396818382...  
6   [0.562085726911, 0.559876270437, 0.54573574900...  
7   [0.557224922669, 0.554131683606, 0.54573574900...  
8   [0.543526292532, 0.550154661953, 0.53645603181...  
9   [0.542200618648, 0.546619531595, 0.53424657534...  
10  [0.547503314185, 0.543968183827, 0.52894387980...  
11  [0.543526292532, 0.541758727353, 0.52806009721...  
12  [0.515687140963, 0.522757401679, 0.51480335837...  
13  [0.519664162616, 0.524524966858, 0.50905877154...  
14  [0.519222271321, 0.522315510384, 0.50375607600...  
15  [0.518338488732, 0.524966858153, 0.50684931506...  
16  [0.566062748564, 0.567388422448, 0.55059655324...  
17  [0.570923552806, 0.563411400795, 0.55678303137...  
18  [0.56385329209, 0.561643835616, 0.554131683606...  
19  [0.55634114008, 0.558550596553, 0.554573574901...  
20  [0.566062748564, 0.560318161732, 0.55148033583...  
21  [0.571807335395, 0.56473707468, 0.54706142289,...  
22  [0.565178965974, 0.55545735749, 0.549712770658...  
23  [0.560318161732, 0.554573574901, 0.55059655324...  
24  [0.553689792311, 0.550154661953, 0.53247901016...  
25  [0.558108705259, 0.551038444543, 0.52894387980...  
26  [0.552806009722, 0.549270879364, 0.52717631462...  
27  [0.549270879364, 0.549270879364, 0.5293857711,...  
28  [0.533362792753, 0.521431727795, 0.51480335837...  
29  [0.527618205921, 0.527176314627, 0.51745470614...  
30  [0.533804684048, 0.524524966858, 0.51391957578...  
31  [0.527176314627, 0.525850640742, 0.51612903225...  

In [109]:
#Compute test score
# Evaluate each refitted grid-search winner on the held-out test set:
# weighted F1, per-class report, confusion matrix, the derived accuracy /
# sensitivity / specificity, and ROC AUC.
for gp in grid_pipelines:
    
    cls = gp[1]
    y_pred =cls.predict(X_test)
    test_f1 = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)

    print "\n",gp[0],":"
    print "**********\n"
    print "Test f1: %0.3f" % (test_f1)
    print "with following performance in test:"
    print metrics.classification_report(y_test, y_pred)
    cm = metrics.confusion_matrix(y_test, y_pred)
    print "\nConfusion matrix:"
    print cm

    # cm rows = true class, columns = predicted class (class 0 first).
    print "\nAccuracy:", (cm[0,0] + cm[1,1])/ float(cm[0,0] + cm[1,1]+cm[0,1] + cm[1,0])
    print "Sensitivity:", cm[1,1] / float(cm[1,1] + cm[1,0]) #Reduce FN (recall)
    print "Specificity:", cm[0,0] / float(cm[0,0] + cm[0,1]) #Reduce FP

    # Probability of the positive class (column 1) drives the ROC curve.
    y_probs = cls.best_estimator_.predict_proba(X_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_probs[:,1], pos_label=1)
    print "AUC:", metrics.auc(fpr, tpr)


f1_weighted :
**********

Test f1: 0.628
with following performance in test:
             precision    recall  f1-score   support

          0       0.70      0.68      0.69      5109
          1       0.52      0.54      0.53      3233

avg / total       0.63      0.63      0.63      8342


Confusion matrix:
[[3483 1626]
 [1492 1741]]

Accuracy: 0.626228722129
Sensitivity: 0.538509124652
Specificity: 0.681738109219
AUC: 0.661631975062

recall :
**********

Test f1: 0.619
with following performance in test:
             precision    recall  f1-score   support

          0       0.70      0.64      0.67      5109
          1       0.50      0.58      0.54      3233

avg / total       0.63      0.62      0.62      8342


Confusion matrix:
[[3269 1840]
 [1370 1863]]

Accuracy: 0.615200191801
Sensitivity: 0.576244973709
Specificity: 0.639851242905
AUC: 0.653975320688

Learning curve


In [149]:
cls = grid_pipelines[1][1]
print cls


GridSearchCV(cv=StratifiedShuffleSplit(labels=[0 1 ..., 0 0], n_iter=10, test_size=0.2, random_state=24),
       error_score=0,
       estimator=Pipeline(steps=[('Imputer', TypeFeatImputer(allNameCols=Index([u'gender', u'age', u'time_in_hospital', u'num_lab_procedures',
       u'num_procedures', u'num_medications', u'number_outpatient',
       u'number_emergency', u'number_inpatient', u'number_diagnoses',
       u'metformin', u'repaglinide', u'glimep...alty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'logReg__class_weight': [None, 'balanced'], 'logReg__C': [0.001, 1, 10], 'logReg__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='recall', verbose=1)

In [145]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot train / cross-validation weighted-F1 versus training-set size.

    Parameters
    ----------
    estimator : sklearn-compatible estimator or pipeline (unfitted clone
        is fitted internally by learning_curve).
    title : str, figure title.
    X, y : feature matrix and labels.
    ylim : optional (ymin, ymax) for the score axis.
    cv : CV splitter forwarded to learning_curve.
    n_jobs : parallelism forwarded to learning_curve.
    train_sizes : fractions (or absolute counts) of the training set.

    Returns
    -------
    The matplotlib.pyplot module, so the caller can plt.show().
    """
    plt.figure(figsize=(8,6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("F1-score")
    train_sizes_lc, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring="f1_weighted")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid(True)

    # BUG FIX: the original computed the realized sample counts returned by
    # learning_curve (train_sizes_lc) but plotted against the *requested*
    # fractions; plot the realized sizes so the x axis is correct for both
    # fractional and absolute `train_sizes` inputs.
    plt.fill_between(train_sizes_lc, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="r")
    plt.fill_between(train_sizes_lc, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.plot(train_sizes_lc, train_scores_mean, 'o-', color="b",
             label="Training score")
    plt.plot(train_sizes_lc, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.axhline(0.5,color='r',ls='--', label="random")  # chance-level line

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    return plt

In [148]:
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
title = ""
plot_learning_curve(cls.best_estimator_, title, X_train, y_train, ylim=(0.4, 1.01), 
                    cv=cv_inner,
                    train_sizes=[0.05,0.10,0.15,0.25,0.50,0.75,0.80,1.0], 
                    n_jobs=-1)

plt.show()


/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/aegle/miniconda2/envs/readmision/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [ ]: