In [1]:
%reset -f

In [2]:
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import os
import itertools
from sklearn import metrics
from sklearn import cross_validation
from scipy import interp
import imblearn.metrics as metrics2

import sys
sys.path.insert(1, "/home/xrafael/Projects/myOsa/src/venv/notebook/puntual-monitoring-improved-2017-final/src/")
from TypeFeatImputer import TypeFeatImputer
from i_score_parallel import i_score
from TypeFeatFS import DiscreteFS, ContinuousFS
from TypeFeatFilter import DiscreteFilter, ContinuousFilter
from OutlierFiltering import OutlierFiltering
from typeFeat_score import typeFeat_score

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt


np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')

%matplotlib inline


/home/xrafael/Programs/anaconda2/envs/myosa/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

1. Load data


In [3]:
folder = "more_than_four" #equal_more_four

In [4]:
dfAll =[]
datasets = [0,1,2]
for ds in datasets:
    pathDir = os.path.join('resources','results',folder,'ds_' + str(ds)) 
    path, dirs, files = os.walk(pathDir).next()
    file_count = len(files)

    for num_exp in range(file_count):
        ddf = pd.read_pickle(os.path.join('resources','results', folder,'ds_' + str(ds),'results_pipe_' 
                                         + str(ds) + '_' + str(num_exp) + '.pkl'))
    ddf["ds"] = ds
    print ddf.shape
    dfAll.append(ddf)


(76, 23)
(76, 23)
(76, 23)

In [5]:
dfs = pd.DataFrame(np.vstack(dfAll), columns=ddf.columns)

dfs.cv_f1_mean = pd.to_numeric(dfs.cv_f1_mean)
dfs.cv_rec_mean = pd.to_numeric(dfs.cv_rec_mean)
dfs.cv_prec_mean = pd.to_numeric(dfs.cv_prec_mean)

dfs.cv_f1_std = pd.to_numeric(dfs.cv_f1_std)
dfs.cv_prec_std = pd.to_numeric(dfs.cv_prec_std)
dfs.cv_rec_std = pd.to_numeric(dfs.cv_rec_std)
dfs.exp = pd.to_numeric(dfs.exp)

dfs["cv_f1"] = dfs["cv_f1_mean"].round(2).astype(str).str.cat(dfs["cv_f1_std"].round(2).astype(str), sep="+/-")
dfs["cv_prec"] = dfs["cv_prec_mean"].round(2).astype(str).str.cat(dfs["cv_prec_std"].round(2).astype(str), sep="+/-")
dfs["cv_rec"] = dfs["cv_rec_mean"].round(2).astype(str).str.cat(dfs["cv_rec_std"].round(2).astype(str), sep="+/-")
dfs["num_nones"] = np.sum(np.hstack(((dfs.sm == "none").reshape(-1,1),(dfs.fs == "none").reshape(-1,1))), axis=1)


/home/xrafael/Programs/anaconda2/envs/myosa/lib/python2.7/site-packages/ipykernel/__main__.py:15: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead

2. Best pipelines per timepoint


In [25]:
ds = 1

In [26]:
class_names = ["Non-compliant","Compliant"]

#Make a copy
df = dfs[dfs.ds == ds].sort_values("cv_f1_mean", ascending=False).copy()

df.cv_f1_mean = pd.to_numeric(df.cv_f1_mean)
df.cv_rec_mean = pd.to_numeric(df.cv_rec_mean)
df.cv_prec_mean = pd.to_numeric(df.cv_prec_mean)
cls = df.iloc[0].pipeline
cls_name = df.iloc[0].cls

print cls_name
print cls,"\n"
print cls.steps[-1][1],"\n"
print df.columns
display(df.iloc[:1][["ds","fs","sm","metric","cls","params","cv_f1","cv_f1_mean"]])


svmRBF
Pipeline(steps=[('Imputer', TypeFeatImputer(allNameCols=array(['GENERO', 'ACTIVO', 'JUBILADO', 'EDAD', 'FUMA', 'FUMA_PAQ_ANY',
       'FUMA_EX_ANYS', 'ALCOHOL', 'ALCOHOL_GR_DIA', 'CAFES_DIA',
       'DEPRESION', 'ANSIEDAD', 'HTA', 'CARDIOPATIA', 'ENF_RESP',
       'DIABETES', 'OBESIDAD', 'DISLIPEMIA', 'OBSTR...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))]) 

SVC(C=15, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False) 

Index([u'exp', u'out', u'fs', u'sm', u'cls', u'metric', u'params', u'tr_f1',
       u'tr_prec', u'tr_rec', u'cv_f1_mean', u'cv_f1_std', u'cv_prec_mean',
       u'cv_prec_std', u'cv_rec_mean', u'cv_rec_std', u'test_f1', u'test_prec',
       u'test_rec', u'test_auc', u'time', u'pipeline', u'ds', u'cv_f1',
       u'cv_prec', u'cv_rec', u'num_nones'],
      dtype='object')
ds fs sm metric cls params cv_f1 cv_f1_mean
128 1 none after f1_weighted svmRBF [0.001, None, 4, 15] 0.82+/-0.06 0.82

In [27]:
datafilenames = []
datafilenames.append(os.path.join('resources','data_partition_cl0_short_no_monit_14012016.pkl'))
datafilenames.append(os.path.join('resources','data_partition_cl0_short_1m_monit_14012016.pkl'))
datafilenames.append(os.path.join('resources','data_partition_cl0_short_3m_1m_monit_14012016.pkl'))


#Load Train & test data
f = file(datafilenames[ds],"rb")
X_train = np.load(f).astype(float)
y_train = np.load(f).astype(int)
X_test = np.load(f).astype(float)
y_test = np.load(f).astype(int)
cols = np.load(f)
tr_codes = np.load(f)
ts_codes = np.load(f)
feat_types = np.load(f)
feat_types = dict(feat_types.tolist())
f.close()

#Join data
X_all = np.vstack((X_train,X_test))
y_all = np.hstack((y_train, y_test)).reshape(-1,1)
all_codes = np.hstack((tr_codes,ts_codes))
print X_all.shape, y_all.shape, all_codes.shape
print X_train.shape, X_test.shape
print np.sum(y_test == 0), np.sum(y_test == 1)
print cols

#Load cv_outer datasets
cv_outer = cross_validation.StratifiedShuffleSplit(y_train, n_iter=10, test_size=0.30, random_state=42)


(42, 63) (42, 1) (42,)
(29, 63) (13, 63)
6 7
['GENDER' 'ACTIVE' 'RETIRED' 'AGE' 'SMOKER' 'SMOKE_PACK_YEAR'
 'SMOKE_EX_YEAR' 'ALCOHOL' 'ALCOHOL_GR_DAY' 'CAFES_DAY' 'DEPRESSION'
 'ANXIETY' 'HTA' 'CARDIOPATHY' 'RESP_DIS' 'DIABETES' 'OBESITY' 'DISLIPEMIA'
 'OBSTR_NASAL_CHRON' 'SNORES' 'REP_SLEEP' 'CRISIS_ASP' 'NOCTURIA'
 'APNEAS_PRES' 'CEFALEA' 'DAY_SOMN' 'AWAKE_NIGHT' 'CONC_DIS'
 'IRR_APAT_DEPRE' 'INSOMNIA' 'ACT_MOTR_NOCT' 'FEEL_REAL_DRE'
 'DREAM_INT_PIROSIS' 'DEC_SEX_DIS' 'IECAS' 'DIURETICS' 'ANTIAGREG'
 'ANTIACID' 'HIPOLIPEM' 'BDZ' 'SLEEP_TIME' 'IAH' 'TIME_SAT<90%' 'IND_DESAT'
 'SIZE' 'BMI' 'CIRC_NECK' 'TAS_MEAN' 'TAD_MEAN' 'SAT_O2' 'PRESS_CPAP'
 'EPWORTH' 'EUROQOL' 'VISUAL' 'TOTAL_USE_HOURS_1' 'MEAN_USE_HOURS_1'
 'AL_IRR_CUT_MASC_1' 'DRY_MOUTH_1' 'TAS_MEAN_1' 'TAD_MEAN_1' 'EPWORTH_1'
 'EUROQOL_1' 'VISUAL_EUROQOL_1' 'LABEL']

In [28]:
datafilenames = []
datafilenames.append(os.path.join('resources','mapping_filtered_cl0_short_no_monit_14012016_final.pkl'))
datafilenames.append(os.path.join('resources','mapping_filtered_cl0_short_1m_monit_14012016_final.pkl'))
datafilenames.append(os.path.join('resources','mapping_filtered_cl0_short_3m_1m_monit_14012016_final.pkl'))

dfFeatures = pd.read_pickle(datafilenames[ds])
print dfFeatures.shape


(93, 6)
Out[28]:
Name Description Values Type Cat Month
RONCA SNORES Snores 'never':0, 'sometimes':1, 'frequently':2, 'alw... CAT SYMPTOMS M0
TRAST_MEM MEM_DIS Memmory disorder 'never':0, 'sometimes':1, 'frequently':2, 'alw... CAT SYMPTOMS M0
EUROQOL EUROQOL Questionnaire result for measuring generic hea... [0-1] NUM BASAL M0
ALCOHOL ALCOHOL Alcohol intake 'no':0, 'yes':1 CAT CLINICAL_HISTORY M0
ANTICOAG ANTICOAG Anticoagulant intake 'no':0, 'yes':1 CAT THERAPY M0

3. CV Performances


In [29]:
cv_f1 = cross_validation.cross_val_score(cls, X_train, y_train, 
                                         cv=cv_outer, scoring='f1_weighted', n_jobs=-1)
cv_prec = cross_validation.cross_val_score(cls, X_train, y_train, 
                                         cv=cv_outer, scoring='precision_weighted', n_jobs=-1)
cv_rec = cross_validation.cross_val_score(cls, X_train, y_train, 
                                         cv=cv_outer, scoring='recall_weighted', n_jobs=-1)
cv_auc = cross_validation.cross_val_score(cls, X_train, y_train, 
                                         cv=cv_outer, scoring='roc_auc', n_jobs=-1)

print "\nCV OUTER f1 score: %0.3f  (+/-%0.03f)" % (np.mean(cv_f1), np.std(cv_f1))
print "CV OUTER prec score: %0.3f  (+/-%0.03f)" % (np.mean(cv_prec), np.std(cv_prec))
print "CV OUTER rec score: %0.3f  (+/-%0.03f)" % (np.mean(cv_rec), np.std(cv_rec))
print "CV OUTER auc score: %0.3f  (+/-%0.03f)" % (np.mean(cv_auc), np.std(cv_auc))


CV OUTER f1 score: 0.839  (+/-0.077)
CV OUTER prec score: 0.872  (+/-0.065)
CV OUTER rec score: 0.844  (+/-0.074)
CV OUTER auc score: 0.845  (+/-0.127)

4. Test performances


In [30]:
y_pred = cls.predict(X_test)
y_scores = cls.predict_proba(X_test)
print y_scores.shape
print y_test.shape

test_f1 = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)
test_prec = metrics.recall_score(y_test, y_pred, average='weighted', pos_label=1)
test_rec = metrics.precision_score(y_test, y_pred, average='weighted', pos_label=1)
test_auc_w = metrics.roc_auc_score(y_test, y_pred, average='weighted')
test_auc_mi = metrics.roc_auc_score(y_test, y_pred, average="micro")
test_auc_ma = metrics.roc_auc_score(y_test, y_pred, average="macro")

sens = metrics2.sensitivity_score(y_test, y_pred,average='weighted')
spec = metrics2.specificity_score(y_test, y_pred, average='weighted')

print "\nTest performances:\n"
print "F1-weighted: %0.3f" % (test_f1)
print "Precision-weighted: %0.3f" % (test_prec)
print "Recall-weighted: %0.3f" % (test_rec)
print "AUC-weighted: %0.3f" % (test_auc_w)
print "AUC-micro: %0.3f" % (test_auc_mi)
print "AUC-macro: %0.3f" % (test_auc_ma)
print "Sensitivity:", sens
print "Specificity:", spec


print metrics.classification_report(y_test, y_pred)


(13, 2)
(13,)

Test performances:

F1-weighted: 0.761
Precision-weighted: 0.769
Recall-weighted: 0.846
AUC-weighted: 0.786
AUC-micro: 0.786
AUC-macro: 0.786
Sensitivity: 0.769230769231
Specificity: 0.802197802198
             precision    recall  f1-score   support

          0       0.67      1.00      0.80         6
          1       1.00      0.57      0.73         7

avg / total       0.85      0.77      0.76        13

5. Confusion matrix


In [418]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges, cbar=False):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    plt.grid(False)
    im = plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(im, orientation='vertical',shrink=0.3)
    
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [419]:
print ds

# Compute confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
fig = plt.figure(figsize=(10,6))
ax = plt.subplot(1,2,1)
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix\n without normalization')

# Plot normalized confusion matrix
ax = plt.subplot(1,2,2)
im = plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix', cbar=True)


plt.tight_layout()
plt.show()


0
Confusion matrix, without normalization
[[6 0]
 [3 4]]
Normalized confusion matrix
[[ 1.    0.  ]
 [ 0.43  0.57]]

4. classification Probabilities


In [420]:
# Plot the two-class decision scores
plot_colors = "br"
plot_step = 0.02

twoclass_output = cls.predict_proba(X_test)[:,1]
plot_range = (twoclass_output.min(), twoclass_output.max())
print cls
print twoclass_output


plt.figure(figsize=(10, 5))
for i, n, c in zip(range(2), class_names, plot_colors):
    plt.hist(twoclass_output[y_test == i],
             bins=20,
             range=(0,1.0),
             facecolor=c,
             label='Class %s' % n,
             alpha=.5,
             edgecolor='k')
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Score')
plt.title('Predicted probabilities Scores')
plt.axvline(0.5,label="Default threshold",c="r",ls="--")
plt.xticks(np.arange(0,1.05,0.05), np.arange(0,1.05,0.05))
plt.xlim(0,1)
plt.tight_layout()


Pipeline(steps=[('Imputer', TypeFeatImputer(allNameCols=array(['GENERO', 'ACTIVO', 'JUBILADO', 'EDAD', 'FUMA', 'FUMA_PAQ_ANY',
       'FUMA_EX_ANYS', 'ALCOHOL', 'ALCOHOL_GR_DIA', 'CAFES_DIA',
       'DEPRESION', 'ANSIEDAD', 'HTA', 'CARDIOPATIA', 'ENF_RESP',
       'DIABETES', 'OBESIDAD', 'DISLIPEMIA', 'OBSTR...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))])
[ 0.39  0.6   0.68  0.61  0.49  0.38  0.42  0.39  0.46  0.45  0.76  0.42
  0.46]

6. Precision-recall curve


In [185]:
if cls_name == "rf":    
    y_prob = cls.predict_proba(X_test)
    average_precision = metrics.average_precision_score(y_test, y_prob[:,1], average="weighted")
    precision, recall, _ = metrics.precision_recall_curve(y_test, y_prob[:,1])
else: 
    y_score = cls.decision_function(X_test)
    average_precision = metrics.average_precision_score(y_test, y_score, average="weighted")
    precision, recall, _ = metrics.precision_recall_curve(y_test, y_score)
    

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
                 color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Average precision-recall score: {0:0.2f}'.format(average_precision))
plt.show()


7. Classification boundaries


In [394]:
#pca
from sklearn.decomposition import PCA

print X_test.shape

X_test_trans = X_test.copy()
for s in range(len(cls.steps) - 1):
    if cls.steps[s][0] != "sm_smote":
        X_test_trans = cls.steps[s][1].transform(X_test_trans)  
        print "Test:", X_test_trans.shape

X_train_trans = X_train.copy()
for s in range(len(cls.steps) - 1):
    if cls.steps[s][0] != "sm_smote":
        X_train_trans = cls.steps[s][1].transform(X_train_trans)  
        print "Train:", X_train_trans.shape

pca = PCA(n_components=2)
X_r = pca.fit(X_train_trans).transform(X_test_trans)

print "DATASET: {}M".format(ds)
print('explained variance ratio (first components): %s'
      % str(pca.explained_variance_ratio_))
print "explained variance ratio of 2 pcas:", np.sum(pca.explained_variance_ratio_[:2])


(13, 70)
Test: (13, 70)
Test: (13, 70)
Test: (13, 70)
Test: (13, 11)
Train: (29, 70)
Train: (29, 70)
Train: (29, 70)
Train: (29, 11)
DATASET: 2M
explained variance ratio (first components): [ 0.22  0.17]
explained variance ratio of 2 pcas: 0.394013773249

In [395]:
#Make meshgrid

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .1     # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = X_r[:, 0].min() - 2, X_r[:, 0].max() + 2
y_min, y_max = X_r[:, 1].min() - 2, X_r[:, 1].max() + 2
print x_min, x_max
print y_min, y_max

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
print xx.shape, np.min(xx), np.max(xx)
print yy.shape, np.min(yy), np.max(yy)

X_extra = np.c_[xx.ravel(), yy.ravel()]
x_min_extra, x_max_extra = X_extra[:, 0].min(), X_extra[:, 0].max()
y_min_extra, y_max_extra = X_extra[:, 1].min(), X_extra[:, 1].max()
print x_min_extra, x_max_extra
print y_min_extra, y_max_extra
print X_extra.shape

X_extra_trans = pca.inverse_transform(X_extra)
print X_extra_trans.shape


-3.38236903614 3.54869408304
-3.34922350662 3.93110822265
(73, 70) -3.38236903614 3.51763096386
(73, 70) -3.34922350662 3.85077649338
-3.38236903614 3.51763096386
-3.34922350662 3.85077649338
(5110, 2)
(5110, 11)

In [396]:
# Obtain labels for each point in mesh. Use last trained model
Z = cls.steps[-1][1].predict(X_extra_trans)
print Z.shape
print np.sum(Z==0), np.sum(Z==1)

y_pred_ts = cls.predict(X_test)
print np.sum(y_pred_ts==0), np.sum(y_pred_ts==1)


y_pred_cs = []
for d in range(len(y_pred_ts)):
    if y_pred_ts[d] == y_test[d]:
        if y_pred_ts[d] == 0:
            y_pred_cs.append("blue")
        else:
            y_pred_cs.append("g")
    else:
        y_pred_cs.append("r")

y_pred_cs = np.array(y_pred_cs)
print y_pred_cs
print "Pred errors:", np.sum(y_pred_cs=="r")


(5110,)
2814 2296
4 9
['blue' 'g' 'g' 'g' 'r' 'blue' 'blue' 'blue' 'g' 'r' 'g' 'g' 'g']
Pred errors: 2

In [397]:
plt.figure(figsize=(15,5))
ax = plt.subplot(1,3,1)

#Predicted
#ax.scatter(X_r[:, 0], X_r[:, 1], c=y_pred_cs)
ax.scatter(X_r[y_pred_cs=="g", 0], X_r[y_pred_cs=="g", 1], c=["g" for y in y_pred_ts[y_pred_cs=="g"]],label="pred. compliant", alpha=0.5, s=48)
ax.scatter(X_r[y_pred_cs=="blue", 0], X_r[y_pred_cs=="blue", 1], c=["blue" for y in y_pred_ts[y_pred_cs=="blue"]],label="pred. non-compliant", alpha=0.5, s=48)
ax.scatter(X_r[y_pred_cs=="r", 0], X_r[y_pred_cs=="r", 1], c=["r" for y in y_pred_ts[y_pred_cs=="r"]],label="pred. error", s=48)

ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_title("Combine Predicted vs Truth")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),fancybox=True, shadow=True, ncol=1)

ax = plt.subplot(1,3,2)
ax.scatter(X_r[y_pred_ts==1, 0], X_r[y_pred_ts==1, 1], c=["g" for y in y_pred_ts[y_pred_ts==1]],label="pred. compliant", alpha=0.5, s=48)
ax.scatter(X_r[y_pred_ts==0, 0], X_r[y_pred_ts==0, 1], c=["blue" for y in y_pred_ts[y_pred_ts==0]],label="pred. non-compliant", alpha=0.5, s=48)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_title("Predicted")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),fancybox=True, shadow=True, ncol=1)

ax = plt.subplot(1,3,3)
#ax.scatter(X_r[:, 0], X_r[:, 1], c=["g" if y else "blue" for y in y_test], alpha=0.3)
ax.scatter(X_r[y_test==1, 0], X_r[y_test==1, 1], c=["g" for y in y_pred_ts[y_test==1]],label="real compliant", alpha=0.5, s=48)
ax.scatter(X_r[y_test==0, 0], X_r[y_test==0, 1], c=["blue" for y in y_pred_ts[y_test==0]],label="real non-compliant", alpha=0.5, s=48)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_title("Truth")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),fancybox=True, shadow=True, ncol=1)

plt.show()


8. Interpretability

8.1 Select best interpretable classifier


In [633]:
dfDesc = df[np.logical_not(df["cls"].isin(["svmRBF","nn"]))]\
        .sort_values("cv_f1_mean", ascending=False)
display(dfDesc[["fs","sm","metric","cls","params",
          "tr_f1","cv_f1", "cv_prec", "cv_rec",
          "test_f1",'test_prec', 'test_rec']].iloc[:1,:])

cls_int = dfDesc.iloc[0].pipeline
print cls_int.get_params()


fs sm metric cls params tr_f1 cv_f1 cv_prec cv_rec test_f1 test_prec test_rec
182 lasso_fs after precision_weighted rf [1, 250, gini, 4, None, None] 1.00 0.87+/-0.15 0.89+/-0.14 0.88+/-0.14 0.84 0.85 0.88
{'lasso_fs__estimator__C': 1, 'lasso_fs': SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        prefit=False, threshold=None), 'Scaler__copy': True, 'lasso_fs__prefit': False, 'lasso_fs__estimator__warm_start': False, 'lasso_fs__estimator__intercept_scaling': 1, 'sm_smote__svm_estimator': None, 'sm_smote__k_neighbors': 4, 'Imputer': TypeFeatImputer(allNameCols=array(['GENERO', 'ACTIVO', ..., 'EUROQOL_3', 'VISUAL EUROQOL_3'], dtype=object),
        dataCatCols=array([1, 1, ..., 0, 0])), 'rf__n_jobs': -1, 'rf__warm_start': False, 'sm_smote__m_neighbors': 10, 'sm_smote__n_jobs': 1, 'lasso_fs__threshold': None, 'sm_smote__ratio': 'auto', 'sm_smote__random_state': 32, 'lasso_fs__estimator': LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'lasso_fs__estimator__random_state': 42, 'sm_smote__out_step': 0.5, 'sm_smote__kind': 'regular', 'lasso_fs__estimator__tol': 0.0001, 'lasso_fs__estimator__solver': 'liblinear', 'rf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=250, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False), 'Imputer__dataCatCols': array([1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0]), 'rf__oob_score': False, 'sm_smote': SMOTE(k=None, k_neighbors=4, kind='regular', m=None, m_neighbors=10, n_jobs=1,
   out_step=0.5, random_state=32, ratio='auto', svm_estimator=None), 'lasso_fs__estimator__max_iter': 100, 'rf__min_samples_leaf': 1, 'rf__bootstrap': True, 'lasso_fs__estimator__class_weight': None, 'rf__max_features': 'auto', 'Imputer__allNameCols': array(['GENERO', 'ACTIVO', 'JUBILADO', 'EDAD', 'FUMA', 'FUMA_PAQ_ANY',
       'FUMA_EX_ANYS', 'ALCOHOL', 'ALCOHOL_GR_DIA', 'CAFES_DIA',
       'DEPRESION', 'ANSIEDAD', 'HTA', 'CARDIOPATIA', 'ENF_RESP',
       'DIABETES', 'OBESIDAD', 'DISLIPEMIA', 'OBSTR_NASAL_CRON', 'RONCA',
       'SUE_REP', 'CRISIS_ASF', 'NICTURIA', 'APNEAS_PRES', 'CEFALEA',
       'SOMN_DIURN', 'DESPERTAR_NOCT', 'TRAST_CONC', 'IRR_APAT_DEPR',
       'INSOMNIO', 'ACT_MOTR_NOCT', 'SEN_SUE_REAL_DORM', 'SUE_INT_PIROSIS',
       'DISM_DESEO_SEX', 'IECAS', 'DIURETICOS', 'ANTIAGREG', 'ANTIACID',
       'HIPOLIPEM', 'BDZ', 'TIEMP_SUENO', 'IAH', 'TIEMPOSAT<90%',
       'IND_DESAT', 'TALLA', 'IMC', 'CIRC_CUELLO', 'MEDIA_TAS',
       'MEDIA_TAD', 'Sat O2', 'PRESS_CPAP', 'EPWORTH', 'EUROQOL', 'VISUAL',
       'HORAS_USO_TOT_1', 'HORAS_USO_MED_NOCHE_1', 'AL_IRR_CUT_MASC_1',
       'BOCA_SECA_1', 'MEDIA_TAS_1', 'MEDIA_TAD_1', 'EPWORTH_1',
       'EUROQOL_1', 'VISUAL_EUROQOL_1', 'HORAS_USO_MED_NOCHE_3',
       'BOCA_SECA_3', 'MEDIA TAS_3', 'MEDIA TAD_3', 'EPWORTH_3',
       'EUROQOL_3', 'VISUAL EUROQOL_3'], dtype=object), 'rf__min_weight_fraction_leaf': 0.0, 'lasso_fs__estimator__penalty': 'l1', 'lasso_fs__estimator__multi_class': 'ovr', 'Variance': VarianceThreshold(threshold=0.0), 'Scaler__with_mean': True, 'rf__max_depth': None, 'rf__n_estimators': 250, 'Scaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'rf__verbose': 0, 'rf__criterion': 'gini', 'rf__max_leaf_nodes': None, 'Scaler__with_std': True, 'lasso_fs__estimator__dual': False, 'lasso_fs__estimator__fit_intercept': True, 'Variance__threshold': 0.0, 'lasso_fs__estimator__verbose': 0, 'rf__min_samples_split': 2, 'sm_smote__k': None, 'steps': [('Imputer', TypeFeatImputer(allNameCols=array(['GENERO', 'ACTIVO', ..., 'EUROQOL_3', 'VISUAL EUROQOL_3'], dtype=object),
        dataCatCols=array([1, 1, ..., 0, 0]))), ('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('Variance', VarianceThreshold(threshold=0.0)), ('lasso_fs', SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        prefit=False, threshold=None)), ('sm_smote', SMOTE(k=None, k_neighbors=4, kind='regular', m=None, m_neighbors=10, n_jobs=1,
   out_step=0.5, random_state=32, ratio='auto', svm_estimator=None)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=250, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))], 'sm_smote__m': None, 'rf__min_impurity_split': 1e-07, 'rf__class_weight': None, 'lasso_fs__estimator__n_jobs': -1, 'rf__random_state': 42}

8.2 Plot classifier weights


In [646]:
# Plot feature importance
clf = cls_int.steps[-1][1]
var = cls_int.steps[2][1]
var_cols = cols[var.get_support()]

print cols[var.get_support() == 1]
print len(cols)
print len(var_cols)

fs_cols = var_cols
fs_name = cls_int.steps[3][0]
if fs_name in ["lasso_fs","rfe_fs","combined_fs"]:
    fs = cls.steps[3][1]
    fs_cols = var_cols[fs.get_support()]
    print fs_cols, np.sum(fs.get_support()), len(fs_cols)
    
plt.figure(figsize=(15,5))    
if cls_name == "rf":
    feature_importance = clf.feature_importances_
    feature_importance = (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)[::-1]
    pos = np.arange(sorted_idx.shape[0]) + .5
    
    plt.bar(np.arange(len(feature_importance)), feature_importance[sorted_idx])
    plt.xticks(np.arange(len(feature_importance))+0.4, fs_cols[sorted_idx], rotation=90)
    plt.xlabel('')
else:
    #feature_importance = clf.coef_[0,:]
    weights = clf.coef_
    weights /= weights.max()
    print weights.shape
    
    sorted_idx = np.argsort(weights[::])[::-1]
    
    plt.bar(np.arange(weights.shape[0])+0.3, weights[sorted_idx])
    plt.xticks(np.arange(weights.shape[0])+0.3,fs_cols[sorted_idx], rotation=90)

plt.axhline(0.5, c="red", ls="--")    
plt.title('')
plt.show()


/home/aegle/miniconda2/envs/myosa/lib/python2.7/site-packages/ipykernel/__main__.py:4: VisibleDeprecationWarning: boolean index did not match indexed array along dimension 0; dimension is 71 but corresponding boolean dimension is 70
/home/aegle/miniconda2/envs/myosa/lib/python2.7/site-packages/ipykernel/__main__.py:6: VisibleDeprecationWarning: boolean index did not match indexed array along dimension 0; dimension is 71 but corresponding boolean dimension is 70
['GENDER' 'ACTIVE' 'RETIRED' 'AGE' 'SMOKER' 'SMOKE_PACK_YEAR'
 'SMOKE_EX_YEAR' 'ALCOHOL' 'ALCOHOL_GR_DAY' 'CAFES_DAY' 'DEPRESSION'
 'ANXIETY' 'HTA' 'CARDIOPATHY' 'RESP_DIS' 'DIABETES' 'OBESITY' 'DISLIPEMIA'
 'OBSTR_NASAL_CHRON' 'SNORES' 'REP_SLEEP' 'CRISIS_ASP' 'NOCTURIA'
 'APNEAS_PRES' 'CEFALEA' 'DAY_SOMN' 'AWAKE_NIGHT' 'CONC_DIS'
 'IRR_APAT_DEPRE' 'INSOMNIA' 'ACT_MOTR_NOCT' 'FEEL_REAL_DRE'
 'DREAM_INT_PIROSIS' 'DEC_SEX_DIS' 'IECAS' 'DIURETICS' 'ANTIAGREG'
 'ANTIACID' 'HIPOLIPEM' 'BDZ' 'SLEEP_TIME' 'IAH' 'TIME_SAT<90%' 'IND_DESAT'
 'SIZE' 'BMI' 'CIRC_NECK' 'TAS_MEAN' 'TAD_MEAN' 'SAT_O2' 'PRESS_CPAP'
 'EPWORTH' 'EUROQOL' 'VISUAL' 'TOTAL_USE_HOURS_1' 'MEAN_USE_HOURS_1'
 'AL_IRR_CUT_MASC_1' 'DRY_MOUTH_1' 'TAS_MEAN_1' 'TAD_MEAN_1' 'EPWORTH_1'
 'EUROQOL_1' 'VISUAL_EUROQOL_1' 'MEAN_USE_HOURS_3' 'DRY_MOUTH_3'
 'TAS_MEAN_3' 'TAD_MEAN_3' 'EPWORTH_3' 'EUROQOL_3' 'VISUAL_EUROQOL_3']
71
70
['SNORES' 'CEFALEA' 'DREAM_INT_PIROSIS' 'ANTIAGREG' 'ANTIACID' 'VISUAL'
 'DRY_MOUTH_1' 'MEAN_USE_HOURS_3' 'EPWORTH_3'] 9 9

8.3 Compute feature stability


In [635]:
stab = 100
stab_thr = 0.4
stabArr = dict()
cv_int = cross_validation.StratifiedShuffleSplit(y_train, n_iter=stab, test_size=0.30, random_state=100)
for train_ix, test_ix in cv_int:    
    cls_int.fit(X_train[train_ix],y_train[train_ix])
    
    clf = cls_int.steps[-1][1]
    var = cls_int.steps[2][1]
    var_cols = cols[var.get_support()]

    fs_cols = var_cols
    fs_name = cls_int.steps[3][0]
    if fs_name in ["lasso_fs","rfe_fs","combined_fs"]:
        fs = cls.steps[3][1]
        fs_cols = var_cols[fs.get_support()]
    
    if cls_name == "rf":
        feature_importance = np.abs(clf.feature_importances_)
        weights = (feature_importance / float(feature_importance.max()))
    else:
        weights = np.abs(clf.coef_)
        weights /= float(weights.max())
        weights = weights[0,:]

    for i,w in enumerate(weights):
        if w >= stab_thr:
            col = fs_cols[i]
            if col in stabArr.keys():
                stabArr[col].append(w)
            else:
                stabArr[col] = [w]

dfStab = []        
for j in stabArr.keys():
    k = len(stabArr[j])/100.0
    mean = np.mean(stabArr[j])
    std = np.std(stabArr[j])
    max = np.max(stabArr[j])
    min = np.min(stabArr[j])
    dfStab.append([j, k, mean,std,max,min])

dfStab = pd.DataFrame(dfStab, columns = ["column", "kont", "mean", "std", "max", "min"]) 
dfStab.sort_values(["kont","mean"], ascending = False, inplace=True)


/home/aegle/miniconda2/envs/myosa/lib/python2.7/site-packages/ipykernel/__main__.py:10: VisibleDeprecationWarning: boolean index did not match indexed array along dimension 0; dimension is 71 but corresponding boolean dimension is 70

In [637]:
#Plot stability
plt.figure(figsize=(15,5))
ax = plt.subplot(1,1,1)
ax.bar(np.arange(dfStab.shape[0]), dfStab["kont"], alpha=0.5, label="stability")
ax.plot(np.arange(dfStab.shape[0])+0.3, dfStab["mean"],c="orange", marker='o', label="weights_mean")
ax.fill_between(np.arange(dfStab.shape[0])+0.3, 
                dfStab["mean"] - dfStab["std"],
                dfStab["mean"] + dfStab["std"],
                alpha=0.6, color="orange")
    
plt.xticks(np.arange(dfStab.shape[0])+0.3,dfStab["column"],rotation=90)
plt.ylim(0.0,1.05)
plt.legend()
plt.show()

#Stability table
print dfStab.shape
print dfStab[dfStab["kont"]>=0.4].shape
display(dfStab[dfStab["kont"]>=0.4])
display(dfStab)


(18, 6)
(2, 6)
column kont mean std max min
6 MEAN_USE_HOURS_3 1.00 0.99 0.05 1.00 0.69
5 EPWORTH_3 0.41 0.74 0.18 1.00 0.42
column kont mean std max min
6 MEAN_USE_HOURS_3 1.00 0.99 0.05 1.00 0.69
5 EPWORTH_3 0.41 0.74 0.18 1.00 0.42
9 VISUAL 0.32 0.53 0.12 0.84 0.40
4 CEFALEA 0.16 0.56 0.13 0.85 0.41
7 EPWORTH_1 0.15 0.66 0.16 1.00 0.47
12 IRR_APAT_DEPRE 0.07 0.54 0.10 0.66 0.41
2 CIRC_NECK 0.07 0.49 0.09 0.70 0.41
14 MEAN_USE_HOURS_1 0.04 0.81 0.19 1.00 0.56
1 HTA 0.03 0.53 0.17 0.76 0.41
17 ALCOHOL_GR_DAY 0.02 0.52 0.06 0.59 0.46
0 IAH 0.02 0.46 0.03 0.49 0.43
16 SAT_O2 0.02 0.42 0.01 0.43 0.42
10 TAS_MEAN 0.01 0.61 0.00 0.61 0.61
8 SIZE 0.01 0.55 0.00 0.55 0.55
11 SNORES 0.01 0.49 0.00 0.49 0.49
13 TAS_MEAN_1 0.01 0.47 0.00 0.47 0.47
3 AGE 0.01 0.46 0.00 0.46 0.46
15 SLEEP_TIME 0.01 0.42 0.00 0.42 0.42

8..2 Feature Plots


In [400]:
features = sorted_idx[:5]
names = fs_cols[features]
data = np.hstack((X_train, y_train.reshape(-1,1)))
data = pd.DataFrame(data, columns=cols)

print data.shape
display(data.describe().T)


(29, 71)
count mean std min 25% 50% 75% max
GENERO 29.00 0.76 0.44 0.00 1.00 1.00 1.00 1.00
ACTIVO 29.00 0.55 0.51 0.00 0.00 1.00 1.00 1.00
JUBILADO 29.00 0.38 0.49 0.00 0.00 0.00 1.00 1.00
EDAD 29.00 57.48 12.77 33.00 48.00 57.00 66.00 81.00
FUMA 29.00 0.38 0.49 0.00 0.00 0.00 1.00 1.00
FUMA_PAQ_ANY 29.00 185.79 285.96 0.00 0.00 0.00 365.00 1,095.00
FUMA_EX_ANYS 29.00 2.46 5.13 0.00 0.00 0.00 1.00 20.00
ALCOHOL 29.00 0.48 0.51 0.00 0.00 0.00 1.00 1.00
ALCOHOL_GR_DIA 29.00 4.23 8.42 0.00 0.00 0.00 5.00 40.00
CAFES_DIA 29.00 1.38 1.37 0.00 0.00 1.00 2.00 5.00
DEPRESION 29.00 0.14 0.35 0.00 0.00 0.00 0.00 1.00
ANSIEDAD 29.00 0.17 0.38 0.00 0.00 0.00 0.00 1.00
HTA 29.00 0.62 0.49 0.00 0.00 1.00 1.00 1.00
CARDIOPATIA 29.00 0.28 0.45 0.00 0.00 0.00 1.00 1.00
ENF_RESP 29.00 0.28 0.45 0.00 0.00 0.00 1.00 1.00
DIABETES 29.00 0.31 0.47 0.00 0.00 0.00 1.00 1.00
OBESIDAD 29.00 0.72 0.45 0.00 0.00 1.00 1.00 1.00
DISLIPEMIA 29.00 0.38 0.49 0.00 0.00 0.00 1.00 1.00
OBSTR_NASAL_CRON 29.00 0.45 0.51 0.00 0.00 0.00 1.00 1.00
RONCA 29.00 2.76 0.44 2.00 3.00 3.00 3.00 3.00
SUE_REP 29.00 1.34 1.01 0.00 0.00 2.00 2.00 3.00
CRISIS_ASF 29.00 0.59 0.68 0.00 0.00 0.00 1.00 2.00
NICTURIA 29.00 1.72 1.22 0.00 1.00 2.00 3.00 3.00
APNEAS_PRES 28.00 1.68 1.06 0.00 1.00 2.00 2.25 3.00
CEFALEA 29.00 0.24 0.44 0.00 0.00 0.00 0.00 1.00
SOMN_DIURN 29.00 1.41 1.15 0.00 0.00 1.00 2.00 3.00
DESPERTAR_NOCT 29.00 1.10 1.18 0.00 0.00 1.00 2.00 3.00
TRAST_CONC 29.00 0.41 0.68 0.00 0.00 0.00 1.00 2.00
IRR_APAT_DEPR 29.00 0.45 0.69 0.00 0.00 0.00 1.00 2.00
INSOMNIO 29.00 0.17 0.38 0.00 0.00 0.00 0.00 1.00
... ... ... ... ... ... ... ... ...
IAH 29.00 55.09 20.00 22.90 40.06 56.32 66.00 94.50
TIEMPOSAT<90% 29.00 70.19 83.53 0.90 14.00 30.00 92.55 299.20
IND_DESAT 29.00 38.71 26.62 1.97 15.61 34.60 61.80 84.80
TALLA 29.00 1.67 0.10 1.50 1.60 1.67 1.73 1.92
IMC 29.00 33.81 6.48 23.33 30.00 32.85 36.60 50.22
CIRC_CUELLO 29.00 42.38 4.70 35.00 39.00 42.00 45.00 53.00
MEDIA_TAS 29.00 138.28 17.62 93.50 129.00 139.50 147.50 172.50
MEDIA_TAD 29.00 88.24 15.48 59.00 77.00 89.00 96.00 127.50
Sat O2 29.00 97.24 1.72 93.00 97.00 98.00 98.00 99.00
PRESS_CPAP 29.00 10.88 1.34 8.00 10.00 11.00 12.00 14.00
EPWORTH 29.00 9.76 4.87 0.00 6.00 9.00 14.00 18.00
EUROQOL 29.00 0.77 0.24 0.13 0.65 0.76 1.00 1.00
VISUAL 29.00 61.24 25.93 5.00 40.00 70.00 80.00 100.00
HORAS_USO_TOT_1 26.00 210.81 92.48 10.00 160.75 206.50 248.50 451.00
HORAS_USO_MED_NOCHE_1 26.00 5.34 1.82 0.20 4.62 5.40 7.00 7.50
AL_IRR_CUT_MASC_1 26.00 0.15 0.37 0.00 0.00 0.00 0.00 1.00
BOCA_SECA_1 26.00 0.42 0.50 0.00 0.00 0.00 1.00 1.00
MEDIA_TAS_1 25.00 132.16 16.88 94.00 122.00 133.50 141.00 167.00
MEDIA_TAD_1 25.00 82.12 12.51 49.50 75.50 84.50 89.00 104.00
EPWORTH_1 26.00 6.37 3.78 0.00 4.00 6.00 7.75 15.00
EUROQOL_1 25.00 0.86 0.14 0.49 0.76 0.80 1.00 1.00
VISUAL_EUROQOL_1 25.00 73.20 16.51 40.00 60.00 80.00 80.00 100.00
HORAS_USO_MED_NOCHE_3 28.00 5.18 1.87 0.80 3.90 5.50 6.50 8.50
BOCA_SECA_3 28.00 0.25 0.44 0.00 0.00 0.00 0.25 1.00
MEDIA TAS_3 26.00 133.35 18.09 96.50 120.75 138.00 143.62 162.00
MEDIA TAD_3 27.00 80.81 13.10 49.50 74.50 82.50 87.00 109.00
EPWORTH_3 28.00 6.62 4.16 1.00 4.00 6.00 9.00 20.00
EUROQOL_3 27.00 0.84 0.19 0.22 0.74 0.80 1.00 1.00
VISUAL EUROQOL_3 27.00 70.77 20.25 0.85 70.00 80.00 80.00 100.00
LABEL 29.00 0.59 0.50 0.00 0.00 1.00 1.00 1.00

71 rows × 8 columns


In [401]:
plt.figure(figsize=(15,4))
for i,n in enumerate(names):
    ax = plt.subplot(1,len(names), i+1)
    data.groupby("LABEL")[n].agg(["mean","std"]).plot(kind="bar", ax=ax, yerr="std")
    ax.set_title(n)
    ax.set_xlabel("")
    ax.set_ylim(bottom=0)

plt.show()


9. ROC curve

9.1 ROC Test set

In [410]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

#y_score = cls.decision_function(X_test) #For D0 & D1
y_pred = cls.predict(X_test)
y_prob = cls.predict_proba(X_test)
    
fpr_test, tpr_test, _ = metrics.roc_curve(y_test, y_prob[:,1], pos_label=1)
#fpr_test, tpr_test, _ = metrics.roc_curve(y_test, y_score, pos_label=1)

test_auc_w = metrics.roc_auc_score(y_test, y_prob[:,1], average='weighted')
#test_auc_w = metrics.roc_auc_score(y_test, y_score, average='weighted')

print y_test
print y_pred
print
print test_auc_w
print tpr_test
print fpr_test

plt.figure(figsize=(12,6))

ax = plt.subplot(1,2,1)
ax.plot(fpr_test, tpr_test, color="g",
         label='ROC testSet (AUC-weighted = %0.2f)' % (test_auc_w),lw=2, alpha=.8)

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='random', alpha=.3)
ax.legend(loc='lower right')
ax.set_xlabel("fpr")
ax.set_ylabel("tpr")
ax.set_xlim(-0.05,1.05)
ax.set_ylim(0,1.05)
plt.show()


[0 1 1 1 0 0 0 0 1 0 1 1 1]
[0 1 1 1 1 0 0 0 1 1 1 1 1]

0.952380952381
[ 0.29  0.71  0.71  1.    1.  ]
[ 0.    0.    0.17  0.17  1.  ]
9.2 ROC CV set

In [412]:
tprs = []
fprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 9)

#Compute cv scores
i = 0
print cls.steps[-1]

plt.figure(figsize=(6,6))
for train_ix, test_ix in cv_outer:
    #y_score = cls.fit(X_train[train_ix],y_train[train_ix]).decision_function(X_train[test_ix]) #For D0 & D1
    y_prob = cls.fit(X_train[train_ix],y_train[train_ix]).predict_proba(X_train[test_ix])
    
    # Compute ROC curve and area under the curve
    #fpr, tpr, _ = metrics.roc_curve(y_train[test_ix], y_score, pos_label=1)
    fpr, tpr, _ = metrics.roc_curve(y_train[test_ix], y_prob[:,1], pos_label=1)

    tprs.append(tpr)
    fprs.append(fpr)
    
    #roc_auc = metrics.roc_auc_score(y_train[test_ix], y_score, average='weighted')
    roc_auc = metrics.roc_auc_score(y_train[test_ix], y_prob[:,1], average='weighted')

    
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.1,label="ROC fold {} (AUC-weighted = {})".format(str(i), str(roc_auc)))
    i += 1

#Homogenyze fpr,tpr arrays (adding 0 at beginning and as many 1 as max difference among arrays)    
max = 0
for t in tprs:
    if max < len(t):
        max = len(t)
max += 1

for i in range(len(tprs)):
    tprs[i] = np.insert(tprs[i],0,0)
    fprs[i] = np.insert(fprs[i],0,0)
            
    if len(tprs[i]) < max:
        diff = max - len(tprs[i])        
        for j in range(diff):            
            tprs[i] = np.insert(tprs[i],-1,1)
            fprs[i] = np.insert(fprs[i],-1,1)
            
#Plot auc mean line and shadow mean area
mean_auc = np.mean(aucs)
std_auc = np.std(aucs)

mean_tpr = np.mean(tprs, axis=0)
mean_fpr = np.mean(fprs, axis=0)

print mean_tpr
print mean_fpr

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)


plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

#Plot guess line
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='random', alpha=.8)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()


('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=250, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))
[ 0.    0.2   0.76  0.84  0.94  0.96  0.98  1.  ]
[ 0.    0.    0.03  0.55  0.7   0.93  0.95  1.  ]

10. Learning curve


In [408]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):

    plt.figure(figsize=(8,6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("% Training set")
    plt.ylabel("F1-score")
    train_sizes_lc, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,scoring="f1_weighted")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid(True)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="b",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.axhline(0.5,color='r',ls='--', label="random")
    
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    return plt

In [409]:
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
title = ""
plot_learning_curve(cls, title, X_train, y_train, ylim=(0.4, 1.01), 
                    cv=cv_outer,
                    train_sizes=[0.75,0.8,0.9,1.0], 
                    n_jobs=-1)

plt.show()


/home/aegle/miniconda2/envs/myosa/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [ ]: