In the earlier hyperparameter optimization (Daisy Features notebook), we used the full set of + labels. Is this causing over-fitting during hyperoptimization?

Conclusion

Splitting the + labels in the validation data into independent validate/test samples converges to the same set of hyperparameters, and to almost the same ROC curve. So the poor performance I'm currently seeing in "Full Classifier" is probably due to sub-optimal training data and/or hyperparameters.

I will investigate this in Cross Validation Testing 2.


In [11]:
%pylab

import json
import random
import itertools
import os
import sys

import numpy as np
from sklearn.metrics import auc_score
from sklearn.utils import resample

from bubbly.wiserf import WiseRF
from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
from bubbly.util import roc_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.hyperopt import fmin, rf_space, auc_below_fpos

# location of the local WiseRF install
os.environ['WISERF_ROOT'] = '/Users/beaumont/WiseRF-1.5.9-macosx-x86_64-rc1'


Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.

In [12]:
ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)  # shape of each extracted cutout

In [16]:
# precomputed feature vectors and labels for the training and validation sets
data = np.load('training_data.npz')
xtrain = data['xtrain']
xvalidate = data['xvalidate']
yvalidate = data['yvalidate']
ytrain = data['ytrain']

In [19]:
# Split the + labels: the first 80 positives go to the test set, the
# rest to the new validation set. The - labels are shared by both sets.
xtest = np.vstack((xvalidate[yvalidate == 0], xvalidate[yvalidate == 1][:80]))
ytest = np.hstack((yvalidate[yvalidate == 0], yvalidate[yvalidate == 1][:80]))

xval = np.vstack((xvalidate[yvalidate == 0], xvalidate[yvalidate == 1][80:]))
yval = np.hstack((yvalidate[yvalidate == 0], yvalidate[yvalidate == 1][80:]))
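
Note that only the + labels are partitioned; both sets intentionally share the full pool of - labels. A quick sanity check that each positive example lands in exactly one set:

# the positives in val and test should partition the original positives
assert (ytest == 1).sum() + (yval == 1).sum() == (yvalidate == 1).sum()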

In [21]:
del xvalidate
del yvalidate
del data

In [23]:
def rf_objective(**params):
    clf = WiseRF(**params)
    clf.fit(xtrain, ytrain)

    # minimize the negative partial AUC at false-positive rates below 0.05%
    df = clf.decision_function(xval).ravel()
    return -auc_below_fpos(yval, df, .0005), clf
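
For reference, auc_below_fpos presumably integrates the ROC curve only over false-positive rates below the cutoff (here 0.05%), so the objective rewards classifiers that recover positives while making essentially no false detections. A minimal sketch of that idea (the real implementation lives in bubbly.hyperopt and may differ in detail):

from sklearn.metrics import roc_curve as sklearn_roc_curve

def partial_auc(y, scores, max_fpos):
    # ROC curve restricted to false-positive rates <= max_fpos,
    # integrated with the trapezoid rule
    fpr, tpr, _ = sklearn_roc_curve(y, scores)
    keep = fpr <= max_fpos
    return np.trapz(tpr[keep], fpr[keep])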

In [24]:
for best, best_params, clf in fmin(rf_objective, rf_space):
    print best, best_params
    sys.stdout.flush()

    # compare the ROC curves on the validation and independent test sets
    roc_curve(yval, clf.decision_function(xval), label='val', lw=3)
    roc_curve(ytest, clf.decision_function(xtest), label='test', lw=3)
    xlim(0, .002)
    legend(loc='lower right')
    show()


-0.000385714285714 {'max_features': 'auto', 'min_samples_split': 1, 'n_jobs': 2, 'criterion': 'gini', 'n_estimators': 400}
-0.000387662337662 {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'gini', 'n_estimators': 400}
-0.000418181818182 {'max_features': 'auto', 'min_samples_split': 1, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 800}
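
Since the conclusion above rests on the val and test ROC curves being almost the same, it would be worth quantifying how much of the residual gap is sampling noise. The resample import above could drive a quick bootstrap of the test-set score. A sketch, assuming auc_below_fpos accepts arbitrary label/score arrays:

# bootstrap the partial AUC on the test set to estimate its scatter
df_test = clf.decision_function(xtest).ravel()
scores = []
for _ in range(200):
    idx = resample(np.arange(ytest.size))
    scores.append(auc_below_fpos(ytest[idx], df_test[idx], .0005))
print np.mean(scores), np.std(scores)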