In the earlier hyperparameter optimization (Daisy Features notebook), we used the full set of + labels. Is this causing over-fitting during hyperoptimization?
Splitting the + labels in the validation data into independent validate/test samples converges to the same set of hyperparameters, and almost the same ROC curve. So the poor performance I'm currently seeing in "Full Classifier" is probably due to sub-optimal training data and/or hyperparameters.
I will investigate this in Cross Validation Testing 2.
In [11]:
%pylab
import json
import random
import itertools
import os
import sys
import numpy as np
from sklearn.metrics import auc_score
from sklearn.utils import resample
from bubbly.wiserf import WiseRF
from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
from bubbly.util import roc_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.hyperopt import fmin, rf_space, auc_below_fpos
# Provide a fallback WISERF_ROOT (presumably read by bubbly.wiserf to locate
# the WiseRF install -- confirm). setdefault keeps a value that is already
# exported in the environment, so this machine-specific path only applies
# when nothing else has been configured.
os.environ.setdefault('WISERF_ROOT', '/Users/beaumont/WiseRF-1.5.9-macosx-x86_64-rc1')
In [12]:
# Build the feature extractor: a MultiViewExtractor wrapping a
# ManyManyExtractors instance. `ex` is not referenced again in the cells
# visible here.
ex = MultiViewExtractor(ManyManyExtractors())
# NOTE(review): shp is presumably the (rows, cols) size of the image cutouts
# handed to the extractor -- confirm in bubbly.extractors.
ex.shp = (60, 60)
In [16]:
# Read the cached arrays from disk. The x* arrays are passed to the
# classifier as inputs and the y* arrays as labels further down; the
# "train" pair fits the model and the "validate" pair is split below.
data = np.load('training_data.npz')
xtrain, ytrain = data['xtrain'], data['ytrain']
xvalidate, yvalidate = data['xvalidate'], data['yvalidate']
In [19]:
def _split_positives(x, y, n_test):
    """Split labelled rows into a test set and a validation set.

    Every row with y == 0 is placed in BOTH sets. Rows with y == 1 are
    divided by position: the first n_test go to the test set and the
    remainder go to the validation set.

    Returns (x_test, y_test, x_val, y_val).
    """
    is_neg = y == 0
    is_pos = y == 1
    x_neg, y_neg = x[is_neg], y[is_neg]
    x_pos, y_pos = x[is_pos], y[is_pos]

    x_test = np.vstack((x_neg, x_pos[:n_test]))
    y_test = np.hstack((y_neg, y_pos[:n_test]))
    x_val = np.vstack((x_neg, x_pos[n_test:]))
    y_val = np.hstack((y_neg, y_pos[n_test:]))
    return x_test, y_test, x_val, y_val


# Only the positive (label 1) examples are made independent between the two
# sets; the label-0 examples are shared. The first 80 positives form the
# test set and the rest form the validation set used during the search.
xtest, ytest, xval, yval = _split_positives(xvalidate, yvalidate, 80)
In [21]:
# Drop the names of the unsplit validation arrays and the loaded archive now
# that xtest/ytest/xval/yval have been built from them.
del xvalidate, yvalidate, data
In [23]:
def rf_objective(**params):
    """Objective function handed to fmin for the hyperparameter search.

    Builds a WiseRF classifier from the keyword arguments, fits it on the
    module-level (xtrain, ytrain), and scores its decision function on the
    module-level (xval, yval) with auc_below_fpos at a threshold of .0005.

    Returns a (negated score, fitted classifier) tuple. The sign flip
    presumably turns the score into a loss for fmin to minimize -- confirm
    against bubbly.hyperopt.
    """
    forest = WiseRF(**params)
    forest.fit(xtrain, ytrain)
    val_scores = forest.decision_function(xval).ravel()
    score = auc_below_fpos(yval, val_scores, .0005)
    return -score, forest
In [24]:
# Drive the hyperparameter search. Each item produced by
# fmin(rf_objective, rf_space) is unpacked as (best, best_params, clf); the
# first two are printed and stdout is flushed so progress shows up promptly.
# NOTE(review): indentation was lost in this export, so it is unclear whether
# the plotting calls below run on every iteration or once after the loop
# finishes with the last clf -- confirm against the original notebook.
for best, best_params, clf in fmin(rf_objective, rf_space):
print best, best_params
sys.stdout.flush()
# Draw ROC curves for the validation split (used by the objective) and the
# test split (positives never seen by the objective), from the same clf.
roc_curve(yval, clf.decision_function(xval), label='val', lw=3)
roc_curve(ytest, clf.decision_function(xtest), label='test', lw=3)
# Restrict the x axis to the range 0 to .002. xlim/legend/show are presumably
# the matplotlib functions injected by %pylab -- confirm.
xlim(0, .002)
legend(loc='lower right')
show()