This notebook trains a random forest on the benchmark training data (itself a mix of random positions and false positives from a previous, sub-optimal classifier). The much-improved classifier is then used to create a bootstrapped sample of 20,000 negative examples, uniformly distributed in difficulty.
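The difficulty-uniform bootstrap itself happens outside this notebook. As a rough sketch of the idea (the helper below and its binning scheme are assumptions for illustration, not bubbly's actual code), it amounts to scoring a large pool of candidate negatives with the classifier, then resampling with replacement so that each difficulty bin is equally represented:

import numpy as np

def difficulty_uniform_sample(scores, n_samples=20000, n_bins=50, rng=np.random):
    # Hypothetical sketch: `scores` are classifier decision values for a
    # large pool of candidate negatives. Bin the pool by score, then draw
    # (with replacement) an equal number of examples from each bin, so the
    # final sample is roughly uniform in difficulty.
    edges = np.linspace(scores.min(), scores.max(), n_bins + 1)
    which = np.digitize(scores, edges[1:-1])  # bin index (0..n_bins-1)
    per_bin = n_samples // n_bins
    picks = []
    for b in range(n_bins):
        members = np.where(which == b)[0]
        if len(members) > 0:
            picks.append(rng.choice(members, size=per_bin, replace=True))
    return np.concatenate(picks)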
Before running all the cells, be aware that some of them involve time-consuming calculations, both locally and on PiCloud.
In [1]:
%pylab
import json
import random
import itertools
import os
import sys

import numpy as np
from sklearn.metrics import auc_score
from sklearn.utils import resample

# point WiseRF at its installation directory before importing it
os.environ['WISERF_ROOT'] = '/Users/beaumont/WiseRF-1.5.9-macosx-x86_64-rc1'
from bubbly.wiserf import WiseRF

from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
from bubbly.util import roc_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.hyperopt import fmin, rf_space, auc_below_fpos
In [2]:
data = json.load(open('../models/benchmark_training_data.json'))

lg = WideLocationGenerator(mod3=1)
data['pos'] = filter(lambda x: lg.valid_longitude(x[0]),
                     highest_quality_on_params())

# keep a deterministic ordering
for k, v in data.items():
    data[k] = sorted(v)

ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)
In [3]:
def _xy(ex, on, off):
    # build a feature matrix and label vector from positive and
    # negative examples
    x = np.vstack([ex.extract(*o).ravel().astype(np.float32)
                   for o in on + off])
    x = np.nan_to_num(x)
    y = np.hstack((np.ones(len(on), dtype=np.int),
                   np.zeros(len(off), dtype=np.int)))
    return x, y

# balance the classes by subsampling the negatives
npos = len(data['pos'])
xtrain, ytrain = _xy(ex, data['pos'], random.sample(data['neg'], npos))
Warning: building the validation set takes a while (~1 hour).
In [ ]:
xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])
In [4]:
def rf_objective(**params):
    clf = WiseRF(**params)
    clf.fit(xtrain, ytrain)
    df = clf.decision_function(xvalidate).ravel()
    # fmin minimizes, so return the negative of the partial AUC
    # (the area under the ROC curve at false positive rates below 5e-4)
    return -auc_below_fpos(yvalidate, df, .0005), clf
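auc_below_fpos comes from bubbly.hyperopt. As a rough sketch of the metric (an assumed re-implementation, not the actual bubbly source), the partial AUC integrates the ROC curve only up to the false-positive cutoff, then normalizes so a perfect classifier scores 1.0:

import numpy as np
from sklearn.metrics import roc_curve as sk_roc_curve

def partial_auc(y_true, scores, max_fpos):
    # integrate the ROC curve from fpr = 0 up to fpr = max_fpos
    # (ignoring the partial trapezoid at the cutoff), normalized so
    # tpr = 1 everywhere on this range gives a score of 1.0
    fpr, tpr, _ = sk_roc_curve(y_true, scores)
    keep = fpr <= max_fpos
    return np.trapz(tpr[keep], fpr[keep]) / max_fpos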
In [7]:
# this will loop until interrupted; each iteration reports the best
# result found so far
for best, best_params, clf in fmin(rf_objective, rf_space):
    print best, best_params
    sys.stdout.flush()

    roc_curve(yvalidate, clf.decision_function(xvalidate), label='val', lw=3)
    roc_curve(ytrain, clf.decision_function(xtrain), label='train', lw=3)
    xlim(0, .002)
    legend(loc='lower right')
    show()
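The fmin/rf_space pair from bubbly.hyperopt is used here as a black box. A minimal sketch of the same pattern (assumed for illustration, not bubbly's implementation) is a random search that samples hyperparameters from a space and yields whenever the objective improves:

import random

# hypothetical stand-in for bubbly.hyperopt.rf_space
example_space = {'n_estimators': [200, 400, 800],
                 'min_samples_split': [1, 2, 4, 8],
                 'criterion': ['gini', 'infogain'],
                 'max_features': ['auto'],
                 'n_jobs': [2]}

def random_search(objective, space):
    # sample forever; yield (score, params, model) whenever score improves
    best = None
    while True:
        params = dict((k, random.choice(v)) for k, v in space.items())
        score, model = objective(**params)
        if best is None or score < best:
            best = score
            yield best, params, model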
In [8]:
best_params = {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2,
               'criterion': 'infogain', 'n_estimators': 800}
clf = WiseRF(**best_params).fit(xtrain, ytrain)
In [9]:
df = clf.decision_function(xvalidate).ravel()
In [12]:
from skimage.util.montage import montage2d

def montage(arrs):
    print "montaging %i images" % len(arrs)
    print 'image dim:', arrs[0].shape

    # montage each color channel separately, then restack into RGB
    r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs]))
                    for i in range(3))
    return np.dstack((r, g, b)).astype(np.uint8)
In [14]:
from bubbly.extractors import RGBExtractor

rgb = RGBExtractor()
rgb.shp = (100, 100)

def _ex(*params):
    # extract two views of each location: the native view, and a
    # second, wider view with the last parameter scaled by 2.5x
    p = list(params)
    r1 = rgb.extract(*p)
    p[-1] *= 2.5
    r2 = rgb.extract(*p)
    return np.hstack((r1, r2))
In [11]:
# the lowest-scoring positives and highest-scoring negatives are the
# examples the classifier finds hardest
on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]

figure(figsize=(15, 10))
im = montage([_ex(*data['cv_pos'][i]) for i in on_ind[:25]])
imshow(im, origin='upper')
title("Hard Positives")
show()

figure(figsize=(15, 10))
im = montage([_ex(*data['cv_neg'][i]) for i in off_ind[:16]])
imshow(im, origin='upper')
title("Hard Negatives")
show()