Generating a Bootstrapped Training Dataset

This notebook trains a random forest using the benchmark training data (itself a combination of random positions and false positives from a previous, sub-optimal classifier).

This new classifier (which is very good) is then used to create a bootstrapped sample of 20,000 negative examples, uniformly distributed in difficulty.


Before you run all the cells: some of the cells in this notebook involve time-consuming calculations, both locally and on PiCloud.

In [1]:

import json
import random
import itertools
import os
import sys

import numpy as np
from sklearn.metrics import auc_score
from sklearn.utils import resample

from bubbly.wiserf import WiseRF
from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
from bubbly.util import roc_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.hyperopt import fmin, rf_space, auc_below_fpos

# Point at the WiseRF installation (presumably read by bubbly.wiserf -- confirm).
# setdefault keeps any WISERF_ROOT already exported in the environment, so the
# notebook is not tied to one machine; the path below is only the fallback
# used when the variable is unset.
os.environ.setdefault('WISERF_ROOT', '/Users/beaumont/WiseRF-1.5.9-macosx-x86_64-rc1')

Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.
/Users/beaumont/Library/Python/2.7/lib/python/site-packages/pytz/ UserWarning: Module argparse was already imported from /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/argparse.pyc, but /Users/beaumont/Library/Python/2.7/lib/python/site-packages is being added to sys.path
  from pkg_resources import resource_stream

Load training data, build feature vectors

In [2]:
# Parse the benchmark training data; the context manager closes the file
# handle as soon as the JSON has been read.
with open('../models/benchmark_training_data.json') as infile:
    data = json.load(infile)

# Positive examples: entries returned by highest_quality_on_params() whose
# first element (a longitude) is accepted by the mod3=1 location generator.
lg = WideLocationGenerator(mod3=1)
data['pos'] = [p for p in highest_quality_on_params() if lg.valid_longitude(p[0])]

# Sort every example list so that index-based lookups further down are
# deterministic. Only existing keys are reassigned, so iterating is safe.
for k, v in data.items():
    data[k] = sorted(v)

# Feature extractor shared by the training and validation sets.
# NOTE(review): shp presumably sets the extracted image shape -- confirm.
ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)

In [3]:
def _xy(ex, on, off):
    x = np.vstack(ex.extract(*o).ravel().astype(np.float32) for o in on + off)
    x = np.nan_to_num(x)
    y = np.hstack((np.ones(len(on),, np.zeros(len(off),
    return x, y

# Balance the classes: draw as many negatives (without replacement) as there
# are positives, then extract features for both groups.
npos = len(data['pos'])
neg_sample = random.sample(data['neg'], npos)
xtrain, ytrain = _xy(ex, data['pos'], neg_sample)

Train and optimize hyper-parameters

**Warning:** Building the validation set takes a while (~1 hour).

In [ ]:
# Feature matrix and labels for the cross-validation examples, built with the
# same extractor as the training set (positives first, then negatives).
xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])

In [4]:
def rf_objective(**params):
    """Fit a WiseRF on the training set and score it on the validation set.

    Parameters
    ----------
    **params
        Keyword arguments forwarded unchanged to the ``WiseRF`` constructor.

    Returns
    -------
    (score, clf) : tuple
        ``score`` is the negated ``auc_below_fpos`` of the validation decision
        scores at a false-positive threshold of .0005; ``clf`` is the fitted
        classifier. The negation presumably lets ``fmin`` maximise the AUC
        by minimising the score -- confirm against bubbly.hyperopt.
    """
    # Reads the module-level xtrain/ytrain and xvalidate/yvalidate arrays.
    clf = WiseRF(**params).fit(xtrain, ytrain)

    df = clf.decision_function(xvalidate).ravel()
    return -auc_below_fpos(yvalidate, df, .0005), clf

In [7]:
# This loop runs until interrupted: each iteration of fmin yields a score,
# the parameters that produced it, and the corresponding fitted classifier.

for best, best_params, clf in fmin(rf_objective, rf_space):
    # Single-argument print() produces identical output on Python 2 and 3.
    print("%s %s" % (best, best_params))

    # Overlay validation and training ROC curves, zoomed to the
    # low false-positive end of the x axis.
    roc_curve(yvalidate, clf.decision_function(xvalidate), label='val', lw=3)
    roc_curve(ytrain, clf.decision_function(xtrain), label='train', lw=3)
    xlim(0, .002)
    legend(loc='lower right')

-0.000287898089172 {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'gini', 'n_estimators': 200}
-0.000393630573248 {'max_features': 'auto', 'min_samples_split': 2, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 200}

Final Fit

In [8]:
# Hard-coded hyper-parameters for the final model, trained on the full
# (balanced) training set.
best_params = {
    'criterion': 'infogain',
    'max_features': 'auto',
    'min_samples_split': 4,
    'n_estimators': 800,
    'n_jobs': 2,
}
clf = WiseRF(**best_params).fit(xtrain, ytrain)

Visualize Hard Examples

Here we make a montage of the positive (+) and negative (-) examples from the validation set on which the classifier performs worst.

In [9]:
# Flattened decision scores of the final classifier on the validation set.
# Note: `df` is a 1-D score vector here, not a DataFrame.
df = clf.decision_function(xvalidate).ravel()

In [12]:
from skimage.util.montage import montage2d

def montage(arrs):
    """Tile a list of RGB images into a single RGB mosaic.

    Parameters
    ----------
    arrs : list of ndarray
        Non-empty list of images indexed as ``[row, col, channel]`` with at
        least three channels. The images are assumed to share one shape so
        that each channel can be stacked into a single array.

    Returns
    -------
    ndarray of uint8
        The three per-channel ``montage2d`` mosaics stacked along a new
        last axis.
    """
    # Single-argument print() keeps the output identical on Python 2 and 3.
    print("montaging %i images" % len(arrs))
    print('image dim: %s' % (arrs[0].shape,))
    # Build one mosaic per colour channel, then recombine them.
    r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs]))
                    for i in range(3))
    return np.dstack((r, g, b)).astype(np.uint8)

In [14]:
from bubbly.extractors import RGBExtractor

def _ex(*params):
    p = list(params)
    r1 = rgb.extract(*p)
    p[-1] *= 2.5
    r2 = rgb.extract(*p)
    return np.hstack((r1, r2))

# Module-level extractor that _ex reads when it is called.
# NOTE(review): shp presumably sets the size of each extracted image; the
# montage output reports (100, 200, 3) for two images side by side -- confirm.
rgb = RGBExtractor()
rgb.shp = (100, 100)

In [11]:
# Rank validation examples by how badly they are scored: positives with the
# lowest decision scores first, negatives with the highest scores first.
# Indices are relative to each class subset, which lines up with
# data['cv_pos'] / data['cv_neg'] because _xy stacks positives then negatives.
on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]

# One figure per class: (title, examples, ranking, number of images shown).
for label, examples, order, count in (
        ("Hard Positives", data['cv_pos'], on_ind, 25),
        ("Hard Negatives", data['cv_neg'], off_ind, 16)):
    figure(figsize=(15, 10))
    im = montage([_ex(*examples[i]) for i in order[:count]])
    imshow(im, origin='upper')
    title(label)

montaging 25 images
image dim: (100, 200, 3)
montaging 16 images
image dim: (100, 200, 3)