Generating a Bootstrapped Training Dataset

This notebook trains a random forest using the benchmark training data (which is itself a combination of random positions and false positives from a previous, sub-optimal classifier).

This new classifier (which is very good) is then used to create a bootstrapped sample of 20,000 negative examples, uniformly distributed in difficulty.

Warning

Before you run all the cells: some of the cells in this notebook involve time-consuming calculations, both locally and on PiCloud.


In [1]:
%pylab

import json
import random
import itertools
import os
import sys

import numpy as np
from sklearn.metrics import auc_score
from sklearn.utils import resample

from bubbly.wiserf import WiseRF
from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
from bubbly.util import roc_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.hyperopt import fmin, rf_space, auc_below_fpos

os.environ['WISERF_ROOT'] = '/Users/beaumont/WiseRF-1.5.9-macosx-x86_64-rc1'


Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.

Load training data, build feature vectors


In [2]:
data = json.load(open('../models/benchmark_training_data.json'))

# Use the highest-quality positive examples that fall in this location generator's longitude range
lg = WideLocationGenerator(mod3=1)
data['pos'] = filter(lambda x: lg.valid_longitude(x[0]), highest_quality_on_params())

# Sort each list of example parameters for reproducibility
for k, v in data.items():
    data[k] = sorted(v)

ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)

In [3]:
def _xy(ex, on, off):
    # Build the feature matrix and label vector for positive (on) and negative (off) examples
    x = np.vstack([ex.extract(*o).ravel().astype(np.float32) for o in on + off])
    x = np.nan_to_num(x)
    y = np.hstack((np.ones(len(on), dtype=np.int), np.zeros(len(off), dtype=np.int)))
    return x, y

# Balance the classes: sample as many negatives as there are positives
npos = len(data['pos'])
xtrain, ytrain = _xy(ex, data['pos'], random.sample(data['neg'], npos))

Train and optimize hyper-parameters

Warning: building the validation set takes a while (~1 hour).


In [ ]:
xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])

In [4]:
def rf_objective(**params):
    # Fit a forest with the given hyper-parameters, and score it on the
    # validation set by the (negated) AUC restricted to low false-positive rates
    clf = WiseRF(**params)
    clf.fit(xtrain, ytrain)

    df = clf.decision_function(xvalidate).ravel()
    return -auc_below_fpos(yvalidate, df, .0005), clf

In [7]:
#this will loop until interrupted

for best, best_params, clf in fmin(rf_objective, rf_space):
    print best, best_params
    sys.stdout.flush()

    roc_curve(yvalidate, clf.decision_function(xvalidate), label='val', lw=3)
    roc_curve(ytrain, clf.decision_function(xtrain), label='train', lw=3)
    xlim(0, .002)
    legend(loc='lower right')
    show()


-0.000287898089172 {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'gini', 'n_estimators': 200}
-0.000393630573248 {'max_features': 'auto', 'min_samples_split': 2, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 200}

Final Fit


In [8]:
best_params = {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 800}
clf = WiseRF(**best_params).fit(xtrain, ytrain)

Visualize Hard Examples

Here we make montages of the positive and negative examples from the validation set that the classifier performs worst on.


In [9]:
df = clf.decision_function(xvalidate).ravel()

In [12]:
from skimage.util.montage import montage2d

def montage(arrs):
    # Tile a list of RGB images into a single montage, one color channel at a time
    print "montaging %i images" % len(arrs)
    print 'image dim:', arrs[0].shape
    r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs]))
                    for i in range(3))
    return np.dstack((r, g, b)).astype(np.uint8)

In [14]:
from bubbly.extractors import RGBExtractor

def _ex(*params):
    # Extract an RGB postage stamp at the given location, next to a second
    # stamp with the size scaled up 2.5x for context
    p = list(params)
    r1 = rgb.extract(*p)
    p[-1] *= 2.5
    r2 = rgb.extract(*p)
    return np.hstack((r1, r2))

rgb = RGBExtractor()
rgb.shp = (100, 100)

In [11]:
# Hardest positives have the lowest scores; hardest negatives have the highest
on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]

figure(figsize=(15, 10))
im = montage([_ex(*data['cv_pos'][i]) for i in on_ind[:25]])
imshow(im, origin='upper')
title("Hard Positives")
show()

figure(figsize=(15, 10))
im = montage([_ex(*data['cv_neg'][i]) for i in off_ind[:16]])

imshow(im, origin='upper')
title("Hard Negatives")
show()


montaging 25 images
image dim: (100, 200, 3)
montaging 16 images
image dim: (100, 200, 3)

In [16]:
import cPickle
with open('good_classifier.pkl', 'w') as outfile:
    cPickle.dump(clf, outfile)

In [18]:
np.savez_compressed('training_data.npz', xtrain=xtrain, ytrain=ytrain, xvalidate=xvalidate, yvalidate=yvalidate)

Bootstrapped Training Data

We'll use the classifier we just made to classify 200,000 nominally negative examples, from which we can build the final classifiers.


In [2]:
import cPickle as pickle
clf = pickle.load(open('good_classifier.pkl'))

In [3]:
def getoff(loc):
    # Draw 100,000 candidate negative locations from a location generator
    return list(itertools.islice(loc.negatives_iterator(), 0, 100000))

loc1 = WideLocationGenerator(0)
loc2 = WideLocationGenerator(1)
offs = getoff(loc1) + getoff(loc2)

In [10]:
from bubbly.model import Model
ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)
lg = WideLocationGenerator(mod3=1)

offs = sorted(offs)  # minimize IO
m = Model(ex, lg, clf)
offs_df = m.cloud_decision_function(offs, workers=100)  # score the negatives on PiCloud

It looks like we need to ignore the 50 examples with the highest decision function values, since these are mostly bubbles that are not in DR1.

The two mosaics below show the 98 regions with the highest decision function values.


In [15]:
ind = np.argsort(offs_df)[::-1]
figure(figsize=(15, 10))
imshow(montage([_ex(*offs[i]) for i in ind[:49]]), origin='upper')
show()
figure(figsize=(15, 10))
imshow(montage([_ex(*offs[i]) for i in ind[49:49*2]]), origin='upper')


montaging 49 images
image dim: (100, 200, 3)
montaging 49 images
image dim: (100, 200, 3)
Out[15]:
<matplotlib.image.AxesImage at 0x10d9abb50>

The proposed cutoff


In [25]:
cutoff = offs_df[ind[100]]  # score of the 101st highest-scoring negative
hist(offs_df, histtype='step', bins=50)
axvline(cutoff, color='k')


Out[25]:
<matplotlib.lines.Line2D at 0x10cdcb590>

In [46]:
ind = np.argsort(np.array(offs)[:, 1])
bootstrap_offs = {'off_params' : sorted(offs),
                  'off_score' : offs_df[ind].tolist(),
                  'on_params' : sorted(list(highest_quality_on_params()))}
with open('../models/bootstrapped_labels.json', 'w') as outfile:
    json.dump(bootstrap_offs, outfile)
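
Sketch: drawing the difficulty-uniform sample

The sample of 20,000 negative examples uniformly distributed in difficulty, mentioned at the top of this notebook, is not drawn here. The unexecuted cell below is a minimal sketch of one way it could be done from the scores saved above; the equal-width binning, the 20 bins, and the fixed random seed are illustrative assumptions, not choices made in this notebook.


In [ ]:
# Sketch only: draw negatives roughly uniform in difficulty (decision function value),
# reusing the json and np imports from the top of the notebook
labels = json.load(open('../models/bootstrapped_labels.json'))
params = labels['off_params']
scores = np.array(labels['off_score'])

nbins, ntarget = 20, 20000
edges = np.linspace(scores.min(), scores.max(), nbins + 1)
which_bin = np.digitize(scores, edges[1:-1])  # bin index in [0, nbins)

rng = np.random.RandomState(0)
picks = []
for b in range(nbins):
    members = np.where(which_bin == b)[0]
    if len(members) == 0:
        continue
    k = min(len(members), ntarget // nbins)
    picks.extend(rng.choice(members, k, replace=False).tolist())

bootstrap_neg = [params[i] for i in picks]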