Tomorrow: Add a third zoom in. Improve again? (Yes! 70%@0, 90% @ .002

Expand + examples to include 2 mod 3 fields. Double - examples accordingly

Best Zoom2Extractor with Daisy: ROC = 'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 800


In [1]:
%pylab

import json
import random
import itertools

import numpy as np
from scipy import stats
from sklearn.metrics import auc_score
from sklearn.ensemble import GradientBoostingClassifier

from bubbly.model import Model
import bubbly.extractors as ext
from bubbly.util import rfp_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.hyperopt import rf_space

from skimage.feature import daisy


Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.
/Users/beaumont/Library/Python/2.7/lib/python/site-packages/scikits/__init__.py:1: UserWarning: Module argparse was already imported from /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/argparse.pyc, but /Users/beaumont/Library/Python/2.7/lib/python/site-packages is being added to sys.path
  __import__('pkg_resources').declare_namespace(__name__)

In [22]:
class DaisyExtractor(ext.Extractor):    
    
    def _extract_rgb(self, rgb):
        kwargs = dict(step=rgb.shape[0]/5, radius=rgb.shape[0] / 10, rings=2,
                      histograms=6, orientations=8)
        return np.hstack(daisy(rgb[:, :, i], **kwargs).ravel() for i in [0, 1])
        
class DaisySuper(ext.CompositeExtractor):
    composite_classes = [ext.RingExtractor, ext.MultiWaveletExtractor, ext.CompressionExtractor,
                         DaisyExtractor]


class ZoomedExtractor(ext.Extractor):
    def __init__(self, orig):
        self.orig = orig
        
    def extract(self, lon, l, b, r):
        return np.hstack((self.orig.extract(lon, l, b, r), 
                          self.orig.extract(lon, l, b, r * 2)))
    
class Zoom2Extractor(ext.Extractor):
    def __init__(self, orig):
        self.orig = orig
        
    def extract(self, lon, l, b, r):
        return np.hstack((self.orig.extract(lon, l, b, r), 
                          self.orig.extract(lon, l, b, r / 2),
                          self.orig.extract(lon, l, b, r * 2),
                          self.orig.extract(lon, l, b + r / 2, r)))

In [244]:
data = json.load(open('../models/benchmark_training_data.json'))

lg = WideLocationGenerator(mod3 = 1)
data['pos'] = filter(lambda x: lg._longitude_check(x[0]), highest_quality_on_params())

for k, v in data.items():
    data[k] = sorted(v)
    
ex = Zoom2Extractor(DaisySuper())
ex.shp = (60, 60)

In [245]:
from sklearn.utils import resample

def _xy(ex, on, off):
    x = np.vstack(ex.extract(*o).ravel().astype(np.float32) for o in on + off)
    x = np.nan_to_num(x)
    y = np.hstack((np.ones(len(on), dtype=np.int), np.zeros(len(off), dtype=np.int)))
    return x, y

npos = len(data['pos'])
xtrain, ytrain = _xy(ex, data['pos'], random.sample(data['neg'], npos))
xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])

In [252]:
from PyWiseRF import WiseRF
import os
os.environ['WISERF_ROOT'] = '/Users/beaumont/WiseRF-1.5.9-macosx-x86_64-rc1'

from sklearn.metrics import auc_score, auc
import sklearn.metrics as met


def decision_function(self, x):
    p = self.predict_proba(x)
    return p[:, 1] - p[:, 0]

WiseRF.decision_function = decision_function

def auc_below_fpos(y, yp, fpos):
    fp, tp, th = met.roc_curve(y, yp)
    good = (fp <= fpos)
    return auc(fp[good], tp[good])


class Choice(object):
    def __init__(self, *choices):
        self._choices = choices
        
    def rvs(self):
        return random.choice(self._choices)
    
class Space(object):
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        
    def __iter__(self):
        while True:
            yield {k: v.rvs() for k, v in self._hyperparams.items()}

            
gb_space = Space(learning_rate = stats.uniform(1e-3, 1 - 1.01e-3),
                 n_estimators = Choice(50, 100, 200),
                 max_depth = Choice(1, 2, 3),
                 subsample = stats.uniform(1e-3, 1 - 1.01e-3))

rf_space = Space(n_estimators = Choice(200, 400, 800, 1600),
                 min_samples_split = Choice(1, 2, 4),                 
                 criterion = Choice('gini', 'gainratio', 'infogain'),
                 max_features = Choice('auto'),
                 n_jobs = Choice(2))
                 
                 
def gb_objective(**params):
    clf = GradientBoostingClassifier(**params)
    clf.fit(xtrain, ytrain)
    
    df = clf.decision_function(xvalidate).ravel()
    return (df[yvalidate == 1] <= df[yvalidate == 0].max()).mean(), clf

def rf_objective(**params):
    clf = WiseRF(**params)
    clf.fit(xtrain, ytrain)

    df = clf.decision_function(xvalidate).ravel()
    return -auc_below_fpos(yvalidate, df, .0005), clf
    
    
def fmin(objective, space, threshold=np.inf):
    best = threshold
    
    try:        
        for p in space:
            f, clf = objective(**p)
            if f < best:
                best = f
                yield best, p, clf
    except KeyboardInterrupt:
        pass

In [253]:
import sys
from bubbly.util import roc_curve

for best, best_params, clf in fmin(rf_objective, rf_space, threshold):
    print best, best_params
    sys.stdout.flush()

    roc_curve(yvalidate, clf.decision_function(xvalidate), label='val')
    roc_curve(ytrain, clf.decision_function(xtrain), label='train')
    xlim(0, .002)
    legend(loc='lower right')
    show()


-0.000282484076433 {'max_features': 'auto', 'min_samples_split': 1, 'n_jobs': 2, 'criterion': 'gainratio', 'n_estimators': 200}
-0.000380573248408 {'max_features': 'auto', 'min_samples_split': 2, 'n_jobs': 2, 'criterion': 'gini', 'n_estimators': 1600}
-0.000385987261146 {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'gini', 'n_estimators': 1600}

In [145]:
from skimage.util.montage import montage2d

def montage(arrs):
    print "montaging %i images" % len(arrs)
    print 'image dim:', arrs[0].shape
    r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs])) 
                    for i in range(3))
    return np.dstack((r, g, b)).astype(np.uint8)

def ex(*params):
    p = list(params)
    r1 = rgb.extract(*p)
    p[-1] *= 2.5
    r2 = rgb.extract(*p)
    return np.hstack((r1, r2))

rgb = ext.RGBExtractor()
rgb.shp = (80, 80)

df = clf.decision_function(xvalidate).ravel()

on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]

figure(figsize=(10, 10))
im = montage([ex(*data['cv_pos'][i]) for i in on_ind[:49]])
imshow(im)
title("Hard Positives")
show()

figure(figsize=(10, 10))
im = montage([ex(*data['cv_neg'][i]) for i in off_ind[:49]])

imshow(im)
title("Hard Negatives")
show()


montaging 49 images
image dim: (80, 160, 3)
montaging 49 images
image dim: (80, 160, 3)

In [190]:
xtrain2, ytrain2 = _xy(ex, data['pos'], random.sample(data['neg'], npos))

In [246]:
xtrain_all, ytrain_all = _xy(ex, data['pos'], data['neg'])

In [200]:
class BaggedEstimator(object):
    def __init__(self, estimators):
        self.estimators = estimators
        
    def decision_function(self, X):
        return reduce(np.add, (e.decision_function(X) for e in self.estimators))
    
    def predict(self, X):
        return self.decision_function(X) > 0

In [247]:
from sklearn.utils import resample

def balanced_resample(x, y):
    npos = (y == 1).sum()
    xn = x[y == 0]
    xresample = resample(x, n_samples = npos)
    return np.vstack((x[y == 1], xresample)), np.hstack((np.ones(npos, dtype=np.int), np.zeros(npos, dtype=np.int)))

In [248]:
from sklearn.base import clone
clfs = []
for i in range(5):
    x, y = balanced_resample(xtrain_all, ytrain_all)
    clfs.append(clone(clf).fit(x, y))
    
bag = BaggedEstimator(clfs)

In [249]:
roc_curve(yvalidate, bag.decision_function(xvalidate), label='val')
roc_curve(ytrain, bag.decision_function(xtrain), label='train')
xlim(0, .002)
legend(loc='lower right')
show()



In [216]:
from skimage.util.montage import montage2d

def montage(arrs):
    print "montaging %i images" % len(arrs)
    print 'image dim:', arrs[0].shape
    r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs])) 
                    for i in range(3))
    return np.dstack((r, g, b)).astype(np.uint8)

def ex(*params):
    p = list(params)
    r1 = rgb.extract(*p)
    p[-1] *= 2.5
    r2 = rgb.extract(*p)
    return np.hstack((r1, r2))

rgb = ext.RGBExtractor()
rgb.shp = (80, 80)

df = bag.decision_function(xvalidate).ravel()

on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]

figure(figsize=(15, 10))
im = montage([ex(*data['cv_pos'][i]) for i in on_ind[:49]])
imshow(im, origin='upper')
title("Hard Positives")
show()

figure(figsize=(15, 10))
im = montage([ex(*data['cv_neg'][i]) for i in off_ind[:49]])

imshow(im, origin='upper')
title("Hard Negatives")
show()


montaging 49 images
image dim: (80, 160, 3)
montaging 49 images
image dim: (80, 160, 3)

In [220]:
rgb.shp = (180, 180)
im = montage([ex(*data['cv_neg'][i]) for i in off_ind[:16]])
figure(figsize=(15, 10))
imshow(im, origin='upper')
rgb.shp = (80, 80)


montaging 16 images
image dim: (180, 360, 3)

In [225]:
hist((df[yvalidate == 1], df[yvalidate == 0]), normed=True, log=True)


Out[225]:
([array([ 0.        ,  0.        ,  0.        ,  0.01231997,  0.06159987,
        0.03079994,  0.13551972,  0.18479962,  0.23407951,  0.30799936]),
  array([  2.71373308e-01,   4.04932302e-01,   2.05802708e-01,
         6.58607350e-02,   1.42166344e-02,   4.15860735e-03,
         6.76982592e-04,   9.67117988e-05,   0.00000000e+00,
         0.00000000e+00])],
 array([-5.695, -4.661, -3.627, -2.593, -1.559, -0.525,  0.509,  1.543,
        2.577,  3.611,  4.645]),
 <a list of 2 Lists of Patches objects>)

In [230]:
on = np.array(data['pos'])
off = np.array(data['neg'])
cv = np.array(data['cv_pos'] + data['cv_neg'])


figure(figsize=(10, 10))
plot(on[:, 1] % 3, on[:, 2], 'g.')
plot(off[:, 1] % 3, off[:, 2], 'r.')

plot(cv[:, 1] % 3, cv[:, 2], 'b.', alpha=.3)


Out[230]:
[<matplotlib.lines.Line2D at 0x1152c8510>]

In [ ]:
class SubsetClassifier(object):
    def __init__(self, clf, subset):
        self.clf = clf
        self.subset = subset
        
    def _sub(self, X):
        return X[:, self.subset]
    
    def predict(self, X):
        return self.clf.predict(self._sub(X))
    
    def decision_function(self, X):
        self.clf.decision_function(self._sub(X))

In [232]:
clf2 = clone(clf)
clf2.set_params(compute_importances = True)
clf2.fit(xtrain, ytrain)


Out[232]:
<PyWiseRF.sklearn.WiseRF at 0x1159e6190>

In [233]:
clf2.feature_importances_.shape


Out[233]:
(47200,)

In [234]:
np.sqrt(47200)


Out[234]:
217.25560982400432

In [237]:
im = np.zeros((220, 200))
im.flat[:47200] = clf2.feature_importances_

In [241]:
imshow(np.sqrt(im))


Out[241]:
<matplotlib.image.AxesImage at 0x115f5ee50>

In [ ]: