In [6]:
%pylab

import json
import random
import itertools

import numpy as np
from scipy import stats
from sklearn.metrics import auc_score
from sklearn.ensemble import GradientBoostingClassifier

from bubbly.model import Model
import bubbly.extractors as ext
from bubbly.util import rfp_curve


Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.

In [7]:
data = json.load(open('../models/benchmark_training_data.json'))
for k, v in data.items():
    data[k] = sorted(v)
ex = ext.RingWaveletCompressionExtractor()
ex.shp = (80, 80)

In [9]:
from sklearn.utils import resample

def _xy(ex, on, off):
    x = np.vstack(ex.extract(*o).ravel().astype(np.float32) for o in on + off)
    x = np.nan_to_num(x)
    y = np.hstack((np.ones(len(on), dtype=np.int), np.zeros(len(off), dtype=np.int)))
    return x, y

npos = len(data['pos'])
xtrain, ytrain = _xy(ex, data['pos'], random.sample(data['neg'], npos))
xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])


WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util]
WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util]
WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util]

In [122]:
from PyWiseRF import WiseRF

class Choice(object):
    def __init__(self, *choices):
        self._choices = choices
        
    def rvs(self):
        return random.choice(self._choices)
    
class Space(object):
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        
    def __iter__(self):
        while True:
            yield {k: v.rvs() for k, v in self._hyperparams.items()}

            
gb_space = Space(learning_rate = stats.uniform(1e-3, 1 - 1.01e-3),
                 n_estimators = Choice(50, 100, 200),
                 max_depth = Choice(1, 2, 3),
                 subsample = stats.uniform(1e-3, 1 - 1.01e-3))

rf_space = Space(n_estimators = Choice(50, 100, 200),
                 min_samples_split = Choice(1, 2, 4),                 
                 criterion = Choice('gini', 'gainratio', 'infogain'),
                 max_features = Choice('auto', 1, 2, 4),
                 n_jobs = Choice(2))
                 
                 
def gb_objective(**params):
    clf = GradientBoostingClassifier(**params)
    clf.fit(xtrain, ytrain)
    
    df = clf.decision_function(xvalidate).ravel()
    return (df[yvalidate == 1] <= df[yvalidate == 0].max()).mean(), clf

def rf_objective(**params):
    clf = WiseRF(**params)
    clf.fit(xtrain, ytrain)

    df = clf.decision_function(xvalidate).ravel()
    return (df[yvalidate == 1] <= df[yvalidate == 0].max()).mean(), clf
    

def fmin(objective, space, threshold=np.inf):
    best = threshold
    
    try:        
        for p in space:
            f, clf = objective(**p)
            if f < best:
                best = f
                yield best, p, clf
    except KeyboardInterrupt:
        pass

In [123]:
import sys
from bubbly.util import roc_curve

for best, best_params, clf in fmin(rf_objective, rf_space):
    print best, best_params
    sys.stdout.flush()

    roc_curve(yvalidate, clf.decision_function(xvalidate), label='val')
    roc_curve(ytrain, clf.decision_function(xtrain), label='train')
    xlim(0, .002)
    legend(loc='lower right')
    show()



In [66]:
best_params = {'n_estimators': 200, 'subsample': 0.8532699679950504, 'learning_rate': 0.016793630316864427, 'max_depth': 3}
best_params = {'n_estimators': 100, 'subsample': 0.6634478421194823, 'learning_rate': 0.1871843310390825, 'max_depth': 3}
best_clf = GradientBoostingClassifier(**best_params).fit(xtrain, ytrain)

In [125]:
best_params = {'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 200, 'n_jobs': 2}
best_clf = clf

In [102]:
def decision_function(self, x):
    p = self.predict_proba(x)
    return p[:, 1] - p[:, 0]

type(rf).decision_function = decision_function

clf.decision_function(xvalidate[0:1, :])


Out[102]:
array([[ 7.00325768]])

In [128]:
_clf = best_clf

roc_curve(yvalidate, _clf.decision_function(xvalidate).ravel(), label='val')
roc_curve(ytrain, _clf.decision_function(xtrain).ravel(), label='train')
xlim(0, .010)
legend(loc='lower right')
show()
df = _clf.decision_function(xvalidate).ravel()
print (df[yvalidate == 1] <= df[yvalidate == 0].max()).mean()
print (df[yvalidate == 0] >= df[yvalidate == 1].min()).mean()
print auc_score(yvalidate, df), auc_score(ytrain, _clf.decision_function(xtrain))



In [127]:
from skimage.util.montage import montage2d

rcParams['image.origin'] = 'upper'
rcParams['figure.figsize'] = (10, 10)
rcParams['figure.dpi'] = 100

def montage(arrs):
    print "montaging %i images" % len(arrs)
    print 'image dim:', arrs[0].shape
    r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs])) 
                    for i in range(3))
    return np.dstack((r, g, b)).astype(np.uint8)

def ex(*params):
    p = list(params)
    #p[-1] *= 2.5
    return rgb.extract(*p)

rgb = ext.RGBExtractor()
rgb.shp = (80, 80)

df = best_clf.decision_function(xvalidate).ravel()

on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]

im = montage([ex(*data['cv_pos'][i]) for i in on_ind[:49]])
imshow(im)
title("Hard Positives")
show()


im = montage([ex(*data['cv_neg'][i]) for i in off_ind[:49]])

imshow(im)
title("Hard Negatives")
show()