In [6]:
%pylab
import json
import random
import itertools
import numpy as np
from scipy import stats
from sklearn.metrics import auc_score
from sklearn.ensemble import GradientBoostingClassifier
from bubbly.model import Model
import bubbly.extractors as ext
from bubbly.util import rfp_curve
In [7]:
data = json.load(open('../models/benchmark_training_data.json'))
for k, v in data.items():
    data[k] = sorted(v)

ex = ext.RingWaveletCompressionExtractor()
ex.shp = (80, 80)
In [9]:
from sklearn.utils import resample
def _xy(ex, on, off):
    x = np.vstack([ex.extract(*o).ravel().astype(np.float32) for o in on + off])
    x = np.nan_to_num(x)  # replace NaNs/infs from the extractor with finite values
    y = np.hstack((np.ones(len(on), dtype=np.int), np.zeros(len(off), dtype=np.int)))
    return x, y
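# _xy stacks one row of flattened extractor features per example (positives first,
# then negatives) and returns the matching labels: len(on) ones followed by len(off) zeros.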
# Train on all positives plus an equal-sized random subsample of negatives
npos = len(data['pos'])
xtrain, ytrain = _xy(ex, data['pos'], random.sample(data['neg'], npos))
xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])
In [122]:
from PyWiseRF import WiseRF

class Choice(object):
    """A discrete hyperparameter: rvs() samples uniformly from a fixed set of values."""
    def __init__(self, *choices):
        self._choices = choices

    def rvs(self):
        return random.choice(self._choices)

class Space(object):
    """A hyperparameter search space: iterating yields an endless stream of random parameter dicts."""
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams

    def __iter__(self):
        while True:
            yield {k: v.rvs() for k, v in self._hyperparams.items()}
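# Usage sketch (illustrative, not from the original run): anything with an rvs()
# method works as a hyperparameter, so scipy.stats frozen distributions and Choice mix freely.
#     demo_space = Space(max_depth=Choice(1, 2, 3), learning_rate=stats.uniform(0, 1))
#     params = next(iter(demo_space))  # e.g. {'max_depth': 2, 'learning_rate': 0.41}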
gb_space = Space(learning_rate=stats.uniform(1e-3, 1 - 1.01e-3),
                 n_estimators=Choice(50, 100, 200),
                 max_depth=Choice(1, 2, 3),
                 subsample=stats.uniform(1e-3, 1 - 1.01e-3))

rf_space = Space(n_estimators=Choice(50, 100, 200),
                 min_samples_split=Choice(1, 2, 4),
                 criterion=Choice('gini', 'gainratio', 'infogain'),
                 max_features=Choice('auto', 1, 2, 4),
                 n_jobs=Choice(2))
def gb_objective(**params):
    clf = GradientBoostingClassifier(**params)
    clf.fit(xtrain, ytrain)
    df = clf.decision_function(xvalidate).ravel()
    # Score to minimize: fraction of positives at or below the highest-scoring negative
    return (df[yvalidate == 1] <= df[yvalidate == 0].max()).mean(), clf

def rf_objective(**params):
    clf = WiseRF(**params)
    clf.fit(xtrain, ytrain)
    df = clf.decision_function(xvalidate).ravel()
    return (df[yvalidate == 1] <= df[yvalidate == 0].max()).mean(), clf
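# Illustration with made-up scores (not from the run): the objective is the fraction of
# positives that cannot be retrieved before the first false positive when ranking by score.
#     df_pos = np.array([0.9, 0.4, 0.1])
#     df_neg = np.array([0.3, 0.2])
#     (df_pos <= df_neg.max()).mean()  # -> 0.333...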
def fmin(objective, space, threshold=np.inf):
    """Random search: yield (score, params, clf) each time the objective improves.

    Runs until interrupted; KeyboardInterrupt ends the search cleanly."""
    best = threshold
    try:
        for p in space:
            f, clf = objective(**p)
            if f < best:
                best = f
                yield best, p, clf
    except KeyboardInterrupt:
        pass
In [123]:
import sys
from bubbly.util import roc_curve

for best, best_params, clf in fmin(rf_objective, rf_space):
    print best, best_params
    sys.stdout.flush()

# clf is the best classifier found before the search was interrupted
roc_curve(yvalidate, clf.decision_function(xvalidate), label='val')
roc_curve(ytrain, clf.decision_function(xtrain), label='train')
xlim(0, .002)
legend(loc='lower right')
show()
In [66]:
# Parameter sets from earlier gradient-boosting searches; the second assignment wins
best_params = {'n_estimators': 200, 'subsample': 0.8532699679950504, 'learning_rate': 0.016793630316864427, 'max_depth': 3}
best_params = {'n_estimators': 100, 'subsample': 0.6634478421194823, 'learning_rate': 0.1871843310390825, 'max_depth': 3}
best_clf = GradientBoostingClassifier(**best_params).fit(xtrain, ytrain)
In [125]:
# Best random-forest hyperparameters from the search above; clf is the last model fmin yielded
best_params = {'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 200, 'n_jobs': 2}
best_clf = clf
In [102]:
def decision_function(self, x):
    """Margin-style score from class probabilities: P(positive) - P(negative)."""
    p = self.predict_proba(x)
    return p[:, 1] - p[:, 0]

# Attach the method to the WiseRF class, which only exposes predict_proba
type(clf).decision_function = decision_function
clf.decision_function(xvalidate[0:1, :])
In [128]:
_clf = best_clf
roc_curve(yvalidate, _clf.decision_function(xvalidate).ravel(), label='val')
roc_curve(ytrain, _clf.decision_function(xtrain).ravel(), label='train')
xlim(0, .010)
legend(loc='lower right')
show()

df = _clf.decision_function(xvalidate).ravel()
# Fraction of positives at/below the best negative, and of negatives at/above the worst positive
print (df[yvalidate == 1] <= df[yvalidate == 0].max()).mean()
print (df[yvalidate == 0] >= df[yvalidate == 1].min()).mean()
print auc_score(yvalidate, df), auc_score(ytrain, _clf.decision_function(xtrain).ravel())
In [127]:
from skimage.util.montage import montage2d

rcParams['image.origin'] = 'upper'
rcParams['figure.figsize'] = (10, 10)
rcParams['figure.dpi'] = 100

def montage(arrs):
    print "montaging %i images" % len(arrs)
    print 'image dim:', arrs[0].shape
    # Tile each color channel into a grid separately, then restack into one RGB image
    r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs]))
                    for i in range(3))
    return np.dstack((r, g, b)).astype(np.uint8)

def ex(*params):
    # Note: this shadows the feature extractor `ex` defined earlier
    p = list(params)
    #p[-1] *= 2.5
    return rgb.extract(*p)

rgb = ext.RGBExtractor()
rgb.shp = (80, 80)
df = best_clf.decision_function(xvalidate).ravel()
# Hardest examples: positives with the lowest scores, negatives with the highest scores
on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]

im = montage([ex(*data['cv_pos'][i]) for i in on_ind[:49]])
imshow(im)
title("Hard Positives")
show()

im = montage([ex(*data['cv_neg'][i]) for i in off_ind[:49]])
imshow(im)
title("Hard Negatives")
show()