Tomorrow: Add a third zoom in. Improve again? (Yes! 70%@0, 90% @ .002
Expand + examples to include 2 mod 3 fields. Double - examples accordingly
Best Zoom2Extractor with Daisy: ROC = 'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 800
In [1]:
%pylab
import json
import random
import itertools
import numpy as np
from scipy import stats
from sklearn.metrics import auc_score
from sklearn.ensemble import GradientBoostingClassifier
from bubbly.model import Model
import bubbly.extractors as ext
from bubbly.util import rfp_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.hyperopt import rf_space
from skimage.feature import daisy
In [22]:
class DaisyExtractor(ext.Extractor):
def _extract_rgb(self, rgb):
kwargs = dict(step=rgb.shape[0]/5, radius=rgb.shape[0] / 10, rings=2,
histograms=6, orientations=8)
return np.hstack(daisy(rgb[:, :, i], **kwargs).ravel() for i in [0, 1])
class DaisySuper(ext.CompositeExtractor):
composite_classes = [ext.RingExtractor, ext.MultiWaveletExtractor, ext.CompressionExtractor,
DaisyExtractor]
class ZoomedExtractor(ext.Extractor):
def __init__(self, orig):
self.orig = orig
def extract(self, lon, l, b, r):
return np.hstack((self.orig.extract(lon, l, b, r),
self.orig.extract(lon, l, b, r * 2)))
class Zoom2Extractor(ext.Extractor):
def __init__(self, orig):
self.orig = orig
def extract(self, lon, l, b, r):
return np.hstack((self.orig.extract(lon, l, b, r),
self.orig.extract(lon, l, b, r / 2),
self.orig.extract(lon, l, b, r * 2),
self.orig.extract(lon, l, b + r / 2, r)))
In [244]:
data = json.load(open('../models/benchmark_training_data.json'))
lg = WideLocationGenerator(mod3 = 1)
data['pos'] = filter(lambda x: lg._longitude_check(x[0]), highest_quality_on_params())
for k, v in data.items():
data[k] = sorted(v)
ex = Zoom2Extractor(DaisySuper())
ex.shp = (60, 60)
In [245]:
from sklearn.utils import resample
def _xy(ex, on, off):
x = np.vstack(ex.extract(*o).ravel().astype(np.float32) for o in on + off)
x = np.nan_to_num(x)
y = np.hstack((np.ones(len(on), dtype=np.int), np.zeros(len(off), dtype=np.int)))
return x, y
npos = len(data['pos'])
xtrain, ytrain = _xy(ex, data['pos'], random.sample(data['neg'], npos))
xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])
In [252]:
from PyWiseRF import WiseRF
import os
os.environ['WISERF_ROOT'] = '/Users/beaumont/WiseRF-1.5.9-macosx-x86_64-rc1'
from sklearn.metrics import auc_score, auc
import sklearn.metrics as met
def decision_function(self, x):
p = self.predict_proba(x)
return p[:, 1] - p[:, 0]
WiseRF.decision_function = decision_function
def auc_below_fpos(y, yp, fpos):
fp, tp, th = met.roc_curve(y, yp)
good = (fp <= fpos)
return auc(fp[good], tp[good])
class Choice(object):
def __init__(self, *choices):
self._choices = choices
def rvs(self):
return random.choice(self._choices)
class Space(object):
def __init__(self, **hyperparams):
self._hyperparams = hyperparams
def __iter__(self):
while True:
yield {k: v.rvs() for k, v in self._hyperparams.items()}
gb_space = Space(learning_rate = stats.uniform(1e-3, 1 - 1.01e-3),
n_estimators = Choice(50, 100, 200),
max_depth = Choice(1, 2, 3),
subsample = stats.uniform(1e-3, 1 - 1.01e-3))
rf_space = Space(n_estimators = Choice(200, 400, 800, 1600),
min_samples_split = Choice(1, 2, 4),
criterion = Choice('gini', 'gainratio', 'infogain'),
max_features = Choice('auto'),
n_jobs = Choice(2))
def gb_objective(**params):
clf = GradientBoostingClassifier(**params)
clf.fit(xtrain, ytrain)
df = clf.decision_function(xvalidate).ravel()
return (df[yvalidate == 1] <= df[yvalidate == 0].max()).mean(), clf
def rf_objective(**params):
clf = WiseRF(**params)
clf.fit(xtrain, ytrain)
df = clf.decision_function(xvalidate).ravel()
return -auc_below_fpos(yvalidate, df, .0005), clf
def fmin(objective, space, threshold=np.inf):
best = threshold
try:
for p in space:
f, clf = objective(**p)
if f < best:
best = f
yield best, p, clf
except KeyboardInterrupt:
pass
In [253]:
import sys
from bubbly.util import roc_curve
for best, best_params, clf in fmin(rf_objective, rf_space, threshold):
print best, best_params
sys.stdout.flush()
roc_curve(yvalidate, clf.decision_function(xvalidate), label='val')
roc_curve(ytrain, clf.decision_function(xtrain), label='train')
xlim(0, .002)
legend(loc='lower right')
show()
In [145]:
from skimage.util.montage import montage2d
def montage(arrs):
print "montaging %i images" % len(arrs)
print 'image dim:', arrs[0].shape
r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs]))
for i in range(3))
return np.dstack((r, g, b)).astype(np.uint8)
def ex(*params):
p = list(params)
r1 = rgb.extract(*p)
p[-1] *= 2.5
r2 = rgb.extract(*p)
return np.hstack((r1, r2))
rgb = ext.RGBExtractor()
rgb.shp = (80, 80)
df = clf.decision_function(xvalidate).ravel()
on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]
figure(figsize=(10, 10))
im = montage([ex(*data['cv_pos'][i]) for i in on_ind[:49]])
imshow(im)
title("Hard Positives")
show()
figure(figsize=(10, 10))
im = montage([ex(*data['cv_neg'][i]) for i in off_ind[:49]])
imshow(im)
title("Hard Negatives")
show()
In [190]:
xtrain2, ytrain2 = _xy(ex, data['pos'], random.sample(data['neg'], npos))
In [246]:
xtrain_all, ytrain_all = _xy(ex, data['pos'], data['neg'])
In [200]:
class BaggedEstimator(object):
def __init__(self, estimators):
self.estimators = estimators
def decision_function(self, X):
return reduce(np.add, (e.decision_function(X) for e in self.estimators))
def predict(self, X):
return self.decision_function(X) > 0
In [247]:
from sklearn.utils import resample
def balanced_resample(x, y):
npos = (y == 1).sum()
xn = x[y == 0]
xresample = resample(x, n_samples = npos)
return np.vstack((x[y == 1], xresample)), np.hstack((np.ones(npos, dtype=np.int), np.zeros(npos, dtype=np.int)))
In [248]:
from sklearn.base import clone
clfs = []
for i in range(5):
x, y = balanced_resample(xtrain_all, ytrain_all)
clfs.append(clone(clf).fit(x, y))
bag = BaggedEstimator(clfs)
In [249]:
roc_curve(yvalidate, bag.decision_function(xvalidate), label='val')
roc_curve(ytrain, bag.decision_function(xtrain), label='train')
xlim(0, .002)
legend(loc='lower right')
show()
In [216]:
from skimage.util.montage import montage2d
def montage(arrs):
print "montaging %i images" % len(arrs)
print 'image dim:', arrs[0].shape
r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs]))
for i in range(3))
return np.dstack((r, g, b)).astype(np.uint8)
def ex(*params):
p = list(params)
r1 = rgb.extract(*p)
p[-1] *= 2.5
r2 = rgb.extract(*p)
return np.hstack((r1, r2))
rgb = ext.RGBExtractor()
rgb.shp = (80, 80)
df = bag.decision_function(xvalidate).ravel()
on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]
figure(figsize=(15, 10))
im = montage([ex(*data['cv_pos'][i]) for i in on_ind[:49]])
imshow(im, origin='upper')
title("Hard Positives")
show()
figure(figsize=(15, 10))
im = montage([ex(*data['cv_neg'][i]) for i in off_ind[:49]])
imshow(im, origin='upper')
title("Hard Negatives")
show()
In [220]:
rgb.shp = (180, 180)
im = montage([ex(*data['cv_neg'][i]) for i in off_ind[:16]])
figure(figsize=(15, 10))
imshow(im, origin='upper')
rgb.shp = (80, 80)
In [225]:
hist((df[yvalidate == 1], df[yvalidate == 0]), normed=True, log=True)
Out[225]:
In [230]:
on = np.array(data['pos'])
off = np.array(data['neg'])
cv = np.array(data['cv_pos'] + data['cv_neg'])
figure(figsize=(10, 10))
plot(on[:, 1] % 3, on[:, 2], 'g.')
plot(off[:, 1] % 3, off[:, 2], 'r.')
plot(cv[:, 1] % 3, cv[:, 2], 'b.', alpha=.3)
Out[230]:
In [ ]:
class SubsetClassifier(object):
def __init__(self, clf, subset):
self.clf = clf
self.subset = subset
def _sub(self, X):
return X[:, self.subset]
def predict(self, X):
return self.clf.predict(self._sub(X))
def decision_function(self, X):
self.clf.decision_function(self._sub(X))
In [232]:
clf2 = clone(clf)
clf2.set_params(compute_importances = True)
clf2.fit(xtrain, ytrain)
Out[232]:
In [233]:
clf2.feature_importances_.shape
Out[233]:
In [234]:
np.sqrt(47200)
Out[234]:
In [237]:
im = np.zeros((220, 200))
im.flat[:47200] = clf2.feature_importances_
In [241]:
imshow(np.sqrt(im))
Out[241]:
In [ ]: