In [1]:
%pylab
import json
import random
import itertools
import os
import sys
import numpy as np
from sklearn.metrics import auc_score
from sklearn.utils import resample
from bubbly.wiserf import WiseRF
from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
from bubbly.util import roc_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.model import Model
In [2]:
#XXX Do these need to be re-optimized, potentially for each of the 3 models separately?
best_params = {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 800}
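# A minimal sketch of how best_params could be re-checked per model (assumptions:
# a hypothetical fit_and_score(params, mod3) helper returning validation AUC, and
# an illustrative grid around the current values; neither comes from the original run)
def tune_per_model(fit_and_score):
    grid = [dict(best_params, n_estimators=n, min_samples_split=s)
            for n, s in itertools.product([400, 800, 1600], [2, 4, 8])]
    return [max(grid, key=lambda p: fit_and_score(p, i)) for i in range(3)]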
In [3]:
data = json.load(open('../models/bootstrapped_labels_targeted.json'))
lgs = [WideLocationGenerator(mod3=i) for i in [0, 1, 2]]
pos = [filter(lambda x: l.valid_longitude(x[0]), data['on_params']) for l in lgs]
neg = [filter(lambda x: l.valid_longitude(x[0]), data['off_params']) for l in lgs]
#randomly subset negative examples, to balance
neg = [random.sample(n, len(p)) for n, p in zip(neg, pos)]
#sort by longitude, to minimize IO
for i in range(3):
    pos[i] = sorted(pos[i])
    neg[i] = sorted(neg[i])
ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)
# sanity check
# assert that the training data are non-overlapping:
# each model's longitudes (mod 3) must stay out of the band reserved for its CV fold
mods = [np.vstack((p, n))[:, 1] % 3 for p, n in zip(pos, neg)]
assert ((mods[0] >= 0.5) & (mods[0] <= 2.5)).all()
assert ((mods[1] <= 0.5) | (mods[1] >= 1.5)).all()
assert ((mods[2] <= 1.5) | (mods[2] >= 2.5)).all()
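# The three asserts encode the intended partition: model i trains only on
# longitudes at least 0.5 deg (mod 3) away from i, so the band around i (mod 3)
# is held out for its cross-validation. A quick illustration with fake longitudes
# (no bubbly calls; the 0.5-degree margin is inferred from the asserts above):
fake = np.arange(0, 3, 0.1)
print fake[(fake % 3 >= 0.5) & (fake % 3 <= 2.5)]  # usable by the mod3=0 model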
In [38]:
#concerns
# our previous hyperparameter optimization may have overfit to the full + dataset
# the new training data may not be as informative as the old data
# we need a new hyperparameter optimization for the new dataset
#setup:
# extract feature vectors for everything in data
# 3 strategies for sampling negative (-) labels
# hyperoptimize, hiding 1/3 of the validation + examples
# assess whether the optimization over-fits
# compare to previous results
# if that comparison is bad, re-do using all + examples. If *that* looks good, we've over-optimized
xs = {}
for key in ['on_params', 'off_params']:
    xs[key] = np.vstack([ex.extract(*row) for row in data[key]])
#XXX nope, need CV data to be *uniformly* sampled -- see the sketch at the end of this cell
def filter_data(locator):
    on = np.array([locator.valid_longitude(x[0]) for x in data['on_params']])
    off = np.array([locator.valid_longitude(x[0]) for x in data['off_params']])
    cvon = np.array([not locator.valid_longitude(x[0]) for x in data['on_params']])
    cvoff = np.array([not locator.valid_longitude(x[0]) for x in data['off_params']])
    return xs['on_params'][on], xs['off_params'][off], xs['on_params'][cvon], xs['off_params'][cvoff]
#stubs for the planned sampling experiment (bodies not yet written)
def sample_negatives(strategy):
    pass

def training_data(locator, strategy):
    pass

def validation_data(locator):
    return len(data['off_params'])
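# One way the uniform CV sampling flagged above could look (sketch only; uses the
# resample helper imported in cell 1, and the default size here is arbitrary):
def uniform_negatives(n=10000):
    # draw negatives uniformly at random, independent of any targeting strategy
    return resample(data['off_params'], n_samples=n, replace=False)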
In [4]:
models = [Model(ex, lg, WiseRF(**best_params)).fit(on, off)
          for lg, on, off in zip(lgs, pos, neg)]
In [10]:
from bubbly.model import ModelGroup
import cPickle as pickle
gm = ModelGroup(*models)
with open('../models/full_classifier.pkl', 'wb') as outfile:
pickle.dump(gm, outfile)
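# To restore the ensemble later (sketch; unpickling requires the same bubbly
# classes to be importable):
with open('../models/full_classifier.pkl', 'rb') as infile:
    gm = pickle.load(infile)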
In [31]:
cv_data = json.load(open('../models/bootstrapped_labels_unbiased.json'))
cv_pos = [filter(lambda x: not l.valid_longitude(x[0]), cv_data['on_params']) for l in lgs]
cv_neg = [filter(lambda x: not l.valid_longitude(x[0]), cv_data['off_params']) for l in lgs]
#truncate the last negative list to keep scoring tractable
cv_neg[-1] = cv_neg[-1][:10000]
#sort to minimize io
cv_pos = [sorted(c) for c in cv_pos]
cv_neg = [sorted(c) for c in cv_neg]
cv = [n + p for n, p in zip(cv_neg, cv_pos)]
cvy = [np.hstack((np.zeros(len(n), dtype=np.int),
                  np.ones(len(p), dtype=np.int)))
       for n, p in zip(cv_neg, cv_pos)]
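# Cheap consistency check: every label vector must line up with its example list
for c, y in zip(cv, cvy):
    assert len(c) == len(y)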
In [32]:
dfs = [m.cloud_decision_function(c, workers=30) for m, c in zip(models, cv)]
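# One-number summaries to go with the curves below (sketch; auc_score is the
# ROC-AUC helper imported in cell 1, renamed roc_auc_score in later sklearn):
for i, (y, yp) in enumerate(zip(cvy, dfs)):
    print 'mod %i AUC: %.3f' % (i, auc_score(y, yp))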
In [37]:
from bubbly.util import roc_curve
colors = ['#1B9E77', '#D95F02', '#7570B3']
for i, (y, yp, c) in enumerate(zip(cvy, dfs, colors)):
    roc_curve(y, yp, lw=3, color=c, label='mod %i' % i)
legend(loc='lower right')
xlim(0, .002)
Out[37]:
[figure: ROC curves for the three models, one per mod-3 fold, zoomed to the low false-positive-rate region]