Building a Complete Classifier

This notebook constructs 3 classifiers, each trained with a different third of the longitude range held out, and builds a ModelGroup suitable for classifying all of the data
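
Each model is trained on the longitudes its WideLocationGenerator marks as valid, with one buffered third (mod 3) held out; the ModelGroup can then, presumably, score each stamp with the model that never trained on its longitude. A minimal sketch of that routing idea (illustrative only; the real logic lives in WideLocationGenerator.valid_longitude and ModelGroup):

def model_index_for(lon):
    # Hypothetical helper, not part of bubbly: send a stamp at longitude
    # `lon` to the model whose training data excluded lon % 3 == i
    return int(lon) % 3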


In [1]:
%pylab

import json
import random
import itertools
import os
import sys

import numpy as np
from sklearn.metrics import auc_score
from sklearn.utils import resample

from bubbly.wiserf import WiseRF
from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
from bubbly.util import roc_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.model import Model


Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.

In [2]:
#XXX Do these need to be re-optimized, potentially for each of the 3 models separately?
best_params = {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 800}

In [3]:
data = json.load(open('../models/bootstrapped_labels_targeted.json'))
lgs = [WideLocationGenerator(mod3=i) for i in [0, 1, 2]]

pos = [filter(lambda x: l.valid_longitude(x[0]), data['on_params']) for l in lgs]
neg = [filter(lambda x: l.valid_longitude(x[0]), data['off_params']) for l in lgs]

#randomly subset negative examples, to balance
neg = [random.sample(n, len(p)) for n, p in zip(neg, pos)]

#sort by longitude, to minimize IO
for i in range(3):
    pos[i] = sorted(pos[i])
    neg[i] = sorted(neg[i])
    
ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)

# sanity check
# assert that the training data exclude each model's held-out longitude third
mods = [np.array(np.vstack((p, n)))[:, 1] % 3 for p, n in zip(pos, neg)]
assert ((mods[0] >= 0.5) & (mods[0] <= 2.5)).all()
assert ((mods[1] <= 0.5) | (mods[1] >= 1.5)).all()
assert ((mods[2] <= 1.5) | (mods[2] >= 2.5)).all()
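
A quick look at the size and balance of each training subset (illustrative, not part of the original run):

for i in range(3):
    print 'model %i: %i positives, %i negatives' % (i, len(pos[i]), len(neg[i]))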

In [38]:
#concerns
# our previous hyperparameter optimization may have overfit the entire + dataset
# the new training data may not be as informative as the old data
# we need a new hyperparameter optimization for the new dataset

#setup:
#   vectors for all data in data
#   3 strategies for sampling - labels
#   hyperoptimize, hiding 1/3 of validation +
#   assess over-fit optimization
#   compare to previous results
#   if that comparison is bad, re-do using all + examples. If *that* looks good, we've over-optimized
xs = {}
for key in ['on_params', 'off_params']:
    xs[key] = np.vstack([ex.extract(*row) for row in data[key]])


#XXX nope, need CV data to be *uniformly* sampled
def filter_data(locator):
    # split the precomputed feature vectors into training rows
    # (valid longitudes) and CV rows (the held-out third)
    on = np.array([locator.valid_longitude(x[0]) for x in data['on_params']])
    off = np.array([locator.valid_longitude(x[0]) for x in data['off_params']])

    cvon = np.array([not locator.valid_longitude(x[0]) for x in data['on_params']])
    cvoff = np.array([not locator.valid_longitude(x[0]) for x in data['off_params']])

    return xs['on_params'][on], xs['off_params'][off], xs['on_params'][cvon], xs['off_params'][cvoff]

def sample_negatives(strategy):
    pass

def training_data(locator, strategy):
    pass

def validation_data(locator):
    pass

len(data['off_params'])


Out[38]:
20000
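
A possible shape for the validation_data stub above (a sketch only; per the XXX note, the CV negatives should ultimately come from a uniformly sampled set, like the unbiased labels used further below):

def validation_data(locator):
    # Sketch: return feature vectors for the stamps in the locator's
    # held-out third (everything its valid_longitude excludes)
    cvon = np.array([not locator.valid_longitude(x[0]) for x in data['on_params']])
    cvoff = np.array([not locator.valid_longitude(x[0]) for x in data['off_params']])
    return xs['on_params'][cvon], xs['off_params'][cvoff]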

In [4]:
models = [Model(ex, lg, WiseRF(**best_params)).fit(on, off) 
          for lg, on, off in zip(lgs, pos, neg)]


WARNING: Non-finite values in feature vectors. Fixing [bubbly.model]
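
The warning is bubbly's Model reporting (and patching) non-finite entries in the feature vectors. A quick way to gauge how common they are (a sketch reusing the extractor call pattern from above, not part of the original run):

feats = np.vstack([ex.extract(*row) for row in pos[0][:10]])
print 'non-finite feature fraction: %.5f' % (~np.isfinite(feats)).mean()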

Save


In [10]:
from bubbly.model import ModelGroup
import cPickle as pickle

gm = ModelGroup(*models)
with open('../models/full_classifier.pkl', 'w') as outfile:
    pickle.dump(gm, outfile)
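
To reload the classifier later (assumes the same environment, since unpickling needs bubbly and WiseRF importable):

with open('../models/full_classifier.pkl') as infile:
    gm = pickle.load(infile)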

Checking Classification Quality


In [31]:
cv_data = json.load(open('../models/bootstrapped_labels_unbiased.json'))

cv_pos = [filter(lambda x: not l.valid_longitude(x[0]), cv_data['on_params']) for l in lgs]
cv_neg = [filter(lambda x: not l.valid_longitude(x[0]), cv_data['off_params']) for l in lgs]
cv_neg[-1] = cv_neg[-1][:10000]

#sort to minimize io
cv_pos = [sorted(c) for c in cv_pos]
cv_neg = [sorted(c) for c in cv_neg]

cv = [n + p for n, p in zip(cv_neg, cv_pos)]
cvy = [np.hstack((np.zeros(len(n), dtype=np.int), 
                  np.ones(len(p), dtype=np.int)))
       for n, p in zip(cv_neg, cv_pos)]

In [32]:
dfs = [m.cloud_decision_function(c, workers=30) for m, c in zip(models, cv)]

In [37]:
from bubbly.util import roc_curve

colors = ['#1B9E77', '#D95F02', '#7570B3']
for i, (y, yp, c) in enumerate(zip(cvy, dfs, colors)):
    roc_curve(y, yp, lw=3, color=c, label='mod %i' % i)
  
legend(loc='lower right')
xlim(0, .002)


Out[37]:
(0, 0.002)
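
A rough numeric companion to the truncated ROC curves (a sketch, not part of the original run; auc_score was imported in the first cell, and is named roc_auc_score in newer scikit-learn):

for i, (y, yp) in enumerate(zip(cvy, dfs)):
    print 'mod %i AUC: %.3f' % (i, auc_score(y, yp))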