Pre-filtering

Is it possible to quickly pre-filter obvious negative fields before dispatching to picloud, without rejecting (essentially) any true positives?


In [59]:
import json
import numpy as np

from bubbly.extractors import CompressionExtractor
import bubbly.extractors as ext
from bubbly.dr1 import bubble_params

In [7]:
# parameters for the randomly-drawn (presumed non-bubble) stamps scored earlier,
# plus the catalog of known bubbles
data = json.load(open('../models/random_scores.json'))
bubbles = bubble_params()

In [10]:
len(data['stamps'])


Out[10]:
50000

In [104]:
# the full (expensive) feature extractor, and a cheap ring-feature extractor
full = ext.MultiViewExtractor(ext.ManyManyExtractors())
full.shape = (80, 80)
ex = ext.RingExtractor()

In [105]:
# per-stamp cost of the cheap vs. full feature extraction
%timeit ex.extract(*bubbles[0])
%timeit full.extract(*bubbles[0])


100 loops, best of 3: 5.3 ms per loop
1 loops, best of 3: 108 ms per loop
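
The cheap ring features cost about 5 ms per stamp versus roughly 108 ms for the full feature stack, a ~20x difference, which is what makes a pre-filter attractive: every stamp rejected cheaply skips the expensive extraction and dispatch step. A back-of-the-envelope cost model, using the timings above (the rejection fraction below is a hypothetical knob, not a measurement):

# rough cost of screening n stamps with a cheap pre-filter before running
# the full extractor only on the survivors
t_cheap, t_full = 5.3e-3, 108e-3   # seconds per stamp, from the %timeit output above
n = 50000                          # number of random stamps in this notebook
reject_frac = 0.2                  # hypothetical fraction rejected by the pre-filter

baseline = n * t_full
filtered = n * (t_cheap + (1 - reject_frac) * t_full)
print 'all stamps through full extractor: %.0f s' % baseline
print 'with pre-filter:                   %.0f s' % filtered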

In [152]:
ex = ext.RingExtractor()  # cheap ring-averaged features

# "on" = features for 500 known bubbles (true positives),
# "off" = features for 500 random stamps (presumed negatives);
# sorting groups stamps by field, so each field is loaded only once (see below)
on = np.vstack([ex.extract(*b) for b in sorted(bubbles)[:500]])
off = np.vstack([ex.extract(*b) for b in sorted(data['stamps'][:500])])


Loading a new field at l=1
Loading a new field at l=2
Loading a new field at l=3
Loading a new field at l=4
Loading a new field at l=5
Loading a new field at l=6
Loading a new field at l=7
Loading a new field at l=8
Loading a new field at l=9
Loading a new field at l=10
Loading a new field at l=11
Loading a new field at l=12
Loading a new field at l=13
Loading a new field at l=14
Loading a new field at l=15
Loading a new field at l=16
Loading a new field at l=17
Loading a new field at l=18
Loading a new field at l=0

In [166]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# any of the imported classifiers can be dropped in here; the exact choice behind
# the output below isn't recorded, so a random forest is used as an example
clf = RandomForestClassifier()
#clf = DecisionTreeClassifier(max_depth=1, compute_importances=True)
#clf = LogisticRegression(class_weight={0:1, 1:5})

# rows = stamps, columns = ring features; replace NaNs with 0
x = np.vstack((on, off))
x[x != x] = 0
y = np.hstack((np.ones(on.shape[0]), np.zeros(off.shape[0])))

xr, xs, yr, ys = train_test_split(x, y)
clf.fit(xr, yr)

# threshold at the lowest probability assigned to any true positive in the
# test split, so that no positives are rejected
yp = clf.predict_proba(xs)[:, 1]
yp = yp >= yp[ys == 1].min()

#yp = clf.predict(xs)
# fraction of negatives / positives that survive the pre-filter
print yp[ys == 0].mean(), yp[ys == 1].mean()


0.779661016949 1.0
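
With the threshold set to the lowest score any true positive receives, every bubble in the test split survives, but about 78% of the negatives do too; in other words this cheap screen only discards roughly 22% of the random fields. Note also that the threshold is chosen using the test-split labels, so the 100% positive retention is partly by construction; in practice it would be picked on held-out positives with some safety margin.

Wrapped up as a reusable screen, the idea looks roughly like the sketch below. This is illustrative only: fit_prefilter and prefilter are made-up names, and the extractor/classifier interfaces are assumed to match the cells above.

# sketch: package a cheap classifier plus a recall-preserving threshold into a
# pre-filter that decides which stamps are worth dispatching to picloud
def fit_prefilter(clf, x, y):
    """Fit clf and return (clf, thresh), with thresh keeping every positive in (x, y)."""
    clf.fit(x, y)
    p = clf.predict_proba(x)[:, 1]
    thresh = p[y == 1].min()       # lowest score given to any true positive
    return clf, thresh

def prefilter(clf, thresh, extractor, params):
    """True if the stamp at `params` survives the cheap screen."""
    f = np.asarray(extractor.extract(*params), dtype=float).reshape(1, -1)
    f[f != f] = 0                  # zero out NaNs, as above
    return clf.predict_proba(f)[0, 1] >= thresh

# illustrative usage:
# clf, thresh = fit_prefilter(RandomForestClassifier(), x, y)
# keep = [s for s in data['stamps'] if prefilter(clf, thresh, ex, s)]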
