Pre-filtering

Is it possible to quickly pre-filter obvious negative fields before dispatching to picloud, without rejecting (essentially) any true positives?


In [59]:
import json
import numpy as np

from bubbly.extractors import CompressionExtractor
import bubbly.extractors as ext
from bubbly.dr1 import bubble_params

In [7]:
# parameters for the randomly-drawn (presumed non-bubble) stamps scored earlier,
# plus the catalog of known bubbles
data = json.load(open('../models/random_scores.json'))
bubbles = bubble_params()

In [10]:
len(data['stamps'])


Out[10]:
50000

In [104]:
# the full (expensive) feature extractor, and a cheap ring-feature extractor
full = ext.MultiViewExtractor(ext.ManyManyExtractors())
full.shape = (80, 80)
ex = ext.RingExtractor()

In [105]:
# per-stamp cost of the cheap vs. full feature extraction
%timeit ex.extract(*bubbles[0])
%timeit full.extract(*bubbles[0])


100 loops, best of 3: 5.3 ms per loop
1 loops, best of 3: 108 ms per loop
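
The cheap ring features cost about 5 ms per stamp versus roughly 108 ms for the full feature stack, a ~20x difference, which is what makes a pre-filter attractive: every stamp rejected cheaply skips the expensive extraction and dispatch step. A back-of-the-envelope cost model, using the timings above (the rejection fraction below is a hypothetical knob, not a measurement):

# rough cost of screening n stamps with a cheap pre-filter before running
# the full extractor only on the survivors
t_cheap, t_full = 5.3e-3, 108e-3   # seconds per stamp, from the %timeit output above
n = 50000                          # number of random stamps in this notebook
reject_frac = 0.2                  # hypothetical fraction rejected by the pre-filter

baseline = n * t_full
filtered = n * (t_cheap + (1 - reject_frac) * t_full)
print 'all stamps through full extractor: %.0f s' % baseline
print 'with pre-filter:                   %.0f s' % filtered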

In [152]:
ex = ext.RingExtractor()  # cheap ring-averaged features

# "on" = features for 500 known bubbles (true positives),
# "off" = features for 500 random stamps (presumed negatives);
# sorting groups stamps by field, so each field is loaded only once (see below)
on = np.vstack([ex.extract(*b) for b in sorted(bubbles)[:500]])
off = np.vstack([ex.extract(*b) for b in sorted(data['stamps'][:500])])


Loading a new field at l=1
Loading a new field at l=2
Loading a new field at l=3
Loading a new field at l=4
Loading a new field at l=5
Loading a new field at l=6
Loading a new field at l=7
Loading a new field at l=8
Loading a new field at l=9
Loading a new field at l=10
Loading a new field at l=11
Loading a new field at l=12
Loading a new field at l=13
Loading a new field at l=14
Loading a new field at l=15
Loading a new field at l=16
Loading a new field at l=17
Loading a new field at l=18
Loading a new field at l=0

In [166]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# any of the imported classifiers can be dropped in here; the exact choice behind
# the output below isn't recorded, so a random forest is used as an example
clf = RandomForestClassifier()
#clf = DecisionTreeClassifier(max_depth=1, compute_importances=True)
#clf = LogisticRegression(class_weight={0:1, 1:5})

# rows = stamps, columns = ring features; replace NaNs with 0
x = np.vstack((on, off))
x[x != x] = 0
y = np.hstack((np.ones(on.shape[0]), np.zeros(off.shape[0])))

xr, xs, yr, ys = train_test_split(x, y)
clf.fit(xr, yr)

# threshold at the lowest probability assigned to any true positive in the
# test split, so that no positives are rejected
yp = clf.predict_proba(xs)[:, 1]
yp = yp >= yp[ys == 1].min()

#yp = clf.predict(xs)
# fraction of negatives / positives that survive the pre-filter
print yp[ys == 0].mean(), yp[ys == 1].mean()


0.779661016949 1.0
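
With the threshold set to the lowest score any true positive receives, every bubble in the test split survives, but about 78% of the negatives do too; in other words this cheap screen only discards roughly 22% of the random fields. Note also that the threshold is chosen using the test-split labels, so the 100% positive retention is partly by construction; in practice it would be picked on held-out positives with some safety margin.

Wrapped up as a reusable screen, the idea looks roughly like the sketch below. This is illustrative only: fit_prefilter and prefilter are made-up names, and the extractor/classifier interfaces are assumed to match the cells above.

# sketch: package a cheap classifier plus a recall-preserving threshold into a
# pre-filter that decides which stamps are worth dispatching to picloud
def fit_prefilter(clf, x, y):
    """Fit clf and return (clf, thresh), with thresh keeping every positive in (x, y)."""
    clf.fit(x, y)
    p = clf.predict_proba(x)[:, 1]
    thresh = p[y == 1].min()       # lowest score given to any true positive
    return clf, thresh

def prefilter(clf, thresh, extractor, params):
    """True if the stamp at `params` survives the cheap screen."""
    f = np.asarray(extractor.extract(*params), dtype=float).reshape(1, -1)
    f[f != f] = 0                  # zero out NaNs, as above
    return clf.predict_proba(f)[0, 1] >= thresh

# illustrative usage:
# clf, thresh = fit_prefilter(RandomForestClassifier(), x, y)
# keep = [s for s in data['stamps'] if prefilter(clf, thresh, ex, s)]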
