Let's make sure we can reproduce, at least approximately, the results from the old `mwp` package.


In [1]:
from itertools import islice

from bubbly.model import Model
from bubbly.dr1 import LocationGenerator
from bubbly.extractors import RingWaveletCompressionExtractor
from bubbly.util import summary


/Users/beaumont/Library/Python/2.7/lib/python/site-packages/scikits/__init__.py:1: UserWarning: Module argparse was already imported from /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/argparse.pyc, but /Users/beaumont/Library/Python/2.7/lib/python/site-packages is being added to sys.path
  __import__('pkg_resources').declare_namespace(__name__)

In [2]:
# Build the model from a ring-wavelet feature extractor and the default
# location generator. The weak learner uses depth-1 trees, 200 estimators and
# a 0.4 subsample fraction; the cascade is limited to a single layer.
model = Model(
    RingWaveletCompressionExtractor(),
    LocationGenerator(),
    weak_learner_params={
        'verbose': 1,
        'max_depth': 1,
        'n_estimators': 200,
        'subsample': .4,
    },
    cascade_params={'verbose': 1, 'max_layers': 1}
)

First round of fitting


In [3]:
# Run the initial training pass with no explicit data arguments.
# NOTE(review): presumably the training locations come from the
# LocationGenerator given to Model(...) -- confirm in bubbly.model.
model.fit()


WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util]
WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util]
WARNING: Non-finite values in feature vectors. Fixing [bubbly.model]
........................................................................................................................................................................................................Cascade round 1. False pos rate: 7.853982e-02. Recall: 9.900442e-01
WARNING: Could not reduce false positive enough after 1 layers. False positive rate: 7.853982e-02. Recall: 9.900442e-01 [bubbly.cascade]
No handlers could be found for logger "bubbly.model"

In [4]:
# Rebuild features/labels from the first stored training set
# (training_data[0], 'pos' and 'neg' entries) and print the fitted
# estimator's false-positive rate, recall and accuracy on it.
# NOTE(review): _make_xy is a private Model method; assumed to return
# (features, labels) -- confirm.
x, y = model._make_xy(model.training_data[0]['pos'], model.training_data[0]['neg'])
summary(model.estimator, x, y)


False Positive: 0.079
Recall:         0.989
Accuracy:       0.938

In [43]:
# Build a second evaluation set (x2, y2) from a separate LocationGenerator.
# NOTE(review): the argument 1 presumably selects a different data split from
# the default generator used for training -- confirm in bubbly.dr1.
# NOTE(review): this cell's execution count (43) is later than the bootstrap
# cells that follow it, and its printed scores match In [44] / In [45]; the
# output appears to come from a re-run after several add_layer calls, not
# from the one-layer model.
cv_locator = LocationGenerator(1)
on2 = cv_locator.positives()
# Take at most the first 10000 items from the negatives iterator.
off2 = list(islice(cv_locator.negatives_iterator(), 10000))
x2, y2 = model._make_xy(on2, off2)
summary(model.estimator, x2, y2)


WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util]
False Positive: 0.037
Recall:         0.923
Accuracy:       0.960

First Bootstrap


In [15]:
# Gather a new negative set (off3), which is fed to add_layer next.
# NOTE(review): presumably these are up to 2000 locations the current model
# wrongly classifies as positive -- confirm in bubbly.model.
off3 = model.cloud_false_positives(2000)

In [16]:
# Add a layer trained on the original positives (training_data[0]['pos'])
# together with the off3 set.
model.add_layer(model.training_data[0]['pos'], off3)


........................................................................................................................................................................................................

In [17]:
# Score the updated estimator on the x2/y2 evaluation set.
summary(model.estimator, x2, y2)


False Positive: 0.045
Recall:         0.944
Accuracy:       0.946

In [20]:
# NOTE(review): identical arguments to the previous add_layer call (same
# positives, same off3). Confirm that adding a second layer on the same
# negatives is intentional and not an accidental re-run.
model.add_layer(model.training_data[0]['pos'], off3)


........................................................................................................................................................................................................

In [44]:
# Score the estimator on the x2/y2 evaluation set.
# NOTE(review): execution count 44 is later than the "Second Bootstrap" cells
# below, so this output may reflect a later model state than its position
# in the notebook suggests.
summary(model.estimator, x2, y2)


False Positive: 0.037
Recall:         0.923
Accuracy:       0.960

Second Bootstrap


In [24]:
# Gather another negative set (off4) from the current model, requesting 2000.
# NOTE(review): exact selection semantics live in bubbly.model -- confirm.
off4 = model.cloud_false_positives(2000)

In [25]:
# Add a layer trained on the original positives together with off4.
model.add_layer(model.training_data[0]['pos'], off4)


........................................................................................................................................................................................................

In [117]:
# Features/labels for the original positives paired with the off4 negatives.
# NOTE(review): execution count 117 -- this cell was run much later than the
# cells around it.
x4, y4 = model._make_xy(model.training_data[0]['pos'], off4)

In [45]:
# Score the current estimator on the x2/y2 evaluation set.
summary(model.estimator, x2, y2)


False Positive: 0.037
Recall:         0.923
Accuracy:       0.960

In [27]:
# Score the current estimator on x/y, built from training_data[0].
summary(model.estimator, x, y)


False Positive: 0.030
Recall:         0.978
Accuracy:       0.972

In [118]:
# Score the estimator on the original positives plus the off4 negatives.
# NOTE(review): off4 came from cloud_false_positives, so these negatives are
# presumably hard cases by construction and a high false-positive rate is
# plausible -- confirm. Execution count 118 also means the model state may be
# later than this cell's position suggests.
summary(model.estimator, x4, y4)


False Positive: 0.586
Recall:         0.971
Accuracy:       0.587

In [50]:
from bubbly.util import rfp_curve
import brewer2mpl
import matplotlib.pyplot as plt

# 7-entry 'Purples' sequential palette, reversed with [::-1].
colors = brewer2mpl.get_map('Purples', 'sequential', 7).mpl_colors[::-1]

# One curve per stage of the estimator, evaluated on the x2/y2 set.
for i, y2p in enumerate(model.estimator.staged_decision_function(x2)):
    # The palette only has len(colors) entries; clamp the index so that extra
    # stages reuse the last colour instead of raising IndexError.
    stage_color = colors[min(i, len(colors) - 1)]
    rfp_curve(y2p, y2, label='CV %i' % i, color=stage_color)

# Full decision function on the training set (x, y), drawn in red for contrast.
yp = model.estimator.decision_function(x)
rfp_curve(yp, y, color='red', label='Training Data')

plt.ylim(0, .05)
plt.legend(loc='upper left')


Out[50]:
<matplotlib.legend.Legend at 0x11b925d10>

Third Bootstrap


In [52]:
# Gather a third negative set (off5) from the current model, requesting 2000.
# NOTE(review): exact selection semantics live in bubbly.model -- confirm.
off5 = model.cloud_false_positives(2000)

In [53]:
# Add a layer trained on the original positives together with off5.
model.add_layer(model.training_data[0]['pos'], off5)


........................................................................................................................................................................................................

In [54]:
# Score the estimator on the x2/y2 evaluation set after adding the off5 layer.
summary(model.estimator, x2, y2)


False Positive: 0.032
Recall:         0.915
Accuracy:       0.964

In [55]:
# Score the estimator on x/y (from training_data[0]) after adding the off5 layer.
summary(model.estimator, x, y)


False Positive: 0.026
Recall:         0.971
Accuracy:       0.973

In [58]:
# One curve per stage of the estimator, evaluated on the x2/y2 set.
# `colors` is the palette list created in the earlier plotting cell.
for i, y2p in enumerate(model.estimator.staged_decision_function(x2)):
    # The palette has a fixed length; clamp the index so that extra stages
    # reuse the last colour instead of raising IndexError.
    stage_color = colors[min(i, len(colors) - 1)]
    rfp_curve(y2p, y2, label='CV %i' % i, color=stage_color)

# Full decision function on the training set (x, y), drawn in red for contrast.
yp = model.estimator.decision_function(x)
rfp_curve(yp, y, color='red', label='Training Data')

# Tighter y-range than the previous plot (0.01 instead of 0.05).
plt.ylim(0, .01)
plt.legend(loc='upper left')


Out[58]:
<matplotlib.legend.Legend at 0x11e0055d0>

In [136]:
from json import dump
# Write model.training_data to a JSON file under ../models/.
# NOTE(review): the path is relative to the notebook's working directory, and
# this assumes training_data holds only JSON-serializable values -- confirm.
with open('../models/reproducing_old_training_data.json', 'w') as outfile:
    dump(model.training_data, outfile)

In [ ]: