In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
DATA_PATH = '/mnt/cms/version2/merged.pickle'
In [2]:
import cPickle as pickle

# Pickle streams are binary: they must be opened with 'rb', not 'r'
# (text mode can corrupt the byte stream on some platforms).
# NOTE(review): pickle.load executes arbitrary code -- only load files
# from a trusted source (this path is a local, project-produced file).
with open(DATA_PATH, 'rb') as f:
    data = pickle.load(f)
labels = np.load('/mnt/cms/version2/labels.npy')
In [3]:
# Positional indices of lumisections where at least one of the three
# streams recorded a non-zero instantaneous luminosity.
lumi_columns = ['_instantLumi_minibias', '_instantLumi_muons', '_instantLumi_photons']
non_zero_lumi_mask = (data[lumi_columns] != 0.0).any(axis=1)
non_zero_lumi_index = np.where(non_zero_lumi_mask.values)[0]
In [4]:
# Columns that identify a lumisection or carry luminosity bookkeeping --
# they must not leak into the model inputs.
not_features = [
    '_luminosityBlock',
    '_run',
    '_instantLumi_minibias',
    '_instantLumi_muons',
    '_instantLumi_photons'
]
# Sorted for a deterministic column order across runs.
features = sorted(set(data.columns) - set(not_features))
X = data[features].values
y = labels
# Per-lumisection weights; presumably used for weighted evaluation --
# TODO confirm, not referenced in the visible cells below.
weights = data['_instantLumi_minibias'].values
In [5]:
print 'Non-zero luminosity:', non_zero_lumi_index.shape[0], 'lumisections'
In [6]:
def build_predictions(clf, X, y, thr_q = 0.97,
                      n_folds = 10, max_loss_rate = 0.0, max_pollution_rate = 0.0,
                      title = ''):
    """Fit `clf` on the train part of the FIRST stratified fold, plot
    ROC / pollution-loss diagnostics on its held-out part, and derive
    two decision thresholds from the rate constraints.

    Parameters
    ----------
    clf : classifier with `fit` / `predict_proba` (binary, class 1 = good).
    X, y : feature matrix and binary labels.
    thr_q : quantile used to pick a threshold among those satisfying a
        constraint (softer than taking the extreme min/max -- see the
        commented-out lines below for the original extreme variant).
    n_folds : number of stratified folds; NOTE only the first split is
        actually fitted and evaluated.
    max_loss_rate : max acceptable loss rate (1 - recall) for thr_low.
    max_pollution_rate : max acceptable pollution rate (1 - precision)
        for thr_high.
    title : suptitle for the diagnostic figure.

    Returns
    -------
    (clf, thr_low, thr_high) : the fitted classifier and thresholds --
    proba < thr_low is auto-rejected, proba > thr_high auto-accepted,
    anything in between goes to manual inspection.
    """
    from sklearn.metrics import precision_recall_curve
    from sklearn.metrics import roc_curve
    from sklearn.metrics import auc
    from sklearn.cross_validation import StratifiedKFold
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=7777)
    proba = np.zeros(X.shape[0])
    # Single hold-out evaluation: only the first train/test split is used.
    train_indx, test_indx = iter(cv).next()
    clf.fit(X[train_indx, :], y[train_indx])
    proba[test_indx] = clf.predict_proba(X[test_indx, :])[:, 1]
    fpr, tpr, thr_roc = roc_curve(y[test_indx], proba[test_indx])
    auc_score = auc(fpr, tpr)
    precision, recall, thr = precision_recall_curve(y[test_indx], proba[test_indx])
    pr_auc = auc(precision, recall, reorder = True)
    # precision/recall have one more entry than thresholds; pad so the
    # rate arrays below stay aligned with `thr`.
    thr = np.hstack([thr, [1.0]])
    pollution = 1.0 - precision
    loss = 1.0 - recall
    fig, axs = plt.subplots(1, 3, figsize=(14, 4))
    plt.suptitle(title)
    axs[0].set_title('ROC curve')
    axs[0].plot([0, 1], [0, 1], '--')
    axs[0].plot(fpr, tpr, label = "AUC = %.2f" % auc_score)
    axs[0].set_xlabel('False Positive Rate')
    axs[0].set_ylabel('True Positive Rate')
    axs[0].legend(loc='lower right')
    axs[1].set_title('Pollution-Loss curve')
    # Reference diagonal from (0, 1) to (1, 0); the original plotted the
    # degenerate single point (1, 0)-(1, 0).
    axs[1].plot([0, 1], [1, 0], '--')
    axs[1].plot(pollution, loss, label = "AUC = %.2f" % pr_auc)
    axs[1].set_xlabel('Pollution Rate')
    axs[1].set_ylabel('Loss Rate')
    axs[1].legend(loc='lower right')
    axs[2].set_title('Pollution/Loss rates')
    axs[2].plot(thr, pollution, label='pollution rate')
    axs[2].plot(thr, loss, label='loss rate')
    axs[2].set_xlabel('threshold')
    axs[2].set_ylabel('rate')
    axs[2].legend(loc = 'upper left')
    plt.show()
    # thr_high = np.min(thr[pollution <= max_pollution_rate])
    # thr_low = np.max(thr[loss <= max_loss_rate])
    ok_high = thr[pollution <= max_pollution_rate]
    ok_low = thr[loss <= max_loss_rate]
    # Guard against an empty selection (no threshold meets a constraint):
    # fall back to the safe degenerate choice where nothing is
    # auto-classified on that side.  The original crashed in np.percentile.
    if ok_high.shape[0] > 0:
        thr_high = np.percentile(ok_high, q=(1.0 - thr_q) * 100.0)
    else:
        thr_high = 1.0
    if ok_low.shape[0] > 0:
        thr_low = np.percentile(ok_low, q=thr_q * 100.0)
    else:
        thr_low = 0.0
    return clf, thr_low, thr_high
In [7]:
from sklearn.ensemble import RandomForestClassifier

# 32 trees, 5-fold split (only the first fold is evaluated inside
# build_predictions).
clf = RandomForestClassifier(
    n_estimators=32,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=7777,
)
build_predictions(clf, X, y, n_folds=5)
Out[7]:
In [8]:
from sklearn.ensemble import RandomForestClassifier

# Same forest (32 trees), smaller hold-out: 10-fold split.
clf = RandomForestClassifier(
    n_estimators=32,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=7777,
)
build_predictions(clf, X, y, n_folds=10)
Out[8]:
In [9]:
from sklearn.ensemble import RandomForestClassifier

# 64 trees, 5-fold split.
clf = RandomForestClassifier(
    n_estimators=64,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=7777,
)
build_predictions(clf, X, y, n_folds=5)
Out[9]:
In [10]:
from sklearn.ensemble import RandomForestClassifier

# 64 trees, 10-fold split.
clf = RandomForestClassifier(
    n_estimators=64,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=7777,
)
build_predictions(clf, X, y, n_folds=10)
Out[10]:
In [11]:
from sklearn.ensemble import RandomForestClassifier

# 96 trees, 10-fold split.
clf = RandomForestClassifier(
    n_estimators=96,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=7777,
)
build_predictions(clf, X, y, n_folds=10)
Out[11]:
In [12]:
from sklearn.ensemble import RandomForestClassifier

# 128 trees, 5-fold split.
clf = RandomForestClassifier(
    n_estimators=128,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=7777,
)
build_predictions(clf, X, y, n_folds=5)
Out[12]:
In [13]:
from sklearn.ensemble import RandomForestClassifier

# 256 trees, 10-fold split.
clf = RandomForestClassifier(
    n_estimators=256,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=7777,
)
build_predictions(clf, X, y, n_folds=10)
Out[13]:
In [14]:
from sklearn.cross_validation import StratifiedKFold
In [15]:
def stream_learning(clf, X, y, n_folds, max_loss_rate, max_pollution_rate, cv_folds = 10):
    """Simulate online certification: data arrives in `n_folds` batches;
    each batch is auto-labelled by a classifier trained on the manually
    checked (rejected) samples accumulated so far.

    Parameters
    ----------
    clf : classifier passed through build_predictions each iteration.
    X, y : full feature matrix and binary labels (1 = good).
    n_folds : number of stream batches (StratifiedKFold assignment).
    max_loss_rate, max_pollution_rate : constraints forwarded to
        build_predictions for threshold selection.
    cv_folds : fold count used inside build_predictions.

    Returns
    -------
    (rejection_rates, loss_rates, pollution_rates) : per-iteration arrays.
    Predictions are coded 1 = auto-good, -1 = auto-bad, 0 = sent to
    manual inspection.
    """
    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=7777).test_folds
    y_predicted = np.zeros(X.shape[0], dtype='int')
    X_train = np.zeros(shape = (0, X.shape[1]))
    y_train = np.zeros(shape = 0)
    rejection_rates = np.zeros(shape=n_folds)
    loss_rates = np.zeros(shape=n_folds)
    pollution_rates = np.zeros(shape=n_folds)
    for iteration in xrange(n_folds):
        indx = np.where(folds == iteration)[0]
        if X_train.shape[0] == 0:
            # First batch: no model yet, everything goes to manual check.
            # (The original wrote `y_predicted[indx] == 0` -- a no-op
            # comparison instead of an assignment.)
            y_predicted[indx] = 0
            X_train = np.vstack([X_train, X[indx, :]])
            y_train = np.hstack([y_train, y[indx]])
            rejection_rates[iteration] = 1.0
            pollution_rates[iteration] = 0.0
            loss_rates[iteration] = 0.0
        else:
            clf, thr_low, thr_high = build_predictions(
                clf, X_train, y_train, n_folds=cv_folds, thr_q = 0.85,
                max_loss_rate = max_loss_rate, max_pollution_rate = max_pollution_rate,
                title='Iteration %d' % iteration
            )
            proba = clf.predict_proba(X[indx, :])[:, 1]
            # Uncertain band -> manual inspection; it also becomes new
            # (manually certified) training data.
            rejected_indx = indx[(proba > thr_low) & (proba < thr_high)]
            y_predicted[indx[proba > thr_high]] = 1
            y_predicted[indx[proba < thr_low]] = -1
            y_predicted[rejected_indx] = 0
            X_train = np.vstack([X_train, X[rejected_indx, :]])
            y_train = np.hstack([y_train, y[rejected_indx]])
            rejection_rates[iteration] = float(rejected_indx.shape[0]) / indx.shape[0]
            ### Unrecognized good lumisections to all good lumisections.
            # Explicit zero-check: the original bare `except:` never fired
            # because NumPy division by a zero int64 yields inf/nan with a
            # warning instead of raising, silently corrupting the rates.
            n_good = np.sum(y[indx] == 1)
            if n_good > 0:
                FN = np.sum((y_predicted[indx] == -1) & (y[indx] == 1))
                loss_rates[iteration] = float(FN) / n_good
            else:
                loss_rates[iteration] = 0.0
            ### Bad lumisections predicted as good, over all auto-accepted.
            n_accepted = np.sum(y_predicted[indx] == 1)
            if n_accepted > 0:
                FP = np.sum((y_predicted[indx] == 1) & (y[indx] == 0))
                pollution_rates[iteration] = float(FP) / n_accepted
            else:
                pollution_rates[iteration] = 0.0
    return rejection_rates, loss_rates, pollution_rates
In [16]:
import itertools

# Scan a 9x9 grid of (loss-rate, pollution-rate) constraint pairs and
# record stream-learning metrics for each pair.
folds = 5
max_lrs = np.linspace(0.0, 0.005, num=9)  # candidate loss-rate constraints
max_prs = np.linspace(0.0, 0.005, num=9)  # candidate pollution-rate constraints
# Averaged-over-iterations metrics, shape (loss constraint, pollution constraint).
arrs = np.ones((max_lrs.shape[0], max_prs.shape[0]))
aprs = np.ones((max_lrs.shape[0], max_prs.shape[0]))
alrs = np.ones((max_lrs.shape[0], max_prs.shape[0]))
# Per-iteration metrics, shape (loss constraint, pollution constraint, fold).
rrs = np.ones((max_lrs.shape[0], max_prs.shape[0], folds))
prs = np.ones((max_lrs.shape[0], max_prs.shape[0], folds))
lrs = np.ones((max_lrs.shape[0], max_prs.shape[0], folds))
for lr_i, lr in enumerate(max_lrs):
    for pr_i, pr in enumerate(max_prs):
        print 'Loss Rate constraint: %.2e' % lr
        print 'Pollution Rate constraint: %.2e' % pr
        # Fresh classifier per constraint pair; fixed seed for reproducibility.
        clf = RandomForestClassifier(n_estimators=128, min_samples_leaf=10, n_jobs=-1, random_state=7777)
        rrs[lr_i, pr_i, :], lrs[lr_i, pr_i, :], prs[lr_i, pr_i, :] = stream_learning(
            clf, X, y, n_folds=folds, cv_folds=7,
            max_loss_rate = lr, max_pollution_rate = pr
        )
        # Per-pair diagnostic: how the three rates evolve over the stream.
        plt.figure(figsize=(8,8))
        plt.title('Stream learning')
        plt.plot(rrs[lr_i, pr_i, :], label='rejection rate')
        plt.plot(lrs[lr_i, pr_i, :], label='loss rate')
        plt.plot(prs[lr_i, pr_i, :], label='pollution rate')
        plt.xlabel('iteration')
        plt.legend()
        plt.show()
        print 'Avg. rejection rate: %.3e' % np.mean(rrs[lr_i, pr_i, :])
        print 'Avg. loss rate: %.3e' % np.mean(lrs[lr_i, pr_i, :])
        print 'Avg. pollution rate: %.3e' % np.mean(prs[lr_i, pr_i, :])
        arrs[lr_i, pr_i] = np.mean(rrs[lr_i, pr_i, :])
        aprs[lr_i, pr_i] = np.mean(prs[lr_i, pr_i, :])
        alrs[lr_i, pr_i] = np.mean(lrs[lr_i, pr_i, :])
In [17]:
import matplotlib

# 'normal' is a font *weight/style*, not a family name: matplotlib has no
# 'normal' family, emits a findfont warning, and silently falls back to
# the default sans-serif.  Name the intended fallback explicitly.
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 16}
matplotlib.rc('font', **font)
In [18]:
plt.figure(figsize=(9, 8))
plt.title('Rejection Rate / Manual work')
# Average over the LAST 3 stream iterations only (skip the warm-up);
# shape (loss constraint, pollution constraint).  NOTE: overwrites the
# full-run `arrs` computed in the scan cell above.
arrs = np.mean(rrs[:, :, -3:], axis=2)
plt.contourf(
    max_prs, max_lrs, arrs,
    levels=np.linspace(np.min(arrs), np.max(arrs), num=11),
    #cmap=plt.cm.Reds
)
plt.ylabel('Loss Rate constraint')
plt.xlabel('Pollution Rate constraint')
plt.colorbar()
Out[18]:
In [19]:
plt.figure(figsize=(9, 8))
plt.title('Average Loss Rate')
# Mean over all stream iterations; shape (loss constraint, pollution
# constraint) matches contourf's expected (len(y), len(x)) layout.
alrs = np.mean(lrs, axis=2)
plt.contourf(
    max_prs, max_lrs, alrs,
    levels=np.linspace(0, 0.05, num=21),
    #cmap=plt.cm.Reds
)
plt.ylabel('Loss Rate constraint')
plt.xlabel('Pollution Rate constraint')
plt.colorbar()
Out[19]:
In [20]:
plt.figure(figsize=(9, 8))
plt.title('Average Pollution Rate')
# Mean over all stream iterations; shape (loss constraint, pollution constraint).
aprs = np.mean(prs, axis=2)
# contourf(x, y, Z) expects Z of shape (len(y), len(x)).  aprs is already
# (len(max_lrs), len(max_prs)), so it must NOT be transposed (the original
# `.T` misaligned the grid), and the axis labels must match the
# coordinate arrays -- now consistent with the two contour plots above.
plt.contourf(
    max_prs, max_lrs, aprs,
    levels=np.linspace(0, 0.005, num=11),
    #cmap=plt.cm.Reds
)
plt.ylabel('Loss Rate constraint')
plt.xlabel('Pollution Rate constraint')
plt.colorbar()
Out[20]:
In [21]:
plt.figure(figsize=(9, 8))
plt.title('Average Loss Rate')
# The original passed `lrs`/`prs` (the 3-D per-fold metric arrays) as
# coordinate grids, which is not a valid contourf input; the constraint
# axes are the 1-D `max_lrs` / `max_prs`.  alrs.T has shape
# (len(max_prs), len(max_lrs)), matching x=max_lrs, y=max_prs and the
# labels below.
plt.contourf(max_lrs, max_prs, alrs.T, cmap=plt.cm.Reds)
plt.xlabel('Loss Rate constraint')
plt.ylabel('Pollution Rate constraint')
plt.colorbar()
In [ ]:
plt.figure(figsize=(9, 8))
plt.title('Average Pollution Rate')
# Same coordinate-grid bug as the cell above: `prs`/`lrs` are 3-D
# per-fold arrays.  Use the 1-D constraint axes; aprs.T has shape
# (len(max_prs), len(max_lrs)), matching x=max_lrs, y=max_prs and the
# labels below.
plt.contourf(max_lrs, max_prs, aprs.T, cmap=plt.cm.Reds)
plt.xlabel('Loss Rate constraint')
plt.ylabel('Pollution Rate constraint')
plt.colorbar()
In [ ]:
# Heat-map view of the rejection rate grid: rows (y) index loss-rate
# constraints, columns (x) pollution-rate constraints, consistent with
# the labels below.
plt.matshow(arrs, cmap=plt.cm.Reds)
plt.xlabel('Pollution rate')
plt.ylabel('Loss rate')
plt.colorbar()
In [ ]: