In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
DATA_PATH = '/mnt/cms/version2/merged.pickle'
In [2]:
import cPickle as pickle
with open(DATA_PATH, 'r') as f:
data = pickle.load(f)
labels = np.load('/mnt/cms/version2/labels.npy')
In [3]:
non_zero_lumi_index = np.where(
(data['_instantLumi_minibias'] != 0.0) | \
(data['_instantLumi_muons'] != 0.0) | \
(data['_instantLumi_photons'] != 0.0)
)[0]
In [4]:
not_features = [
'_luminosityBlock',
'_run',
'_instantLumi_minibias',
'_instantLumi_muons',
'_instantLumi_photons'
]
features = sorted(set(data.columns) - set(not_features))
X = data[features].values
y = labels
weights = data['_instantLumi_minibias'].values
In [5]:
print 'Non-zero luminosity:', non_zero_lumi_index.shape[0], 'lumisections'
In [6]:
def build_predictions(clf, X, y, thr_q = 0.97,
n_folds = 10, max_loss_rate = 0.0, max_pollution_rate = 0.0,
title = ''):
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.cross_validation import StratifiedKFold
cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=7777)
proba = np.zeros(X.shape[0])
train_indx, test_indx = iter(cv).next()
clf.fit(X[train_indx, :], y[train_indx])
proba[test_indx] = clf.predict_proba(X[test_indx, :])[:, 1]
fpr, tpr, thr_roc = roc_curve(y[test_indx], proba[test_indx])
auc_score = auc(fpr, tpr)
precision, recall, thr = precision_recall_curve(y[test_indx], proba[test_indx])
pr_auc = auc(precision, recall, reorder = True)
plt.figure(figsize=(8, 8))
plt.hist([
proba[y_test == 0, 1],
])
thr = np.hstack([thr, [1.0]])
pollution = 1.0 - precision
loss = 1.0 - recall
fs, axs = plt.subplots(1, 3, figsize=(14, 4))
plt.suptitle(title)
axs[0].set_title('ROC curve')
axs[0].plot([0, 1], [0, 1], '--')
axs[0].plot(fpr, tpr, label = "AUC = %.2f" % auc_score)
axs[0].set_xlabel('False Positive Rate')
axs[0].set_ylabel('True Positive Rate')
axs[0].legend(loc='lower right')
axs[1].set_title('Pollution-Loss curve')
axs[1].plot([1, 1], [0, 0], '--')
axs[1].plot(pollution, loss, label = "AUC = %.2f" % pr_auc)
axs[1].set_xlabel('Pollution Rate')
axs[1].set_ylabel('Loss Rate')
axs[1].legend(loc='lower right')
axs[2].set_title('Pollution/Loss rates')
axs[2].plot(thr, pollution, label='pollution rate')
axs[2].plot(thr, loss, label='loss rate')
axs[2].set_xlabel('threshold')
axs[2].set_ylabel('rate')
axs[2].legend(loc = 'upper left')
plt.show()
# thr_high = np.min(thr[pollution <= max_pollution_rate])
# thr_low = np.max(thr[loss <= max_loss_rate])
thr_high = np.percentile(thr[pollution <= max_pollution_rate], q=(1.0 - thr_q) * 100.0)
thr_low = np.percentile(thr[loss <= max_loss_rate], q=thr_q * 100.0)
return clf, thr_low, thr_high
In [12]:
def offline_learning(clf, X, y):
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
proba = clf.predict_proba(X_test)[:, 1]
precision, recall, thr = precision_recall_curve(y_test, proba)
thr = np.hstack([thr, 1.0])
print roc_auc_score(y_test, proba)
print thr.shape
pollution = 1.0 - precision
loss = 1.0 - recall
n = thr[::10].shape[0]
rrs = np.ones(shape=(n, n))
lrs = np.ones(shape=(n, n))
prs = np.ones(shape=(n, n))
TP = np.zeros(shape=n)
TN = np.zeros(shape=n)
FP = np.zeros(shape=n)
FN = np.zeros(shape=n)
FPR = np.zeros(shape=n)
TPR = np.zeros(shape=n)
for i, cut_low in enumerate(thr[::10]):
TN[i] = np.sum((proba <= cut_low) & (y_test == 0))
FN[i] = np.sum((proba <= cut_low) & (y_test == 1))
for i, cut_high in enumerate(thr[::10]):
TP[i] = np.sum((proba >= cut_high) & (y_test == 1))
FP[i] = np.sum((proba >= cut_high) & (y_test == 0))
for i, cut_low in enumerate(thr[::10]):
for j, cut_high in enumerate(thr[::10]):
rrs[i, j] = np.mean((proba > cut_low) & (proba < cut_high))
lrs[i, j] = float(FN[i]) / (TP[j] + FN[i])
prs[i, j] = float(FP[j]) / (TP[j] + FP[i])
return rrs, lrs, prs, thr
In [13]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, min_samples_leaf=10, n_jobs=-1, random_state=7777)
rrs, lrs, prs, thr = offline_learning(clf, X, y)
In [15]:
font = {
'family' : 'normal',
'weight' : 'normal',
'size' : 16
}
import matplotlib
matplotlib.rc('font', **font)
In [16]:
lrs
Out[16]:
In [17]:
prs
Out[17]:
In [18]:
rrs
Out[18]:
In [19]:
cuts_low = thr[::10]
cuts_high = thr[::10]
plt.figure(figsize=(9, 8))
plt.contourf(
cuts_high, cuts_low, rrs,
# levels = np.linspace(0., 1., num=21)
)
plt.title('Rejection Rate')
plt.ylabel('Cut "bad"')
plt.xlabel('Cut "good"')
plt.colorbar()
Out[19]:
In [ ]:
plt.scatter(prs.ravel(), lrs.ravel())
In [52]:
n = 150
plt.figure(figsize=(9, 8))
plt.contourf(
cuts_high[-n:], cuts_low[:n], rrs[:n, -n:],
levels = np.linspace(np.min(rrs[:n, -n:]), np.max(rrs[:n, -n:]), num=21)
)
plt.title('Rejection Rate (the region of interest)')
plt.ylabel('Cut "bad"')
plt.xlabel('Cut "good"')
plt.colorbar()
Out[52]:
In [53]:
plt.figure(figsize=(9, 8))
plt.contourf(
cuts_high, cuts_low, lrs,
levels = np.linspace(np.min(lrs), np.max(lrs), num=21)
)
plt.title('Loss Rate')
plt.ylabel('Cut "bad"')
plt.xlabel('Cut "good"')
plt.colorbar()
Out[53]:
In [57]:
plt.figure(figsize=(9, 8))
plt.contourf(
cuts_high[-n:], cuts_low[:n], lrs[:n, -n:],
levels = np.linspace(np.min(lrs[:n, -n:]), np.max(lrs[:n, -n:]), num=21)
)
plt.title('Loss Rate (the region of interest)')
plt.ylabel('Cut "bad"')
plt.xlabel('Cut "good"')
plt.colorbar()
Out[57]:
In [58]:
plt.figure(figsize=(9, 8))
plt.contourf(
cuts_high[-n:], cuts_low[-n:], lrs[-n:, -n:],
levels = np.linspace(np.min(lrs[-n:, -n:]), np.max(lrs[-n:, -n:]), num=21)
)
plt.title('Loss Rate (upper right)')
plt.ylabel('Cut "bad"')
plt.xlabel('Cut "good"')
plt.colorbar()
Out[58]:
In [59]:
plt.figure(figsize=(9, 8))
plt.contourf(
cuts_high, cuts_low, prs,
levels = np.linspace(np.min(prs), np.max(prs), num=21)
)
plt.title('Pollution Rate')
plt.ylabel('Cut "bad"')
plt.xlabel('Cut "good"')
plt.colorbar()
Out[59]:
In [62]:
plt.figure(figsize=(9, 8))
plt.contourf(
cuts_high[-n:], cuts_low[:n], prs[:n, -n:],
# levels = np.linspace(np.min(prs[:n, -n:]), np.max(prs[:n, -n:]), num=21)
)
plt.title('Pollution Rate (the region of interest)')
plt.ylabel('Cut "bad"')
plt.xlabel('Cut "good"')
plt.colorbar()
Out[62]:
In [63]:
plt.figure(figsize=(9, 8))
plt.contourf(
cuts_high[:n], cuts_low[:n], prs[:n, :n],
levels = np.linspace(np.min(prs[:n, :n]), np.max(prs[:n, :n],), num=21)
)
plt.title('Pollution Rate (lower left)')
plt.ylabel('Cut "bad"')
plt.xlabel('Cut "good"')
plt.colorbar()
Out[63]:
In [67]:
plt.figure(figsize=(9, 8))
plt.contourf(
prs, lrs, rrs,
levels = np.linspace(np.min(rrs), np.max(rrs,), num=21)
)
plt.title('Rejection Rate')
plt.ylabel('Pollution Rate')
plt.xlabel('Loss Rate')
plt.colorbar()
Out[67]:
In [100]:
plt.figure(figsize=(9, 8))
plt.contourf(
prs, lrs, rrs,
levels = np.linspace(np.min(rrs), 0.6, num=41)
)
plt.xlim([0.0, 0.025])
plt.ylim([0.0, 0.025])
plt.title('Rejection Rate')
plt.ylabel('Loss Rate')
plt.xlabel('Pollution Rate')
plt.colorbar()
Out[100]:
In [101]:
plt.figure(figsize=(9, 8))
x = rrs.copy()
x[x <= np.exp(-7)] = np.exp(-7)
x = np.log(x)
plt.contourf(
prs, lrs, x,
levels = np.linspace(np.min(x), np.log(0.5), num=21)
)
plt.xlim([0.0, 0.025])
plt.ylim([0.0, 0.025])
plt.title('Rejection Rate')
plt.ylabel('Loss Rate')
plt.xlabel('Pollution Rate')
cb = plt.colorbar()
cb.set_ticklabels( ['%.1e' % t for t in np.exp(np.linspace(np.min(x), np.log(0.5), num=21))])
In [102]:
plt.figure(figsize=(9, 8))
x = rrs.copy()
x[x <= np.exp(-7)] = np.exp(-7)
x = np.log(x)
plt.contourf(
prs, lrs, x,
levels = np.linspace(np.min(x), np.log(0.5), num=21)
)
plt.xlim([0.0, 0.01])
plt.ylim([0.0, 0.01])
plt.title('Rejection Rate')
plt.ylabel('Loss Rate')
plt.xlabel('Pollution Rate')
cb = plt.colorbar()
cb.set_ticklabels( ['%.1e' % t for t in np.exp(np.linspace(np.min(x), np.log(0.5), num=21))])
In [83]:
plt.figure(figsize=(9, 8))
plt.contourf(
prs, lrs, rrs,
levels = np.linspace(np.min(rrs), np.max(rrs), num=41)
)
plt.xlim([0.0, 0.05])
plt.ylim([0.0, 0.05])
plt.title('Rejection Rate')
plt.ylabel('Loss Rate')
plt.xlabel('Pollution Rate')
plt.colorbar()
Out[83]:
In [64]:
plt.figure(figsize=(9, 8))
plt.title('Rejection Rate')
plt.contourf(alrs, aprs, np.mean(rrs[:, :, -4:], axis=2), cmap=plt.cm.Reds)
plt.xlabel('Average Loss Rate')
plt.ylabel('Averange Pollution Rate')
plt.colorbar()
In [ ]:
plt.figure(figsize=(9, 8))
plt.title('Average Loss Rate')
plt.contourf(lrs, prs, alrs.T, cmap=plt.cm.Reds)
plt.xlabel('Loss Rate constraint')
plt.ylabel('Pollution Rate constraint')
plt.colorbar()
In [ ]:
plt.figure(figsize=(9, 8))
plt.title('Average Pollution Rate')
plt.contourf(prs, lrs, aprs.T, cmap=plt.cm.Reds)
plt.xlabel('Loss Rate constraint')
plt.ylabel('Pollution Rate constraint')
plt.colorbar()
In [ ]:
plt.matshow(arrs, cmap=plt.cm.Reds)
plt.xlabel('Pollution rate')
plt.ylabel('Loss rate')
plt.colorbar()
In [ ]: