In [1]:
%pylab inline
In [2]:
matplotlib.rcParams.update({'font.size': 14})
In [3]:
import pandas
Bdata_tracks = pandas.read_csv('models/Bdata_tracks_MC.csv')
Bdata_vertex = pandas.read_csv('models/Bdata_vertex_MC_sqrt.csv')
Bdata_vertex_mike = pandas.read_csv('models/Bdata_vertices_MC_Mike.csv')
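Each file stores per-event tagger outputs on simulation, keyed by event_id and the true Bsign, with an event weight Bweight and the tagger odds (track_relation_prob, vertex_relation_prob); the third file is an alternative version of the vertex tagging.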
In [4]:
Bdata_tracks.head()
Out[4]:
In [5]:
Bdata_vertex.head()
Out[5]:
In [6]:
Bdata_vertex_mike.head()
Out[6]:
In [7]:
def prepare_B_data(Bdata_tracks, Bdata_vertex, only_track=False):
    Bdata = pandas.merge(Bdata_tracks, Bdata_vertex, how='outer', on=['event_id', 'Bsign'])
    # after the outer merge the event weight may come from either table; take whichever is defined
    Bdata['Bweight'] = Bdata['Bweight_x'].copy()
    Bdata.ix[numpy.isnan(Bdata['Bweight'].values), 'Bweight'] = Bdata.ix[numpy.isnan(Bdata['Bweight'].values), 'Bweight_y']
    Bdata = Bdata.drop(['Bweight_x', 'Bweight_y'], axis=1)
    # replace NaN / infinite odds with 1, i.e. neutral odds (P = 0.5), so they do not affect the combination
    Bdata.ix[numpy.isnan(Bdata.track_relation_prob.values), 'track_relation_prob'] = 1.
    Bdata.ix[numpy.isnan(Bdata.vertex_relation_prob.values), 'vertex_relation_prob'] = 1.
    Bdata.ix[~numpy.isfinite(Bdata.track_relation_prob.values), 'track_relation_prob'] = 1.
    Bdata.ix[~numpy.isfinite(Bdata.vertex_relation_prob.values), 'vertex_relation_prob'] = 1.
    # combine the taggers by multiplying their odds, then convert the odds back to a probability
    if only_track:
        relation_prob = Bdata['track_relation_prob'].values
    else:
        relation_prob = Bdata['track_relation_prob'].values * Bdata['vertex_relation_prob'].values
    Bprob = relation_prob / (1 + relation_prob)
    Bweight = Bdata.Bweight.values
    Bsign = Bdata.Bsign.values
    return Bdata, Bsign, Bprob, Bweight
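Here relation_prob is an odds ratio p / (1 - p): multiplying the track and vertex odds combines the two taggers (effectively a naive-Bayes combination, assuming independence), and Bprob = odds / (1 + odds) converts back to a probability. A toy round-trip check, illustrative only and not part of the original run:
In [ ]:
# illustration only: the odds <-> probability conversion used in prepare_B_data
p = 0.7
odds = p / (1 - p)                         # what relation_prob stores
assert abs(odds / (1 + odds) - p) < 1e-12  # Bprob recovers p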
In [10]:
Bdata, Bsign, Bprob, Bweight = prepare_B_data(Bdata_tracks, Bdata_vertex)
Bdata_m, Bsign_m, Bprob_m, Bweight_m = prepare_B_data(Bdata_tracks, Bdata_vertex_mike)
In [11]:
Bdata_track, Bsign_track, Bprob_track, Bweight_track = prepare_B_data(Bdata_tracks, Bdata_vertex, only_track=True)
In [12]:
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
from rep.metaml import FoldingClassifier
In [19]:
# augment the training set with a mirrored copy: flip the B sign and invert each tagger's odds,
# which corresponds to p -> 1 - p, so the classifier is trained on a charge-symmetric sample
Bdata_symmetric = Bdata_m.copy()
Bdata_symmetric.Bsign *= -1
p1 = 1 - Bdata_symmetric.track_relation_prob / (1. + Bdata_symmetric.track_relation_prob)
Bdata_symmetric.track_relation_prob = p1 / (1 - p1)
p2 = 1 - Bdata_symmetric.vertex_relation_prob / (1. + Bdata_symmetric.vertex_relation_prob)
Bdata_symmetric.vertex_relation_prob = p2 / (1 - p2)
new_Bdata = pandas.concat([Bdata_m, Bdata_symmetric])
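Inverting the odds is exactly the p -> 1 - p mirror: if odds = p / (1 - p), the mirrored probability 1 - p has odds (1 - p) / p = 1 / odds. A quick check, illustrative only:
In [ ]:
# illustration only: the symmetrization above maps each tagger's odds to its reciprocal
odds = 2.5
p = odds / (1 + odds)
p_mirrored = 1 - p
assert abs(p_mirrored / (1 - p_mirrored) - 1 / odds) < 1e-12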
In [20]:
tt_base = DecisionTrainClassifier(learning_rate=0.01, n_estimators=4000, depth=3,
                                  max_features=2, n_threads=12)
# the 'name: expression' entries define engineered features: the product and the sum of the two odds
tt_folding = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                               features=['track_relation_prob', 'vertex_relation_prob',
                                         'n1: track_relation_prob * vertex_relation_prob',
                                         'n2: track_relation_prob + vertex_relation_prob'])
tt_folding.fit(new_Bdata, new_Bdata.Bsign > 0)
Out[20]:
In [21]:
# new_Bdata is concat([Bdata_m, Bdata_symmetric]), so the first len(Bdata_m) rows are the original events
p = tt_folding.predict_proba(new_Bdata)[:, 1][:len(Bdata_m)]
In [22]:
report = tt_folding.test_on(Bdata_m, Bdata_m.Bsign > 0)
In [23]:
from rep.report.metrics import RocAuc
report.learning_curve(RocAuc(), steps=1)
Out[23]:
In [18]:
from sklearn.metrics import roc_auc_score
print 'track', roc_auc_score(Bdata.Bsign > 0, Bdata.track_relation_prob / (1 + Bdata.track_relation_prob))
print 'vertex', roc_auc_score(Bdata.Bsign > 0, Bdata.vertex_relation_prob / (1 + Bdata.vertex_relation_prob))
print 'combining', roc_auc_score(Bdata.Bsign > 0, p)
print 'simple combining', roc_auc_score(Bdata.Bsign > 0, Bprob)
In [24]:
from sklearn.metrics import roc_auc_score
print 'track', roc_auc_score(Bdata_m.Bsign > 0, Bdata_m.track_relation_prob / (1 + Bdata_m.track_relation_prob))
print 'vertex', roc_auc_score(Bdata_m.Bsign > 0, Bdata_m.vertex_relation_prob / (1 + Bdata_m.vertex_relation_prob))
print 'combining', roc_auc_score(Bdata_m.Bsign > 0, p)
print 'simple combining', roc_auc_score(Bdata_m.Bsign > 0, Bprob)
In [15]:
from utils import calibrate_probs
Bprob_calibrated, calibrator = calibrate_probs(Bsign, Bweight, Bprob, symmetrize=True)
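utils.calibrate_probs is not shown in this notebook; the sketch below illustrates one plausible symmetrization scheme with sklearn's isotonic regression (the function name and details are assumptions, not the actual utils implementation):
In [ ]:
# sketch only, NOT the utils implementation: a charge-symmetric isotonic calibration
# can be obtained by fitting on the scores together with their mirrored copies
import numpy
from sklearn.isotonic import IsotonicRegression

def symmetrized_isotonic_sketch(signs, weights, probs):
    iso = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
    scores = numpy.concatenate([probs, 1 - probs])                      # original + mirrored scores
    labels = numpy.concatenate([signs > 0, signs < 0]).astype(float)    # labels mirrored accordingly
    sample_w = numpy.concatenate([weights, weights])
    iso.fit(scores, labels, sample_weight=sample_w)
    return iso.transform(probs), iso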
In [47]:
Bprob_calibrated_m, _ = calibrate_probs(Bsign_m, Bweight_m, Bprob_m, symmetrize=True)
In [95]:
# Bprob_dt_calibrated, _ = calibrate_probs(Bsign, Bweight, p, logistic=True, symmetrize=True, return_calibrator=True)
In [17]:
Bprob_track_calibrated, _ = calibrate_probs(Bsign_track, Bweight_track, Bprob_track, symmetrize=True)
In [18]:
figure(figsize=(20, 7))
subplot(1, 2, 1)
hist(Bprob[Bsign == 1], weights=Bweight[Bsign == 1], bins=60, alpha=0.2, label='$B^+$')
hist(Bprob[Bsign == -1], weights=Bweight[Bsign == -1], bins=60, alpha=0.2, label='$B^-$')
legend(), xlabel('$P(B^+)$'), ylabel('events / 0.017')
xticks(fontsize=18), yticks(fontsize=18)
subplot(1, 2, 2)
hist(Bprob_calibrated[Bsign == 1], weights=Bweight[Bsign == 1], bins=80, alpha=0.2,
     range=(0, 1), label='$B^+$')
hist(Bprob_calibrated[Bsign == -1], weights=Bweight[Bsign == -1], bins=80, alpha=0.2,
     range=(0, 1), label='$B^-$')
ylim(0, )
xticks(fontsize=18), yticks(fontsize=18)
legend(), xlabel('calibrated $P(B^+)$'), ylabel('events / 0.0125')
# plt.savefig('img/Bprob_iso_calibrated_PID_less.png', format='png')
# plt.savefig('img/paper_B_prob.png', dpi=300, format='png', bbox_inches='tight')
Out[18]:
In [19]:
import json
with open('models/JPsiKMC.json', 'r') as f:
    N_B = json.load(f)['N_B_events']
N_B
Out[19]:
In [20]:
from utils import calculate_auc_with_and_without_untag_events, calculate_roc_with_untag_events
from sklearn.metrics import roc_curve
auc, auc_full = calculate_auc_with_and_without_untag_events(Bsign, Bprob_calibrated, Bweight, N_B)
print 'AUC for tagged:', auc, 'AUC with untag:', auc_full
figure(figsize=(12, 10))
fpr, tpr, _ = calculate_roc_with_untag_events(Bsign, Bprob_calibrated, Bweight, N_B)
plot(fpr, tpr, linewidth=2)
plot([0, 1], [0, 1], 'k--')
ylim(0, 1), xlim(0, 1)
xlabel('False positive rate (FPR)', fontsize=16)
ylabel('True positive rate (TPR)', fontsize=16)
grid()
# plt.savefig('img/poster_B_roc.png' , dpi=700, format='png', transparent=True)
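The gap between the two AUC values reflects the untagged events: calculate_roc_with_untag_events presumably counts the N_B - N_B_passed events without a tagging decision as random (P = 0.5) answers, which pulls the ROC curve toward the diagonal.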
In [21]:
auc_m, auc_full_m = calculate_auc_with_and_without_untag_events(Bsign_m, Bprob_calibrated_m, Bweight_m, N_B)
print 'AUC for tagged:', auc_m, 'AUC with untag:', auc_full_m
figure(figsize=(12, 10))
fpr, tpr, _ = calculate_roc_with_untag_events(Bsign_m, Bprob_calibrated_m, Bweight_m, N_B)
plot(fpr, tpr, linewidth=2)
plot([0, 1], [0, 1], 'k--')
ylim(0, 1), xlim(0, 1)
xlabel('False positive rate (FPR)', fontsize=16)
ylabel('True positive rate (TPR)', fontsize=16)
grid()
# plt.savefig('img/poster_B_roc.png' , dpi=700, format='png', transparent=True)
In [22]:
figsize(12, 8)
for sign in [-1, 1]:
    hist(sign * (Bprob[Bsign == sign] - 0.5), bins=101, normed=True, alpha=0.2,
         weights=Bweight[Bsign == sign], range=(-0.5, 0.5), label='$B^-$' if sign == -1 else '$B^+$')
legend(), title('Symmetry of $p(B^+)$ for $B^+$ and $B^-$, before calibration')
Out[22]:
In [23]:
fpr, tpr, _ = roc_curve(Bsign, (Bprob - 0.5) * Bsign, sample_weight=Bweight)
In [24]:
'KS distance', max(abs(fpr - tpr))
Out[24]:
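For a perfectly charge-symmetric tagger the score (Bprob - 0.5) * Bsign carries no information about the true sign, so this ROC curve should follow the diagonal: the KS distance max |TPR - FPR| measures the residual asymmetry, and the corresponding AUC (computed below) should be close to 0.5.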
In [25]:
plot(fpr, tpr), grid()
plot([0, 1], [0, 1], 'k--')
xlim(0, 1), ylim(0, 1)
Out[25]:
In [26]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Bsign, (Bprob - 0.5) * Bsign, sample_weight=Bweight)
Out[26]:
In [27]:
figsize(12, 8)
for sign in [-1, 1]:
    hist(sign * (Bprob_calibrated[Bsign == sign] - 0.5) + 0.5, bins=101, alpha=0.2,
         weights=Bweight[Bsign == sign], range=(0, 1), normed=True, label='$B^-$' if sign == -1 else '$B^+$')
legend(fontsize=32); ylim(0, ), xlim(0, 1)
xticks(fontsize=18), yticks(fontsize=18)
xlabel('$P($correct class$)$', fontsize=26)
# plt.savefig('img/paper_symmetry.png' , dpi=300, format='png', bbox_inches='tight')
Out[27]:
In [28]:
figsize(12, 8)
for sign in [-1, 1]:
    hist(sign * (Bprob_calibrated_m[Bsign_m == sign] - 0.5) + 0.5, bins=101, alpha=0.2,
         weights=Bweight_m[Bsign_m == sign], range=(0, 1), normed=True, label='$B^-$' if sign == -1 else '$B^+$')
legend(fontsize=32); ylim(0, ), xlim(0, 1)
xticks(fontsize=18), yticks(fontsize=18)
xlabel('$P($correct class$)$', fontsize=26)
# plt.savefig('img/paper_symmetry.png' , dpi=300, format='png', bbox_inches='tight')
Out[28]:
In [29]:
fpr, tpr, _ = roc_curve(Bsign, (Bprob_calibrated - 0.5) * Bsign, sample_weight=Bweight)
In [30]:
'KS distance', max(abs(fpr - tpr))
Out[30]:
In [31]:
plot(fpr, tpr), grid()
plot([0, 1], [0, 1], 'k--')
xlim(0, 1), ylim(0, 1)
Out[31]:
In [32]:
roc_auc_score(Bsign, (Bprob_calibrated - 0.5) * Bsign, sample_weight=Bweight)
Out[32]:
In [33]:
roc_auc_score(Bsign_m, (Bprob_calibrated_m - 0.5) * Bsign_m, sample_weight=Bweight_m)
Out[33]:
In [33]:
from utils import result_table
In [34]:
from utils import get_N_B_events, bootstrap_calibrate_prob, result_table
N_B_passed = Bweight.sum()
tagging_efficiency = N_B_passed / N_B
tagging_efficiency_delta = numpy.sqrt(N_B_passed) / N_B
D2, aucs = bootstrap_calibrate_prob(Bsign, Bweight, Bprob, symmetrize=True, n_calibrations=30)
print 'AUC', numpy.mean(aucs), numpy.var(aucs)
result = result_table(tagging_efficiency, tagging_efficiency_delta, D2, auc_full, 'Inclusive tagging (track+vertex)')
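result_table collects the standard tagging summary: the tagging efficiency eps_tag with its uncertainty, the bootstrapped <D^2> from the repeated calibrations (dilution D = 2 * P(correct) - 1), and the AUC including untagged events; the effective tagging efficiency is eps_eff = eps_tag * <D^2>.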
In [ ]:
D2, aucs = bootstrap_calibrate_prob(Bsign_m, Bweight_m, Bprob_m, symmetrize=True, n_calibrations=30)
print 'AUC', numpy.mean(aucs), numpy.var(aucs)
result_m = result_table(tagging_efficiency, tagging_efficiency_delta, D2, auc_full_m, 'Inclusive tagging (track+vertex) new vertex')
In [36]:
pandas.concat([result, result_m])
Out[36]:
In [45]:
# note: result_dt and result_track come from earlier runs (the DT combination and the
# track-only tagging) and are not recomputed in the cells above
pandas.concat([result, result_dt, result_track])
Out[45]:
In [37]:
# iso_reg1 / iso_reg2: presumably the two isotonic regressions inside the fitted calibrator returned by calibrate_probs
x = numpy.linspace(0, 1, 100)
plot(x, 1 - (iso_reg1.transform(1 - x) + iso_reg2.transform(1 - x)) / 2, label='isotonic transformation reversed')
plot(x, (iso_reg1.transform(x) + iso_reg2.transform(x)) / 2, label='isotonic transformation')
legend(loc='best')
plot([0, 1], [0, 1], "k--")
xlabel('B prob'), ylabel('B prob calibrated')
# plt.savefig('img/iso_transformation_PID_less.png', format='png')
Out[37]:
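The two curves coincide exactly when the calibration map f is charge-symmetric, i.e. f(1 - x) = 1 - f(x): the 'reversed' curve plots 1 - f(1 - x), so any gap between them signals residual asymmetry in the calibration.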
In [34]:
from utils import get_N_B_events, compute_mistag
In [35]:
bins = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
percentile_bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
In [36]:
figsize(12, 10)
compute_mistag(Bprob, Bsign, Bweight, Bsign > -100, label="$B$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob, Bsign, Bweight, Bsign == 1, label="$B^+$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob, Bsign, Bweight, Bsign == -1, label="$B^-$", uniform=False,
               bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob, percentile bins')
Out[36]:
In [37]:
figsize(12, 10)
compute_mistag(Bprob_track, Bsign_track, Bweight_track, Bsign_track > -100, label="$B$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_track, Bsign_track, Bweight_track, Bsign_track == 1, label="$B^+$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_track, Bsign_track, Bweight_track, Bsign_track == -1, label="$B^-$", uniform=False,
               bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob, percentile bins')
Out[37]:
In [48]:
figsize(12, 10)
compute_mistag(Bprob_m, Bsign_m, Bweight_m, Bsign_m > -100, label="$B$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_m, Bsign_m, Bweight_m, Bsign_m == 1, label="$B^+$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_m, Bsign_m, Bweight_m, Bsign_m == -1, label="$B^-$", uniform=False,
               bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob, percentile bins')
Out[48]:
In [39]:
figsize(12, 10)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign > -100, label="$B$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign == 1, label="$B^+$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign == -1, label="$B^-$", uniform=False,
               bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob isotonic calibrated, percentile bins')
Out[39]:
In [40]:
figsize(12, 10)
compute_mistag(Bprob_track_calibrated, Bsign_track, Bweight_track, Bsign_track > -100, label="$B$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_track_calibrated, Bsign_track, Bweight_track, Bsign_track == 1, label="$B^+$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_track_calibrated, Bsign_track, Bweight_track, Bsign_track == -1, label="$B^-$", uniform=False,
               bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob isotonic calibrated, percentile bins')
Out[40]:
In [41]:
figsize(12, 10)
compute_mistag(Bprob_calibrated_m, Bsign_m, Bweight_m, Bsign_m > -100, label="$B$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_calibrated_m, Bsign_m, Bweight_m, Bsign_m == 1, label="$B^+$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_calibrated_m, Bsign_m, Bweight_m, Bsign_m == -1, label="$B^-$", uniform=False,
               bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob isotonic calibrated, percentile bins')
Out[41]:
In [43]:
N_B_passed = Bweight.sum()
tagging_efficiency = N_B_passed / N_B
tagging_efficiency_delta = numpy.sqrt(N_B_passed) / N_B
In [49]:
# effective tagging efficiency eps_eff = eps_tag * <D^2> (in percent), with dilution D = 2 * P(correct) - 1
print numpy.average((2 * (Bprob_calibrated - 0.5)) ** 2, weights=Bweight) * tagging_efficiency * 100
print numpy.average((2 * (Bprob_calibrated_m - 0.5)) ** 2, weights=Bweight_m) * tagging_efficiency * 100
print numpy.average((2 * (Bprob_track_calibrated - 0.5)) ** 2, weights=Bweight_track) * tagging_efficiency * 100
In [50]:
roc_auc_score(Bsign_m, Bprob_calibrated_m, sample_weight=Bweight_m)
Out[50]:
In [37]:
# tagging power before vs. after calibration
print numpy.average((2 * (Bprob - 0.5)) ** 2, weights=Bweight) * tagging_efficiency * 100
print numpy.average((2 * (Bprob_calibrated - 0.5)) ** 2, weights=Bweight) * tagging_efficiency * 100