In [1]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib

Read prepared B data for track-based and vertex-based tagging



In [2]:

    
import pandas
Bdata_tracks = pandas.read_csv('models/Bdata_tracks.csv')



In [3]:

    
Bdata_tracks.head()









    Out[3]:






  
    
      
      Bsign
      Bweight
      event_id
      track_relation_prob
    
  
  
    
      0
       1
       1.091776
       111761_12239990
       0.959871
    
    
      1
       1
      -0.417194
       111761_16432326
       2.426043
    
    
      2
      -1
       1.044602
       111761_29035939
       1.362860
    
    
      3
      -1
       1.062837
       111761_30938577
       0.543403
    
    
      4
       1
       1.076036
        111761_4009818
       0.992681



In [6]:

    
Bdata_vertex = pandas.read_csv('models/Bdata_vertex.csv')



In [7]:

    
Bdata_vertex.head()









    Out[7]:






  
    
      
      Bsign
      Bweight
      event_id
      vertex_relation_prob
    
  
  
    
      0
       1
       1.091776
       111761_12239990
       1.410856
    
    
      1
       1
      -0.237732
       111761_14379738
       0.360223
    
    
      2
       1
      -0.442830
       111761_33866816
       1.278433
    
    
      3
      -1
       0.991477
       111761_43041334
       0.928330
    
    
      4
      -1
       1.091055
       111761_48273537
       0.821604

Merge two datasets



In [8]:

    
Bdata = pandas.merge(Bdata_tracks, Bdata_vertex, how='outer', on=['event_id', 'Bsign'])



In [9]:

    
Bdata.head()









    Out[9]:






  
    
      
      Bsign
      Bweight_x
      event_id
      track_relation_prob
      Bweight_y
      vertex_relation_prob
    
  
  
    
      0
       1
       1.091776
       111761_12239990
       0.959871
       1.091776
       1.410856
    
    
      1
       1
      -0.417194
       111761_16432326
       2.426043
            NaN
            NaN
    
    
      2
      -1
       1.044602
       111761_29035939
       1.362860
            NaN
            NaN
    
    
      3
      -1
       1.062837
       111761_30938577
       0.543403
            NaN
            NaN
    
    
      4
       1
       1.076036
        111761_4009818
       0.992681
            NaN
            NaN

Obtain one weight column



In [10]:

    
Bdata['Bweight'] = Bdata['Bweight_x'].copy()
Bdata.ix[numpy.isnan(Bdata['Bweight'].values), 'Bweight'] = Bdata.ix[numpy.isnan(Bdata['Bweight'].values), 'Bweight_y']
Bdata = Bdata.drop(['Bweight_x', 'Bweight_y'], axis=1)

# for Nan put 1 as non influence factor
Bdata.ix[numpy.isnan(Bdata.track_relation_prob.values), 'track_relation_prob'] = 1.
Bdata.ix[numpy.isnan(Bdata.vertex_relation_prob.values), 'vertex_relation_prob'] = 1.



In [11]:

    
Bdata.head()









    Out[11]:






  
    
      
      Bsign
      event_id
      track_relation_prob
      vertex_relation_prob
      Bweight
    
  
  
    
      0
       1
       111761_12239990
       0.959871
       1.410856
       1.091776
    
    
      1
       1
       111761_16432326
       2.426043
       1.000000
      -0.417194
    
    
      2
      -1
       111761_29035939
       1.362860
       1.000000
       1.044602
    
    
      3
      -1
       111761_30938577
       0.543403
       1.000000
       1.062837
    
    
      4
       1
        111761_4009818
       0.992681
       1.000000
       1.076036



In [12]:

    
relation_prob = Bdata['track_relation_prob'].values * Bdata['vertex_relation_prob'].values
Bprob = relation_prob / (1 + relation_prob)
Bweight = Bdata.Bweight.values
Bsign = Bdata.Bsign.values



In [13]:

    
Bprob[~numpy.isfinite(Bprob)] = 0.5



In [14]:

    
sum(Bweight[Bsign == 1]), sum(Bweight[Bsign == -1])









    Out[14]:





(293224.69221025845, 284651.76733395597)



In [15]:

    
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(Bsign < 0, (Bprob - 0.5) * Bsign, sample_weight=Bweight)



In [16]:

    
plot(fpr, tpr)
plot([0, 1], [0, 1], 'k--')









    Out[16]:





[<matplotlib.lines.Line2D at 0x96ad9d0>]

2-folding calibration by isotonic



In [17]:

    
from utils import calibrate_probs
Bprob_calibrated, (iso_reg1, iso_reg2) = calibrate_probs(Bsign, Bweight, Bprob,
                                                         symmetrize=True, return_calibrator=True)

Add some small noise in prediction for stability



In [33]:

    
Bprob_calibrated += numpy.random.normal(size=len(Bprob_calibrated)) * 0.001



In [34]:

    
figure(figsize=(15, 5))

subplot(1,2,1)
hist(Bprob[Bsign == 1], weights=Bweight[Bsign == 1], bins=60, alpha=0.2, normed=True, label='$B^+$')
hist(Bprob[Bsign == -1], weights=Bweight[Bsign == -1], bins=60, alpha=0.2, normed=True, label='$B^-$')
legend(), title('B probs')

subplot(1,2,2)
hist(Bprob_calibrated[Bsign == 1], weights=Bweight[Bsign == 1], bins=60, alpha=0.2, 
     normed=True, range=(0, 1), label='$B^+$')
hist(Bprob_calibrated[Bsign == -1], weights=Bweight[Bsign == -1], bins=60, alpha=0.2,
     normed=True, range=(0, 1), label='$B^-$')
legend(), title('B probs calibrated')
plt.savefig('img/Bprob_iso_calibrated.png' , format='png')

AUC score and ROC curve for B+ vs B-



In [18]:

    
from utils import calculate_auc_with_and_without_untag_events
from sklearn.metrics import roc_curve

auc, auc_full = calculate_auc_with_and_without_untag_events(Bsign, Bprob_calibrated, Bweight)
print 'AUC for tagged:', auc, 'AUC with untag:', auc_full

fpr, tpr, _ = roc_curve(Bsign, Bprob_calibrated, sample_weight=Bweight)
plot(fpr, tpr)
plot([0, 1], [0, 1], 'k--')
ylim(0, 1), xlim(0, 1)









    



AUC for tagged: 0.596445335644 AUC with untag: 0.579186584116






    Out[18]:





((0, 1), (0, 1))

Symmetry $B^+$ vs $B^-$ checking

before calibration



In [19]:

    
figsize(12, 10)
for sign in [-1, 1]:
    hist(sign * (Bprob[Bsign == sign] - 0.5), bins=101, normed=True, alpha=0.2, 
         weights=Bweight[Bsign == sign], range=(-0.5, 0.5), label='$B^-$' if sign == -1 else '$B^+$')
legend(), title('Symmetry of $p(B^+)$ for $B^+$ and $B^-$, before calibration')









    Out[19]:





(<matplotlib.legend.Legend at 0x71f9650>, <matplotlib.text.Text at 0x720b1d0>)

KS distance



In [20]:

    
fpr, tpr, _ = roc_curve(Bsign, (Bprob - 0.5) * Bsign, sample_weight=Bweight)



In [21]:

    
'KS distance', max(abs(fpr - tpr))









    Out[21]:





('KS distance', 0.016633086790292295)



In [22]:

    
figsize(6, 5)
plot(fpr, tpr), grid(), xlim(0, 1)









    Out[22]:





([<matplotlib.lines.Line2D at 0x7387c90>], None, (0, 1))

after calibration



In [23]:

    
figsize(12, 10)
for sign in [-1, 1]:
    hist(sign * (Bprob_calibrated[Bsign == sign] - 0.5), bins=101, normed=True, alpha=0.2,
         weights=Bweight[Bsign == sign], range=(-0.5, 0.5), label='$B^-$' if sign == -1 else '$B^+$')
legend(), title('Symmetry of $p(B^+)$ for $B^+$ and $B^-$, after calibration')









    Out[23]:





(<matplotlib.legend.Legend at 0x869f4d0>, <matplotlib.text.Text at 0x8445d10>)

KS distance



In [24]:

    
fpr, tpr, _ = roc_curve(Bsign, (Bprob_calibrated - 0.5) * Bsign, sample_weight=Bweight)



In [25]:

    
'KS distance', max(abs(fpr - tpr))









    Out[25]:





('KS distance', 0.017322610402452532)



In [26]:

    
figsize(6, 5)
plot(fpr, tpr), grid(), xlim(0, 1)









    Out[26]:





([<matplotlib.lines.Line2D at 0xc69e5d0>], None, (0, 1))

D2 estimation with bootstrap calibration



In [28]:

    
from utils import get_N_B_events, bootstrap_calibrate_prob, result_table

N_B_passed = Bweight.sum()
tagging_efficiency = N_B_passed / get_N_B_events()
tagging_efficiency_delta = numpy.sqrt(N_B_passed) / get_N_B_events()

D2, aucs = bootstrap_calibrate_prob(Bsign, Bweight, Bprob, symmetrize=True)
print 'AUC', numpy.mean(aucs), numpy.var(aucs)

result = result_table(tagging_efficiency, tagging_efficiency_delta, D2, auc_full, 'Inclusive tagging')



In [20]:

    
result









    Out[20]:






  
    
      
      name
      $\epsilon_{tag}, \%$
      $\Delta \epsilon_{tag}, \%$
      $D^2$
      $\Delta D^2$
      $\epsilon, \%$
      $\Delta \epsilon, \%$
      AUC, with untag
      $\Delta$ AUC, with untag
    
  
  
    
      0
       Inclusive tagging
       77.789955
       0.102331
       0.034494
       0.000457
       2.683308
       0.035762
       57.925763
       0



In [21]:

    
result.to_csv('img/new-tagging.csv', header=True, index=False)

Some plots



In [22]:

    
x = numpy.linspace(0, 1, 100)
plot(x, -(iso_reg1.transform((1-x)) + iso_reg2.transform((1-x))) / 2 + 1, label='isotonic transformation reverse')
plot(x, (iso_reg1.transform(x) + iso_reg2.transform(x)) / 2, label='isotonic transformation')
legend(loc='best')
plot([0, 1], [0, 1], "k--")
xlabel('B prob'), ylabel('B prob calibrated')
plt.savefig('img/iso_transformation.png' , format='png')



In [33]:

    
from utils import get_N_B_events, compute_mistag



In [24]:

    
bins = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
percentile_bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]

before calibration



In [34]:

    
figsize(12, 10)
compute_mistag(Bprob, Bsign, Bweight, Bsign > -100, label="$B$", bins=bins)
compute_mistag(Bprob, Bsign, Bweight, Bsign == 1, label="$B^+$", bins=bins)
compute_mistag(Bprob, Bsign, Bweight, Bsign == -1, label="$B^-$", bins=bins)
legend(loc='best')
title('B prob, uniform bins'), xlabel('mistag probability'), ylabel('true mistag probability')
plt.savefig('img/Bprob_calibration_check_uniform.png' , format='png')









    



/moosefs/ipython_env/local/lib/python2.7/site-packages/matplotlib/collections.py:548: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == 'face':



In [35]:

    
compute_mistag(Bprob, Bsign, Bweight, Bsign > -100, label="$B$", uniform=False, bins=percentile_bins)
compute_mistag(Bprob, Bsign, Bweight, Bsign == 1, label="$B^+$", uniform=False, bins=percentile_bins)
compute_mistag(Bprob, Bsign, Bweight, Bsign == -1, label="$B^-$", uniform=False, bins=percentile_bins)
legend(loc='best')
title('B prob, percentile bins'), xlabel('mistag probability'), ylabel('true mistag probability')
plt.savefig('img/Bprob_calibration_check_percentile.png' , format='png')

after calibration



In [36]:

    
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign > -100, label="$B$", bins=bins)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign == 1, label="$B^+$", bins=bins)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign == -1, label="$B^-$", bins=bins)
legend(loc='best')
title('B prob isotonic calibrated, uniform bins'), xlabel('mistag probability'), ylabel('true mistag probability')
plt.savefig('img/Bprob_calibration_check_iso_uniform.png' , format='png')



In [37]:

    
figsize(12, 10)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign > -100, label="$B$", uniform=False,
               bins=percentile_bins)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign == 1, label="$B^+$", uniform=False, 
               bins=percentile_bins)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign == -1, label="$B^-$", uniform=False, 
               bins=percentile_bins)
legend(loc='best'),  xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob isotonic calibrated, percentile bins')
plt.savefig('img/Bprob_calibration_check_iso_percentile.png' , format='png')



In [38]:

    
print numpy.average((2*(Bprob - 0.5))**2, weights=Bweight) * tagging_efficiency * 100
print numpy.average((2*(Bprob_calibrated - 0.5))**2, weights=Bweight) * Bweight.sum() / get_N_B_events() * 100









    



2.95787327676
2.68691291397

	Bsign	Bweight	event_id	track_relation_prob
0	1	1.091776	111761_12239990	0.959871
1	1	-0.417194	111761_16432326	2.426043
2	-1	1.044602	111761_29035939	1.362860
3	-1	1.062837	111761_30938577	0.543403
4	1	1.076036	111761_4009818	0.992681

	Bsign	Bweight	event_id	vertex_relation_prob
0	1	1.091776	111761_12239990	1.410856
1	1	-0.237732	111761_14379738	0.360223
2	1	-0.442830	111761_33866816	1.278433
3	-1	0.991477	111761_43041334	0.928330
4	-1	1.091055	111761_48273537	0.821604