About

Vertex-based tagging: the vertices are constructed in a different way, following the LHCb jet-tagging approach.


In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [ ]:
import sys
sys.path.insert(0, "../")

Import


In [2]:
import pandas
import numpy

from sklearn.metrics import roc_curve, roc_auc_score

from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc

In [3]:
from utils import get_N_B_events, get_events_number, get_events_statistics

Reading initial data


In [4]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('../datasets/MC/csv/Bu_JPsiK/Vertices_Mike.root'))
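
root_numpy.root2array reads the ROOT TTree into a numpy structured array, which pandas then wraps into a DataFrame. If only a couple of branches were needed, the read could be restricted (a sketch; the branch names are taken from the column list shown further below):

# sketch: read only two branches instead of the whole tree
arr = root_numpy.root2array('../datasets/MC/csv/Bu_JPsiK/Vertices_Mike.root',
                            branches=['signB', 'signVtx'])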

In [5]:
data.head()


Out[5]:
run event No mult nnkrec Bmass ptB vflag ptmean ipsmean ... W_M_BDphiDir M_svtau W_M_svtau M_pointtheta W_M_pointtheta docamax signB signVtx N_sig_sw BOosc
0 2517778 1110824 0 1 4 5.27489 11.70600 2 1.564780 8.62936 ... -1.675360 1.024920 166.444000 0.067733 0.012434 0.000000 1 1 0 0
1 2517778 1110839 0 1 3 5.28328 3.88046 4 0.579329 7.54620 ... -2.229240 1.807200 15.986400 0.011598 0.098215 0.016291 -1 1 0 1
2 2517778 1110869 0 1 1 5.27815 2.78838 2 1.764510 5.58850 ... 0.507730 0.484238 128503.000000 0.190129 0.017458 0.000000 1 -1 0 0
3 2517778 1110880 0 1 2 5.28571 8.92569 6 0.881253 6.91230 ... 1.651890 2.628070 227.988007 0.029884 0.003415 0.085911 1 1 0 0
4 2517778 1110894 0 1 3 5.27156 10.75980 4 0.926523 12.08480 ... 0.565922 2.379660 57.450001 0.035372 0.010057 0.014691 -1 1 0 0

5 rows × 24 columns


In [6]:
len(data)


Out[6]:
737093

In [7]:
event_id_column = 'event_id'
# each event is uniquely identified by its run and event numbers
event_id = data.run.apply(str) + '_' + data.event.apply(str)
# integer group index: all rows (vertices) of the same event share one value
data['group_column'] = numpy.unique(event_id, return_inverse=True)[1]

# all weights are 1, because this is MC
data['N_sig_sw'] = 1
data[event_id_column] = event_id
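
For reference, numpy.unique with return_inverse=True maps every entry to the index of its unique value, which is exactly what is needed as a per-event integer group label. A tiny illustration with made-up ids:

# toy illustration: rows carrying the same "run_event" string get the same group id
toy_ids = numpy.array(['2517778_1110824', '2517778_1110839', '2517778_1110824'])
print numpy.unique(toy_ids, return_inverse=True)[1]   # -> [0 1 0], rows 0 and 2 share an event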

In [8]:
get_events_statistics(data)


Out[8]:
{'Events': 733032, 'parts': 737093}
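
Here 'parts' is simply the number of rows (vertex candidates), while 'Events' is the number of distinct events, so some events carry more than one vertex. Assuming that is what get_events_statistics counts, the same numbers can be reproduced directly:

# cross-check of the statistics above (assumed meaning of the utility's output)
print {'Events': data[event_id_column].nunique(), 'parts': len(data)}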

In [9]:
import json
with open('../models/JPsiKMC.json', 'r') as f:
    N_B_events = json.load(f)['N_B_events']

In [10]:
N_B_events


Out[10]:
1488891.0

In [11]:
N_pass = get_events_number(data)
tagging_efficiency = 1. * N_pass / N_B_events
tagging_efficiency_delta = sqrt(N_pass) / N_B_events
print tagging_efficiency, tagging_efficiency_delta


0.492334227287 0.000575040671035
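
This is just the fraction of all signal $B$ events that have at least one vertex candidate, with a Poisson-style uncertainty (assuming get_events_number returns the 733032 distinct events counted above):

$\epsilon_{tag} = \dfrac{N_{pass}}{N_B} = \dfrac{733032}{1488891} \approx 0.4923, \qquad \Delta\epsilon_{tag} = \dfrac{\sqrt{N_{pass}}}{N_B} \approx 5.75 \times 10^{-4}$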

Define features


In [12]:
data.columns


Out[12]:
Index([u'run', u'event', u'No', u'mult', u'nnkrec', u'Bmass', u'ptB', u'vflag',
       u'ptmean', u'ipsmean', u'vcharge', u'svm', u'svp', u'M_BDphiDir',
       u'W_M_BDphiDir', u'M_svtau', u'W_M_svtau', u'M_pointtheta',
       u'W_M_pointtheta', u'docamax', u'signB', u'signVtx', u'N_sig_sw',
       u'BOosc', u'group_column', u'event_id'],
      dtype='object')

In [13]:
features = ['mult', 'nnkrec', 'ptB', 'vflag',
            'ptmean', 'ipsmean', 'vcharge', 'svm', 'svp', 'M_BDphiDir',
            'W_M_BDphiDir', 'M_svtau', 'W_M_svtau', 'M_pointtheta',
            'W_M_pointtheta', 'docamax']

Vertices sum of charges asymmetry


In [14]:
from utils import compute_sum_of_charges

In [15]:
compute_sum_of_charges(data, 'Vertices', bins=100, 
                       event_id_column=event_id_column, sign_part_column='signVtx')


Out[15]:
                            Vertices
$B^+$                       0.102729
$B^+$, with signal part    -0.897271
$B^-$                      -0.115973
$B^-$, with signal part     0.884027
ROC AUC                     0.554682
ROC AUC, with signal part   0.846214
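
A minimal sketch of roughly what this cross-check measures (an assumption about compute_sum_of_charges, which lives in utils): sum the vertex charge signs per event and ask how well that sum separates $B^+$ from $B^-$; the "with signal part" variant presumably also uses the signal candidate itself.

from sklearn.metrics import roc_auc_score

# per-event sum of vertex charge signs vs. the true B sign (sketch only)
charge_sum = data.groupby(event_id_column)['signVtx'].sum()
b_sign = data.groupby(event_id_column)['signB'].first()
print 'ROC AUC:', roc_auc_score(b_sign > 0, charge_sum)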

Define label


In [16]:
# target: vertex charge sign agrees with the signal B sign
data.loc[:, 'label'] = (data.signB.values * data.signVtx.values > 0) * 1

DT


In [17]:
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
from folding_group import FoldingGroupClassifier

In [18]:
data_sw_passed_lds = LabeledDataStorage(data, data.label.values)

In [19]:
tt_base = DecisionTrainClassifier(learning_rate=0.01, n_estimators=10000, depth=6,
                                  max_features=12, n_threads=12)
tt_folding = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11, 
                                    train_features=features, group_feature='group_column')
%time tt_folding.fit_lds(data_sw_passed_lds)
pass


CPU times: user 10min 25s, sys: 1.59 s, total: 10min 27s
Wall time: 1min 5s
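
FoldingGroupClassifier is the repository's folding wrapper; the important detail is that the two folds are split by group_column (i.e. by event), so vertices from the same event never end up in different folds and the out-of-fold predictions stay free of leakage. With a recent scikit-learn, an analogous event-level split would look like this (a sketch, not what the repo class does internally):

from sklearn.model_selection import GroupKFold

# split rows so that all vertices of one event land in the same fold
for train_idx, test_idx in GroupKFold(n_splits=2).split(data[features], data.label,
                                                        groups=data.group_column):
    pass  # fit on train_idx, predict out-of-fold on test_idx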

In [20]:
report = ClassificationReport({'dt': tt_folding}, data_sw_passed_lds)


KFold prediction using folds column

In [21]:
report.learning_curve(RocAuc(), steps=1)


KFold prediction using folds column
Out[21]:

In [22]:
report.compute_metric(RocAuc())


Out[22]:
OrderedDict([('dt', 0.56186900142958185)])

In [23]:
report.roc()


Out[23]:

In [24]:
report.feature_importance()


Out[24]:

In [25]:
# keep only the first 5000 trees of each fold's ensemble
# (effectively picking an earlier point on the learning curve)
tt_folding.estimators[0].clf.estimators = tt_folding.estimators[0].clf.estimators[:5000]
tt_folding.estimators[1].clf.estimators = tt_folding.estimators[1].clf.estimators[:5000]

Calibrating results $p(\text{vertex same sign}|B)$ and combining them
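
The numbers reported below follow the usual flavour-tagging convention: the calibrated probability is turned into a per-event mistag $\omega$, and the figure of merit is the effective tagging efficiency

$\epsilon_{eff} = \epsilon_{tag} \, D^2, \qquad D^2 = \langle (1 - 2\omega)^2 \rangle,$

which is how the $D^2$ and $\epsilon, \%$ columns of the comparison table further down are related to $\epsilon_{tag}$.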


In [26]:
models = []

In [27]:
from utils import get_result_with_bootstrap_for_given_part

In [ ]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding, 
                                                      [data], N_B_events=N_B_events,
                                                      logistic=True, name="inclusive vertex", n_calibrations=30,
                                                      sign_part_column='signVtx', part_name='vertex', 
                                                      logistic_combined=False))


KFold prediction using folds column
AUC for tagged: 0.584777819153 AUC with untag: 0.546740835261
mean AUC after calibration: 0.584610185848 6.09862198963e-09

In [33]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding, 
                                                       [data], N_B_events=N_B_events,
                                                       logistic=True, name="inclusive vertex, logistic", 
                                                       n_calibrations=30,
                                                       sign_part_column='signVtx', part_name='vertex', 
                                                       logistic_combined=True))


KFold prediction using folds column
AUC for tagged: 0.584777819153 AUC with untag: 0.546740835261
mean AUC after calibration: 0.584775556537 5.97870693052e-11

Comparison of different models


In [34]:
pandas.concat(models)


Out[34]:
   name                         $\epsilon_{tag}, \%$   $\Delta \epsilon_{tag}, \%$   $D^2$      $\Delta D^2$   $\epsilon, \%$   $\Delta \epsilon, \%$   AUC, with untag   $\Delta$ AUC, with untag
0  inclusive vertex             49.233423              0.057504                      0.028953   0.000015       1.425464         0.001824                54.674084         0
0  inclusive vertex, logistic   49.233423              0.057504                      0.028691   0.000025       1.412537         0.002052                54.674084         0
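
As a quick consistency check, the $\epsilon, \%$ column is just $\epsilon_{tag} \cdot D^2$: $49.233\% \times 0.028953 \approx 1.425\%$ for the first model and $49.233\% \times 0.028691 \approx 1.413\%$ for the logistic combination, matching the table within rounding.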

Prepare vertex data for the combination


In [35]:
from utils import prepare_B_data_for_given_part

In [36]:
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data], logistic=True, sign_part_column='signVtx',
                                               N_B_events=N_B_events, part_name='vertex')


KFold prediction using folds column
AUC for tagged: 0.584777819153 AUC with untag: 0.546740835261

In [37]:
Bdata_prepared.to_csv('../models/Bdata_vertices_MC_jets.csv', header=True, index=False)
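
The saved table is picked up later by the combination step; reading it back is a plain pandas call:

# reload the prepared per-event vertex table, e.g. in the combination notebook
Bdata_prepared = pandas.read_csv('../models/Bdata_vertices_MC_jets.csv')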