In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [ ]:
import sys
sys.path.insert(0, "../")

Imports


In [2]:
import pandas
import numpy

from sklearn.metrics import roc_curve, roc_auc_score

from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc

In [3]:
from utils import get_N_B_events, get_events_number, get_events_statistics

Reading initial data


In [4]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('../datasets/MC/csv/Bu_JPsiK/Vertices.root', selection='(vcharge > 0.2)'))

In [5]:
data.head()


Out[5]:
run event No mult nnkrec Bmass ptB vflag ptmean ipsmean ... phi1 phi2 ghost1 ghost2 pointtheta seedchi2 rdist probf v_cut BOosc
0 2517778 1110824 0 21 4 5.27489 11.70600 4 0.940587 9.29878 ... 1.363980 1.00376 0.050880 0.008245 0.142074 8.407050 2.06584 0.399833 0 0
1 2517778 1110824 1 21 4 5.27489 11.70600 3 1.172370 7.18694 ... 2.336280 -2.36953 0.003858 0.002580 0.067733 0.062761 0.45721 0.903930 1 0
2 2517778 1110839 0 47 3 5.28328 3.88046 6 0.614212 13.12260 ... 0.985463 2.26765 0.186784 0.006160 0.027074 3.467780 1.38931 0.805029 0 1
3 2517778 1110839 1 47 3 5.28328 3.88046 6 0.635548 14.23890 ... 0.985463 2.72677 0.186784 0.002705 0.151040 2.414490 1.08309 0.896430 0 1
4 2517778 1110839 2 47 3 5.28328 3.88046 5 0.666694 12.26670 ... 0.985463 1.38842 0.186784 0.098220 0.014452 0.220461 2.97054 0.921938 0 1

5 rows × 34 columns


In [6]:
event_id_column = 'event_id'
event_id = data.run.apply(str) + '_' + data.event.apply(str)
# integer index of each (run, event) pair, used to group vertices belonging to the same event
data['group_column'] = numpy.unique(event_id, return_inverse=True)[1]

# all weights are 1, because this is MC
data['N_sig_sw'] = 1
data[event_id_column] = event_id

In [7]:
get_events_statistics(data)


Out[7]:
{'Events': 602754, 'parts': 1802872}

In [8]:
import json
with open('../models/JPsiKMC.json', 'r') as f:
    N_B_events = json.load(f)['N_B_events']

In [9]:
N_B_events


Out[9]:
1488891.0

In [10]:
N_pass = get_events_number(data)
tagging_efficiency = 1. * N_pass / N_B_events
tagging_efficiency_delta = sqrt(N_pass) / N_B_events
print tagging_efficiency, tagging_efficiency_delta


0.404834202101 0.000521443365133
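
The efficiency above is simply the fraction of simulated $B$ events that keep at least one vertex after the vcharge > 0.2 selection, with a Poisson-counting uncertainty on the numerator (consistent with the printout): $\epsilon_{tag} = N_{pass} / N_B = 602754 / 1488891 \approx 0.4048$ and $\Delta \epsilon_{tag} = \sqrt{N_{pass}} / N_B \approx 5.2 \times 10^{-4}$.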

Define features


In [11]:
features =  ['mult', 'nnkrec', 'ptB', 'ipsmean', 'ptmean', 'vcharge', 
             'svm', 'svp', 'BDphiDir', 'svtau', 'docamax']
features_old =  ['mult', 'nnkrec', 'ptB', 'vflag', 'ipsmean', 'ptmean', 'vcharge', 
                'svm', 'svp', 'BDphiDir', 'svtau', 'docamax']

In [13]:
features_all = ['mult', u'nnkrec', u'ptB', 'ptmean', u'ipsmean', u'vcharge', u'svm', u'svp', u'BDphiDir',
                u'svtau', u'docamax',  u'massSeed', 'pt1', u'pt2', u'ips1', u'ips2', u'phi1', u'phi2', u'ghost1',
                u'ghost2', u'pointtheta', u'seedchi2', u'rdist', u'probf']

Vertex sum-of-charges asymmetry


In [14]:
from utils import compute_sum_of_charges

In [15]:
old_mask = data.v_cut.values == 1

In [16]:
compute_sum_of_charges(data, 'Vertices', bins=100, 
                       event_id_column=event_id_column, sign_part_column='signVtx')


Out[16]:
$B^+$ $B^+$, with signal part $B^-$ $B^-$, with signal part ROC AUC ROC AUC, with signal part name
0 0.511523 -0.488477 -0.571592 0.428408 0.586452 0.595232 Vertices

Define label


In [17]:
# target: 1 if the vertex charge sign matches the B sign, else 0
data.loc[:, 'label'] = (data.signB.values * data.signVtx.values > 0) * 1

DT (DecisionTrain) classifiers
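
The classifiers below are wrapped in FoldingGroupClassifier with group_feature='group_column', so all vertices of one event land in the same fold and the out-of-fold predictions used later cannot leak event-level information between folds. A minimal sketch of the same grouping idea, shown with scikit-learn's GroupKFold purely as an illustration (it is not the repository's folding code and needs a recent scikit-learn):

# illustration only: group-aware 2-fold splitting, so vertices of one event
# never appear in both the training fold and the prediction fold
from sklearn.model_selection import GroupKFold

splitter = GroupKFold(n_splits=2)
for train_idx, test_idx in splitter.split(data[features], data['label'], groups=data['group_column']):
    pass  # train on train_idx, predict out-of-fold on test_idx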


In [18]:
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
from folding_group import FoldingGroupClassifier

In [19]:
data_sw_passed_lds = LabeledDataStorage(data, data.label.values)

In [21]:
tt_base = DecisionTrainClassifier(learning_rate=0.01, n_estimators=10000, depth=6,
                                  max_features=8, n_threads=12)
tt_folding = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11, 
                                    train_features=features, group_feature='group_column')
%time tt_folding.fit_lds(data_sw_passed_lds)
pass


CPU times: user 21min 32s, sys: 3.22 s, total: 21min 35s
Wall time: 2min 36s

In [22]:
tt_folding_all = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11, 
                                        train_features=features_all, group_feature='group_column')
%time tt_folding_all.fit_lds(data_sw_passed_lds)
pass


CPU times: user 20min 26s, sys: 3.61 s, total: 20min 30s
Wall time: 2min 32s

In [23]:
data_sw_passed_v_cut_lds = LabeledDataStorage(data.ix[old_mask, :], data.loc[old_mask, 'label'].values)
tt_folding_v_cut = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11, 
                                          train_features=features_old, group_feature='group_column')
%time tt_folding_v_cut.fit_lds(data_sw_passed_v_cut_lds)
pass


CPU times: user 2min 52s, sys: 136 ms, total: 2min 52s
Wall time: 14.9 s

In [24]:
# keep every vertex from events that contain at least one vertex passing the old v_cut selection
x = data.loc[old_mask, 'event_id'].values
data.index = data.event_id
data_sel = data.ix[x, :]

In [25]:
data_sw_passed_sp_lds = LabeledDataStorage(data_sel, data_sel['label'].values)
tt_folding_sp = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11, 
                                       train_features=features, group_feature='group_column')
%time tt_folding_sp.fit_lds(data_sw_passed_sp_lds)
pass


CPU times: user 9min 27s, sys: 1.15 s, total: 9min 28s
Wall time: 59 s

In [26]:
report = ClassificationReport({'dt': tt_folding, 'dt with all features': tt_folding_all}, data_sw_passed_lds)


KFold prediction using folds column
KFold prediction using folds column

In [27]:
report.learning_curve(RocAuc(), steps=1)


KFold prediction using folds column
KFold prediction using folds column
Out[27]:
[learning curves: ROC AUC vs number of trees for 'dt' and 'dt with all features']

In [28]:
report.compute_metric(RocAuc())


Out[28]:
OrderedDict([('dt', 0.57453176203912371),
             ('dt with all features', 0.57871140021946177)])

In [29]:
report.roc()


Out[29]:
[ROC curves for 'dt' and 'dt with all features']

In [30]:
report.feature_importance()


Out[30]:
[feature importance plots for 'dt' and 'dt with all features']

In [31]:
# keep the first 4000 trees in each fold, roughly where the learning curve above flattens
tt_folding.estimators[0].clf.estimators = tt_folding.estimators[0].clf.estimators[:4000]
tt_folding.estimators[1].clf.estimators = tt_folding.estimators[1].clf.estimators[:4000]

In [32]:
tt_folding_all.estimators[0].clf.estimators = tt_folding_all.estimators[0].clf.estimators[:6000]
tt_folding_all.estimators[1].clf.estimators = tt_folding_all.estimators[1].clf.estimators[:6000]

In [33]:
report = ClassificationReport({'dt': tt_folding, 'dt with all features': tt_folding_all}, data_sw_passed_lds)


KFold prediction using folds column
KFold prediction using folds column

In [34]:
report.compute_metric(RocAuc())


Out[34]:
OrderedDict([('dt', 0.57708235845060607),
             ('dt with all features', 0.57958965278153052)])

In [35]:
report_v_cut = ClassificationReport({'tt': tt_folding_v_cut}, data_sw_passed_v_cut_lds)


KFold prediction using folds column

In [36]:
report_v_cut.learning_curve(RocAuc(), steps=1)


KFold prediction using folds column
Out[36]:
[learning curve: ROC AUC vs number of trees for 'tt' (v_cut selection)]

In [38]:
report_v_cut.compute_metric(RocAuc())


Out[38]:
OrderedDict([('tt', 0.58964954257530666)])

In [39]:
tt_folding_v_cut.estimators[0].clf.estimators = tt_folding_v_cut.estimators[0].clf.estimators[:7000]
tt_folding_v_cut.estimators[1].clf.estimators = tt_folding_v_cut.estimators[1].clf.estimators[:7000]

In [40]:
report_sp = ClassificationReport({'tt': tt_folding_sp}, data_sw_passed_sp_lds)
report_sp.learning_curve(RocAuc(), steps=1)


KFold prediction using folds column
KFold prediction using folds column
Out[40]:
[learning curve: ROC AUC vs number of trees for 'tt' (events passing the v_cut selection)]

In [41]:
report_sp.compute_metric(RocAuc())


Out[41]:
OrderedDict([('tt', 0.56937902351359659)])

In [42]:
tt_folding_sp.estimators[0].clf.estimators = tt_folding_sp.estimators[0].clf.estimators[:3000]
tt_folding_sp.estimators[1].clf.estimators = tt_folding_sp.estimators[1].clf.estimators[:3000]

Calibrating results $p(\text{vertex same sign} | B)$ and combining them
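
The per-vertex probabilities are turned into one per-$B$ probability inside utils (get_result_with_bootstrap_for_given_part, which relies on compute_B_prob_using_part_prob). Below is a minimal sketch of the presumed combination rule, assuming signed vertex logits are summed within an event; this is an assumption about the helper, suggested by the odds Bprob / (1 - Bprob) exported further down and by the signed-logit mean used in the last section.

# sketch only (assumption about utils.compute_B_prob_using_part_prob):
# each vertex votes with logit(p_vertex), flipped by the vertex charge sign;
# the votes of one event are summed and mapped back to a probability
import numpy
from scipy.special import logit, expit

def combine_vertex_probs(group_ids, p_vertex, sign_vtx):
    signed_logits = logit(p_vertex) * sign_vtx
    return expit(numpy.bincount(group_ids, weights=signed_logits))  # one P(B is B+) per event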


In [43]:
N_pass_v_cut = get_events_number(data.ix[old_mask, :])
tagging_efficiency_v_cut = 1. * N_pass_v_cut / N_B_events
tagging_efficiency_delta_v_cut = sqrt(N_pass_v_cut) / N_B_events
print tagging_efficiency_v_cut, tagging_efficiency_delta_v_cut


0.122650348481 0.000287013907194

In [44]:
models = []

In [45]:
from utils import get_result_with_bootstrap_for_given_part

In [46]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency_v_cut, tagging_efficiency_delta_v_cut, 
                                                       tt_folding_v_cut, 
                                                      [data.ix[old_mask, :]], N_B_events=N_B_events,
                                                      logistic=True, name="old tagging", n_calibrations=30,
                                                      sign_part_column='signVtx', part_name='vertex',
                                                      logistic_combined=True))


KFold prediction using folds column
AUC for tagged: 0.680693059598 AUC with untag: 0.532496481933
mean AUC after calibration: 0.680680269261 2.50021693792e-10

In [48]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency_v_cut, tagging_efficiency_delta_v_cut, 
                                                       tt_folding_sp, 
                                                      [data_sel], N_B_events=N_B_events,
                                                      logistic=True, name="inclusive vertex with selection",
                                                       n_calibrations=30,
                                                      sign_part_column='signVtx', part_name='vertex',
                                                      logistic_combined=False))


KFold prediction using folds column
AUC for tagged: 0.660196489267 AUC with untag: 0.532414448103
mean AUC after calibration: 0.660221103363 1.37338872024e-08

In [49]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding, 
                                                      [data], N_B_events=N_B_events,
                                                      logistic=True, name="inclusive vertex", n_calibrations=30,
                                                      sign_part_column='signVtx', part_name='vertex', 
                                                      logistic_combined=False))


KFold prediction using folds column
AUC for tagged: 0.606196703067 AUC with untag: 0.5534348621
mean AUC after calibration: 0.606090879124 1.59221067884e-09

In [50]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding_all, 
                                                      [data], N_B_events=N_B_events,
                                                      logistic=True, name="inclusive vertex, all features",
                                                       n_calibrations=30,
                                                      sign_part_column='signVtx', part_name='vertex', 
                                                       logistic_combined=False))


KFold prediction using folds column
AUC for tagged: 0.60746652492 AUC with untag: 0.553594994119
mean AUC after calibration: 0.607332146606 1.33999925542e-09

Comparison of different models
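
In the summary table, $\epsilon_{tag}$ is the tagging efficiency computed above, $D^2$ is the mean squared dilution obtained from the calibrated probabilities, and $\epsilon$ is the tagging power; the quoted values are consistent with $\epsilon = \epsilon_{tag} \cdot D^2$ and $\Delta \epsilon = \sqrt{(\Delta\epsilon_{tag} \cdot D^2)^2 + (\epsilon_{tag} \cdot \Delta D^2)^2}$.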


In [54]:
pandas.concat(models)


Out[54]:
name $\epsilon_{tag}, \%$ $\Delta \epsilon_{tag}, \%$ $D^2$ $\Delta D^2$ $\epsilon, \%$ $\Delta \epsilon, \%$ AUC, with untag $\Delta$ AUC, with untag
0 old tagging 12.265035 0.028701 0.099689 0.000457 1.222695 0.006297 53.249648 0
0 inclusive vertex with selection 12.265035 0.028701 0.084445 0.000039 1.035725 0.002471 53.241445 0
0 inclusive vertex 40.483420 0.052144 0.035009 0.000018 1.417284 0.001962 55.343486 0
0 inclusive vertex, all features 40.483420 0.052144 0.036065 0.000015 1.460018 0.001971 55.359499 0

Experiments with grouping vertices within an event


In [55]:
from utils import group_max
from scipy.special import logit, expit
from utils import compute_B_prob_using_part_prob
from utils import calculate_auc_with_and_without_untag_events
from utils import compute_mistag
from utils import calibrate_probs, bootstrap_calibrate_prob, result_table

percentile_bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]

In [56]:
def estimate_events(Bsign, Bweight, Bprob, Bevent, tagging_efficiency, tagging_efficiency_delta, name):
    auc, auc_full = calculate_auc_with_and_without_untag_events(Bsign, Bprob, Bweight, N_B_events=N_B_events)
    print auc, auc_full
    D2, aucs = bootstrap_calibrate_prob(Bsign, Bweight, Bprob, 
                                        n_calibrations=30, symmetrize=True, logistic=False)
#     Bprob_calibrated, (iso_reg1, iso_reg2) = calibrate_probs(Bsign, Bweight, Bprob,
#                                                              symmetrize=True, return_calibrator=True, logistic=True)
    return result_table(tagging_efficiency, tagging_efficiency_delta, D2, auc_full, name)
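
bootstrap_calibrate_prob supplies the $D^2$ used in the tables; below is a rough sketch of how such an estimate can be formed from calibrated per-event probabilities (an assumption about the helper's internals, which additionally bootstraps and symmetrizes the calibration).

# sketch only (assumption about utils.bootstrap_calibrate_prob): after calibration
# the per-event mistag probability omega gives the dilution D = 1 - 2*omega,
# and D^2 is its weighted mean square over tagged events
def d2_sketch(Bprob, Bweight):
    omega = numpy.minimum(Bprob, 1 - Bprob)
    return numpy.average((1 - 2 * omega) ** 2, weights=Bweight)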

Max over vertices in an event
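
utils.group_max is used below to pick, per event, the most extreme vertex prediction. A minimal sketch of its presumed behaviour (an assumption: for each row it returns the maximum of the values within that row's group, aligned with the original rows, which is what the comparison against data.p_vertex below requires):

# sketch only (assumption about utils.group_max): per-row broadcast of the group maximum
import numpy

def group_max_sketch(group_ids, values):
    per_group = numpy.full(group_ids.max() + 1, -numpy.inf)
    numpy.maximum.at(per_group, group_ids, values)  # maximum of `values` within each group
    return per_group[group_ids]                     # aligned with the input rows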


In [57]:
# out-of-fold predictions; the tiny jitter breaks ties so each event has a unique extremum
data['p_vertex'] = tt_folding_all.predict_proba(data)[:, 1] + 1e-8*numpy.random.random(len(data))
max_events = group_max(data.group_column.values, data['p_vertex'].values)
min_events = -group_max(data.group_column.values, -data['p_vertex'].values)
# per event, keep the prediction farthest from 0.5 (the most confident vertex)
p_events = numpy.where(abs(max_events - 0.5) > abs(min_events - 0.5), max_events, min_events)


KFold prediction using folds column

In [58]:
# keep the single row per event whose prediction equals that event's chosen extremum
data_events = data.ix[abs(p_events - data.p_vertex.values) < 1e-14, :]
data_probs = p_events[abs(p_events - data.p_vertex.values) < 1e-14]

In [59]:
len(data_probs), len(numpy.unique(data.group_column))


Out[59]:
(602754, 602754)

In [60]:
Bsign, Bweight, Bprob, Bevent = compute_B_prob_using_part_prob(data_events, data_probs, sign_part_column='signVtx')

In [61]:
t = estimate_events(Bsign, Bweight, Bprob, Bevent, tagging_efficiency, 
                    tagging_efficiency_delta, 'inclusive vertex, max val, all features')


0.612419732062 0.554796084468

In [63]:
t


Out[63]:
name $\epsilon_{tag}, \%$ $\Delta \epsilon_{tag}, \%$ $D^2$ $\Delta D^2$ $\epsilon, \%$ $\Delta \epsilon, \%$ AUC, with untag $\Delta$ AUC, with untag
0 inclusive vertex, max val, all features 40.48342 0.052144 0.043157 0.000016 1.747161 0.002341 55.479608 0

In [64]:
x_pd_max = pandas.DataFrame({'event_id': data_events.event_id,
                  'Bweight': numpy.ones(len(data_events)),
                  'Bsign': data_events.signB, 
                  'vertex_relation_prob': Bprob / (1 - Bprob)})

In [65]:
x_pd_max.to_csv('../models/Bdata_vertex_MC_max.csv', header=True, index=False)

Mean of logits


In [66]:
# per event: average the vertex logits, each weighted by the vertex sign
# (opposite-sign vertices vote in the opposite direction), then map back to a probability
p_mean = expit(numpy.bincount(data.group_column, weights=logit(data.p_vertex.values) * data.signVtx) /
               numpy.bincount(data.group_column))
# after the sign flip every combined prediction already refers to a positive "vertex" sign
p_svertex = numpy.ones(len(p_mean))

In [67]:
Bsign, Bweight, Bprob, Bevent = compute_B_prob_using_part_prob(pandas.DataFrame({
                                                                'signVtx': p_svertex,
                                                               'event_id': numpy.arange(len(p_svertex)),
                                                               'N_sig_sw': numpy.ones(len(p_svertex)),
                                                                'signB': numpy.bincount(data.group_column, weights=data.signB) / numpy.bincount(data.group_column)}), 
                                                                 p_mean, sign_part_column='signVtx')

In [68]:
t_mean = estimate_events(Bsign, Bweight, Bprob, Bevent, tagging_efficiency, 
                         tagging_efficiency_delta, 'inclusive vertex, mean, all features')


0.611382346187 0.554328667627

In [69]:
# every row of a group carries the same run/event number, so the integer per-group mean recovers it
run_all = map(int, numpy.bincount(data.group_column, weights=data.run) // numpy.bincount(data.group_column))
event_all = map(int, numpy.bincount(data.group_column, weights=data.event) // numpy.bincount(data.group_column))
x = pandas.DataFrame({'run': run_all, 'event': event_all})

In [70]:
x_pd = pandas.DataFrame({'event_id': x.run.apply(str) + '_' + x.event.apply(str),
                  'Bweight': numpy.ones(len(p_svertex)),
                  'Bsign': numpy.bincount(data.group_column, weights=data.signB) / numpy.bincount(data.group_column), 
                  'vertex_relation_prob': p_mean / (1 - p_mean)})

In [71]:
x_pd.to_csv('../models/Bdata_vertex_MC_sqrt.csv', header=True, index=False)

In [72]:
pandas.concat(models + [t, t_mean])


Out[72]:
name $\epsilon_{tag}, \%$ $\Delta \epsilon_{tag}, \%$ $D^2$ $\Delta D^2$ $\epsilon, \%$ $\Delta \epsilon, \%$ AUC, with untag $\Delta$ AUC, with untag
0 old tagging 12.265035 0.028701 0.099689 0.000457 1.222695 0.006297 53.249648 0
0 inclusive vertex with selection 12.265035 0.028701 0.084445 0.000039 1.035725 0.002471 53.241445 0
0 inclusive vertex 40.483420 0.052144 0.035009 0.000018 1.417284 0.001962 55.343486 0
0 inclusive vertex, all features 40.483420 0.052144 0.036065 0.000015 1.460018 0.001971 55.359499 0
0 inclusive vertex, max val, all features 40.483420 0.052144 0.043157 0.000016 1.747161 0.002341 55.479608 0
0 inclusive vertex, mean, all features 40.483420 0.052144 0.042633 0.000011 1.725921 0.002265 55.432867 0