In this notebook, we try simple classifiers

The aim is to distinguish prompt leptons (those coming from a W boson) from all others.

The data consists of decays in events with $t \bar{t}$ production.


In [1]:
import root_numpy
import hep_ml
import pandas
from sklearn.metrics import roc_curve
from hep_ml.commonutils import train_test_split, print_header
from hep_ml.reports import plot_roc
from hep_ml.rootutilities import list_flat_branches

In [2]:
# 13 TeV files: one ROOT file with prompt leptons, one with fakes
folder = '/mnt/w76/notebook/datasets/fromLesya/ver4/'
prompt_filename = '{0}tt_prompts13.root'.format(folder)
fakes_filename = '{0}tt_fakes13.root'.format(folder)

# Name of the tree that is read from both files
treename = 'FakeElectrons/fakeTree'

Correspondence in data:


In [3]:
# Lepton flavour codes (compared against the `_flavors` branch further down)
flavours = dict(enumerate(['electrons', 'muons']))
# Codes of the non-prompt origins; code 0 is not listed and stands for prompt
origins = dict(zip((1, 2, 3), ('b', 'c', 'uds')))

Listing all available branches


In [4]:
# Collect the branch names of the tree in the prompt file and display them alphabetically
all_columns = root_numpy.list_branches(prompt_filename, treename=treename)
sorted(all_columns)


Out[4]:
['HT',
 '_3dIP',
 '_3dIPerr',
 '_3dIPsig',
 '_PVchi2',
 '_PVerr',
 '_bTagged',
 '_charges',
 '_closeJetAngAll',
 '_closeJetCSVAll',
 '_closeJetEtaAll',
 '_closeJetEtaAllMC',
 '_closeJetPhiAll',
 '_closeJetPhiAllMC',
 '_closeJetPtAll',
 '_closeJetPtAllMC',
 '_closeJetPtAllstatus',
 '_csExercise',
 '_csJetIndex',
 '_csv',
 '_eventNb',
 '_flavors',
 '_hardIeta',
 '_hardIpdg',
 '_hardIphi',
 '_hardIpt',
 '_ipPV',
 '_ipPVerr',
 '_ipPVmc',
 '_ipZPV',
 '_ipZPVerr',
 '_isloose',
 '_isolation',
 '_isolationComponents',
 '_isolationMC',
 '_istight',
 '_jetEta',
 '_jetPhi',
 '_jetPt',
 '_lE',
 '_lEta',
 '_lPhi',
 '_lPt',
 '_lepNum',
 '_lepeta',
 '_lepiso',
 '_leppdg',
 '_lepphi',
 '_leppt',
 '_lepvx',
 '_lepvy',
 '_lepvz',
 '_lumiBlock',
 '_met',
 '_met_phi',
 '_mometa',
 '_mompdg',
 '_momphi',
 '_mompt',
 '_mt',
 '_nHard',
 '_n_Jets',
 '_n_PV',
 '_n_bJets',
 '_origin',
 '_originReduced',
 '_partonIdMatched',
 '_ptRelAll',
 '_runNb',
 '_sameParton',
 'hJet_JECUnc',
 'hJet_SoftLeptId95',
 'hJet_SoftLeptIdlooseMu',
 'hJet_SoftLeptPt',
 'hJet_SoftLeptdR',
 'hJet_SoftLeptptRel',
 'hJet_cef',
 'hJet_e',
 'hJet_eta',
 'hJet_genPt',
 'hJet_nconstituents',
 'hJet_phi',
 'hJet_pt',
 'hJet_ptLeadTrack',
 'hJet_ptRaw',
 'hJet_vtx3dL',
 'hJet_vtx3deL',
 'hJet_vtxMass',
 'hJet_vtxPt']

Train variables


In [5]:
# Features passed to the classifier. Indexed names address single elements of array
# branches; the last entry is an expression (a ratio of two branches).
# NOTE(review): the constant-column check below reports zero spread for hJet_vtx3dL,
# hJet_vtx3deL, hJet_vtxMass and hJet_vtxPt in the first 100000 entries of both files —
# consider whether they should stay in this list.
train_variables = (
    ['_3dIP', '_3dIPerr', '_3dIPsig', '_PVchi2']
    + list('_PVerr[%d]' % index for index in range(3))
    + ['_closeJetAngAll', '_closeJetCSVAll', '_closeJetEtaAll', '_closeJetPhiAll']
    + ['_flavors']
    + ['_ipPV', '_ipPVerr', '_ipZPV', '_ipZPVerr']
    + ['_isolation']
    + list('_isolationComponents[%d]' % index for index in range(4))
    + ['_ptRelAll']
    + ['hJet_cef', 'hJet_nconstituents', 'hJet_vtx3dL', 'hJet_vtx3deL', 'hJet_vtxMass', 'hJet_vtxPt']
    + ['_lPt / _closeJetPtAll']
)

Columns to read from file


In [6]:
# Flattened branch names of the prompt tree (array branches show up as name[i] in the output below)
flat_columns = list_flat_branches(prompt_filename, treename=treename, use_dtype=False)
# This column is added for some strange reason
flat_columns.remove('_PVerr[3]')
# Everything that is available plus the training features (which include a derived expression)
read_columns = set(flat_columns) | set(train_variables)
def isneeded(column):
    """Tell whether a column should be read.

    :param column: branch name (or expression) as a string
    :return: False if the name contains '_lep', '_jet' or '_bTagged', True otherwise
    """
    excluded_markers = ('_lep', '_jet', '_bTagged')
    return not any(marker in column for marker in excluded_markers)
# Keep only the columns accepted by isneeded(), in alphabetical order
read_columns = sorted(filter(isneeded, read_columns))

read_columns


Out[6]:
['HT',
 '_3dIP',
 '_3dIPerr',
 '_3dIPsig',
 '_PVchi2',
 '_PVerr[0]',
 '_PVerr[1]',
 '_PVerr[2]',
 '_charges',
 '_closeJetAngAll',
 '_closeJetCSVAll',
 '_closeJetEtaAll',
 '_closeJetEtaAllMC',
 '_closeJetPhiAll',
 '_closeJetPhiAllMC',
 '_closeJetPtAll',
 '_closeJetPtAllMC',
 '_closeJetPtAllstatus',
 '_csExercise',
 '_csJetIndex',
 '_csv[0]',
 '_csv[1]',
 '_csv[2]',
 '_csv[3]',
 '_eventNb',
 '_flavors',
 '_hardIeta[0]',
 '_hardIeta[1]',
 '_hardIeta[2]',
 '_hardIeta[3]',
 '_hardIpdg[0]',
 '_hardIpdg[1]',
 '_hardIpdg[2]',
 '_hardIpdg[3]',
 '_hardIpdg[4]',
 '_hardIpdg[5]',
 '_hardIpdg[6]',
 '_hardIpdg[7]',
 '_hardIphi[0]',
 '_hardIphi[1]',
 '_hardIphi[2]',
 '_hardIphi[3]',
 '_hardIpt[0]',
 '_hardIpt[1]',
 '_hardIpt[2]',
 '_hardIpt[3]',
 '_ipPV',
 '_ipPVerr',
 '_ipPVmc',
 '_ipZPV',
 '_ipZPVerr',
 '_isloose',
 '_isolation',
 '_isolationComponents[0]',
 '_isolationComponents[1]',
 '_isolationComponents[2]',
 '_isolationComponents[3]',
 '_isolationMC[0]',
 '_isolationMC[1]',
 '_isolationMC[2]',
 '_isolationMC[3]',
 '_istight',
 '_lE',
 '_lEta',
 '_lPhi',
 '_lPt',
 '_lPt / _closeJetPtAll',
 '_lumiBlock',
 '_met',
 '_met_phi',
 '_mometa',
 '_mompdg',
 '_momphi',
 '_mompt',
 '_mt',
 '_nHard',
 '_n_Jets',
 '_n_PV',
 '_n_bJets',
 '_origin',
 '_originReduced',
 '_partonIdMatched',
 '_ptRelAll',
 '_runNb',
 '_sameParton',
 'hJet_JECUnc',
 'hJet_SoftLeptId95',
 'hJet_SoftLeptIdlooseMu',
 'hJet_SoftLeptPt',
 'hJet_SoftLeptdR',
 'hJet_SoftLeptptRel',
 'hJet_cef',
 'hJet_e',
 'hJet_eta',
 'hJet_genPt',
 'hJet_nconstituents',
 'hJet_phi',
 'hJet_pt',
 'hJet_ptLeadTrack',
 'hJet_ptRaw',
 'hJet_vtx3dL',
 'hJet_vtx3deL',
 'hJet_vtxMass',
 'hJet_vtxPt']

Looking at first 10 events


In [7]:
# Read only the first 10 entries of the prompt tree, for every flattened branch
prompt_test = pandas.DataFrame(
    root_numpy.root2array(prompt_filename, treename=treename, branches=sorted(flat_columns), stop=10)
)

# Raise the row limit so no branch is hidden, then show one row per branch
pandas.set_option('display.max_rows', 300)
prompt_test.T


Out[7]:
0 1 2 3 4 5 6 7 8 9
HT 562.0396 306.4465 145.9571 107.8649 206.8899 286.7352 233.8134 298.3255 100.4844 254.8213
_3dIP 0.002351888 0.004028209 0.001867373 0.005099327 0.001823551 0.0007556464 0.004196341 0.0009252675 0.001058759 0.00547249
_3dIPerr 0.001994789 0.002552684 0.002225567 0.004094405 0.002420091 0.001230903 0.004017449 0.002506279 0.002716487 0.004151106
_3dIPsig 1.179016 1.578028 0.839055 1.245438 0.7535053 0.6138963 1.044529 0.3691797 0.3897531 1.318321
_PVchi2 113.347 138.5885 96.84621 51.25286 109.003 107.0158 215.406 72.84936 91.00213 165.4838
_PVerr[0] 0.0006842549 0.0008083756 0.001180186 0.002186815 0.0007408096 0.0009497487 0.0007590571 0.0007345962 0.001199858 0.0006112818
_PVerr[1] 0.0006137387 0.0007616021 0.0009823861 0.001022436 0.001073237 0.000725358 0.0007129138 0.0008790125 0.001704708 0.0007119837
_PVerr[2] 0.001108435 0.001105495 0.001291811 0.001613691 0.001344715 0.001443964 0.001055947 0.001182231 0.002135418 0.0009234237
_bTagged[0] 0 1 0 0 0 1 1 0 1 0
_bTagged[10] 115 115 115 115 115 115 115 115 115 115
_bTagged[11] 111 111 111 111 111 111 111 111 111 111
_bTagged[12] 77 77 77 77 77 77 77 77 77 77
_bTagged[13] 117 117 117 117 117 117 117 117 117 117
_bTagged[14] 49 49 49 49 49 49 49 49 49 49
_bTagged[15] 55 55 55 55 55 55 55 55 55 55
_bTagged[16] 50 50 50 50 50 50 50 50 50 50
_bTagged[17] 112 112 112 112 112 112 112 112 112 112
_bTagged[18] 49 49 49 49 49 49 49 49 49 49
_bTagged[19] 74 74 74 74 74 74 74 74 74 74
_bTagged[1] 0 0 0 1 0 0 0 0 0 0
_bTagged[2] 0 1 0 0 1 0 0 1 1 0
_bTagged[3] 0 0 0 0 0 0 1 0 0 0
_bTagged[4] 0 0 0 0 0 0 0 0 0 0
_bTagged[5] 0 0 0 0 0 0 0 0 0 0
_bTagged[6] 104 0 0 0 0 0 0 0 0 0
_bTagged[7] 108 108 108 108 108 108 108 108 108 108
_bTagged[8] 116 116 116 116 116 116 116 116 116 116
_bTagged[9] 73 73 73 73 73 73 73 73 73 73
_charges 1 -1 -1 -1 1 1 1 1 1 -1
_closeJetAngAll 0.003483663 0.05333991 0.02775094 0.01354293 0.02688196 0.005994058 0.009054836 0.01477894 0.009432347 0.01733169
_closeJetCSVAll 0.2811276 0.2466501 0.2008782 0.212352 0.1256922 0.1340311 -1 0.130646 0.1635179 0.3890781
_closeJetEtaAll 2.187937 0.8559079 -0.4669839 -0.1416299 0.2239791 -0.968832 1.593516 -0.3211527 0.9606606 0.2329774
_closeJetEtaAllMC 1.526395 1.534666 -1.84632 0.5290717 1.129588 -1.012051 1.684479 -0.412269 2.393734 -0.363892
_closeJetPhiAll 3.139925 2.27073 0.3784338 2.682592 1.425687 0.07440661 -2.41098 2.483732 -0.9537438 2.790544
_closeJetPhiAllMC 3.073489 1.656862 -0.7170292 -2.404198 0.8666086 0.343329 -1.633331 1.286973 -0.8220682 3.117129
_closeJetPtAll 123.0686 30.59422 14.14725 21.54022 41.20134 86.4478 19.22381 73.09498 46.77441 15.85786
_closeJetPtAllMC 13.77892 12.58294 5.780418 0.130225 2.502018 5.861141 6.678652 47.60433 8.751426 5.269415
_closeJetPtAllstatus 3 3 3 3 3 3 3 3 3 3
_csExercise True True True True True True True True True True
_csJetIndex 1953261608 1953261608 1953261608 1953261608 1953261608 1953261608 1953261608 1953261608 1953261608 1953261608
_csv[0] 0.4625518 0.8021324 0.2093523 0.5934708 0.09917422 0.9999186 0.999685 0.06859656 0.9769515 0.3978269
_csv[1] 0.06191248 0.13654 0.02745649 0.9280761 0.1467818 0.1340311 0.1011321 0.130646 0.1635179 0.1945986
_csv[2] 0.2811276 0.9735951 0.09210946 0.1232377 0.9697257 0.07310806 0.1056457 0.9638844 0.9638844 0.08972702
_csv[3] 0.07627651 0.2466501 0.3452906 0.639365 0.1256922 0.09619172 0.9970408 0.4624425 0.4624425 0.06386518
_eventNb 29175633 29175649 29175658 29175664 29175666 29175671 29175687 29175688 29175693 29175732
_flavors 1 0 1 0 0 1 0 1 1 0
_hardIeta[0] 9.300873e+165 9.300873e+165 9.300873e+165 9.300873e+165 9.300873e+165 9.300873e+165 9.300873e+165 9.300873e+165 9.300873e+165 9.300873e+165
_hardIeta[1] 2.611307e-57 2.611307e-57 2.611307e-57 2.611307e-57 2.611307e-57 2.611307e-57 2.611307e-57 2.611307e-57 2.611307e-57 2.611307e-57
_hardIeta[2] 1.030124e-71 1.030124e-71 1.030124e-71 1.030124e-71 1.030124e-71 1.030124e-71 1.030124e-71 1.030124e-71 1.030124e-71 1.030124e-71
_hardIeta[3] 8.57608e+179 8.57608e+179 8.57608e+179 8.57608e+179 8.57608e+179 8.57608e+179 8.57608e+179 8.57608e+179 8.57608e+179 8.57608e+179
_hardIpdg[0] 1.717711e+09 1.717711e+09 1.717711e+09 1.717711e+09 1.717711e+09 1.717711e+09 1.717711e+09 1.717711e+09 1.717711e+09 1.717711e+09
_hardIpdg[1] 8.28794e+08 8.28794e+08 8.28794e+08 8.28794e+08 8.28794e+08 8.28794e+08 8.28794e+08 8.28794e+08 8.28794e+08 8.28794e+08
_hardIpdg[2] 9.125364e+08 9.125364e+08 9.125364e+08 9.125364e+08 9.125364e+08 9.125364e+08 9.125364e+08 9.125364e+08 9.125364e+08 9.125364e+08
_hardIpdg[3] 9.432065e+08 9.432065e+08 9.432065e+08 9.432065e+08 9.432065e+08 9.432065e+08 9.432065e+08 9.432065e+08 9.432065e+08 9.432065e+08
_hardIpdg[4] 8.928881e+08 8.928881e+08 8.928881e+08 8.928881e+08 8.928881e+08 8.928881e+08 8.928881e+08 8.928881e+08 8.928881e+08 8.928881e+08
_hardIpdg[5] 8.956939e+08 8.956939e+08 8.956939e+08 8.956939e+08 8.956939e+08 8.956939e+08 8.956939e+08 8.956939e+08 8.956939e+08 8.956939e+08
_hardIpdg[6] 9.094027e+08 9.094027e+08 9.094027e+08 9.094027e+08 9.094027e+08 9.094027e+08 9.094027e+08 9.094027e+08 9.094027e+08 9.094027e+08
_hardIpdg[7] 1.634745e+09 1.634745e+09 1.634745e+09 1.634745e+09 1.634745e+09 1.634745e+09 1.634745e+09 1.634745e+09 1.634745e+09 1.634745e+09
_hardIphi[0] 4.084873e+286 4.084873e+286 4.084873e+286 4.084873e+286 4.084873e+286 4.084873e+286 4.084873e+286 4.084873e+286 4.084873e+286 4.084873e+286
_hardIphi[1] 3.567152e-57 3.567152e-57 3.567152e-57 3.567152e-57 3.567152e-57 3.567152e-57 3.567152e-57 3.567152e-57 3.567152e-57 3.567152e-57
_hardIphi[2] 2.040295e+160 2.040295e+160 2.040295e+160 2.040295e+160 2.040295e+160 2.040295e+160 2.040295e+160 2.040295e+160 2.040295e+160 2.040295e+160
_hardIphi[3] 1.427376e-71 1.427376e-71 1.427376e-71 1.427376e-71 1.427376e-71 1.427376e-71 1.427376e-71 1.427376e-71 1.427376e-71 1.427376e-71
_hardIpt[0] 2.099971e-100 2.099971e-100 2.099971e-100 2.099971e-100 2.099971e-100 2.099971e-100 2.099971e-100 2.099971e-100 2.099971e-100 2.099971e-100
_hardIpt[1] 2.682876e+25 2.682876e+25 2.682876e+25 2.682876e+25 2.682876e+25 2.682876e+25 2.682876e+25 2.682876e+25 2.682876e+25 2.682876e+25
_hardIpt[2] 1.75887e+30 1.75887e+30 1.75887e+30 1.75887e+30 1.75887e+30 1.75887e+30 1.75887e+30 1.75887e+30 1.75887e+30 1.75887e+30
_hardIpt[3] 1.75918e+30 1.75918e+30 1.75918e+30 1.75918e+30 1.75918e+30 1.75918e+30 1.75918e+30 1.75918e+30 1.75918e+30 1.75918e+30
_ipPV 0.002323557 0.003343177 0.001424958 0.004244771 0.001398955 0.0007381775 0.003920058 1.133983e-05 0.0002618763 0.0009812138
_ipPVerr 0.001966262 0.001966262 0.001747079 0.001747079 0.001358323 0.0008724643 0.0008724643 0.001709919 0.001344182 0.003305437
_ipPVmc 0.001446555 0.002681651 0.002766517 0.004045943 0.002531716 0.0007227922 0.004433197 0.0003564575 4.089015e-05 0.001527832
_ipZPV 0.001637558 0.001637558 -0.001354329 -0.001354329 0.1078136 -0.0002432884 -0.0002432884 0.0009749132 0.001547645 -0.003307462
_ipZPVerr 0.004416595 0.004416595 0.002291177 0.002291177 0.004130829 0.002475497 0.002475497 0.002367276 0.00356501 0.01408471
_isloose True True True True True True True True True True
_isolation 0.009098196 0.2969082 0.04716077 0.05221348 0.1250686 0.05125329 0.1458393 0.04716658 0 0.2285026
_isolationComponents[0] 0 0 0 0.01393059 0.009009957 0.01808796 0 0.003635034 0 0
_isolationComponents[1] 0 0.06642095 0 0 0 0.007542181 0 0.01743187 0 0.1130554
_isolationComponents[2] 0.01532364 0.3127138 0.09646295 0.1337906 0.1861215 0.04158479 0.2158311 0.04574546 0.0339281 0.233999
_isolationComponents[3] 0.006225447 0.08222654 0.04930218 0.09550775 0.07006283 0.01596164 0.06999175 0.01964578 0.1156423 0.1185518
_isolationMC[0] 0.7917012 1.204702 12.27248 2.257037 0.7285869 1.430862 1.541843 0.09026315 1.712841 2.599526
_isolationMC[1] 0 0 0 0 0 0 0 0 0 0
_isolationMC[2] -0.01387228 -0.01221676 -0.003285554 0.02618557 -0.004545696 -0.03897074 0.197272 0.01242995 0.0009971152 -0.03772252
_isolationMC[3] -0.01387228 -0.01221676 -0.003285554 0.02618557 -0.004545696 -0.03897074 0.197272 0.01242995 0.0009971152 -0.03772252
_istight True False True True True True True True True False
_jetEta[0] 0.2936637 0.846476 0.6477343 0.5488711 -0.3492329 -1.851225 -0.8839593 -0.4261761 0.09650438 -1.567335
_jetEta[1] 0.7547328 2.350007 0.2507659 1.983726 -0.06600413 -0.968832 1.445821 -0.3211527 0.9606606 2.013326
_jetEta[2] 2.187937 1.769622 -2.041458 -1.218641 0.2972394 -1.07076 0.795118 -1.440986 -1.440986 -0.6855305
_jetEta[3] 1.378695 0.8559079 -0.338257 -1.124829 0.2239791 0.2590305 2.087603 -2.209866 -2.209866 -0.2741159
_jetPhi[0] 0.570747 -0.698512 2.720121 -0.2748311 0.4112687 3.097615 0.0220625 1.225223 1.383983 2.19616
_jetPhi[1] 0.00877945 -3.06715 2.421816 -1.649537 -2.688955 0.07440661 2.411934 2.483732 -0.9537438 -0.3216435
_jetPhi[2] 3.139925 1.603137 -1.306127 -0.8832337 -1.039161 1.583953 -1.402697 -1.690671 -1.690671 -1.25159
_jetPhi[3] -3.060726 2.27073 -2.512224 1.708376 1.425687 -1.259694 -0.3276829 -1.639461 -1.639461 1.00956
_jetPt[0] 182.602 104.7221 69.04283 76.95268 67.20926 163.7851 73.92871 78.08574 53.70997 101.3043
_jetPt[1] 144.6194 89.87665 46.49815 30.9122 51.42277 86.4478 73.22262 73.09498 46.77441 44.64951
_jetPt[2] 123.0686 81.25346 30.41609 57.10862 47.05657 36.5023 53.64611 53.32458 53.32458 40.02017
_jetPt[3] 111.7496 30.59422 62.58019 46.97557 41.20134 57.09789 33.01599 49.50954 49.50954 37.12507
_lE 547.6212 36.53807 13.42121 22.84068 41.79377 117.7691 50.81778 76.09296 65.18059 19.04598
_lEta 2.18472 0.8612641 -0.4893539 -0.1313979 0.1996749 -0.967459 1.584489 -0.32702 0.9698986 0.2499308
_lPhi 3.141263 2.323801 0.3948563 2.691464 1.4142 0.06857192 -2.41168 2.497297 -0.9556486 2.794145
_lPt 121.6845 26.20365 11.9597 22.64491 40.97423 78.21811 19.99972 72.19781 43.21166 18.46622
_lepNum[0] 8.589945e+08 8.589945e+08 8.589945e+08 8.589945e+08 8.589945e+08 8.589945e+08 8.589945e+08 8.589945e+08 8.589945e+08 8.589945e+08
_lepNum[1] 8.422165e+08 8.422165e+08 8.422165e+08 8.422165e+08 8.422165e+08 8.422165e+08 8.422165e+08 8.422165e+08 8.422165e+08 8.422165e+08
_lepNum[2] 8.759719e+08 8.759719e+08 8.759719e+08 8.759719e+08 8.759719e+08 8.759719e+08 8.759719e+08 8.759719e+08 8.759719e+08 8.759719e+08
_lepNum[3] 1.635007e+09 1.635007e+09 1.635007e+09 1.635007e+09 1.635007e+09 1.635007e+09 1.635007e+09 1.635007e+09 1.635007e+09 1.635007e+09
_lepNum[4] 1.718504e+09 1.718504e+09 1.718504e+09 1.718504e+09 1.718504e+09 1.718504e+09 1.718504e+09 1.718504e+09 1.718504e+09 1.718504e+09
_lepNum[5] 7.254475e+08 7.254475e+08 7.254475e+08 7.254475e+08 7.254475e+08 7.254475e+08 7.254475e+08 7.254475e+08 7.254475e+08 7.254475e+08
_lepNum[6] 9.14041e+08 9.14041e+08 9.14041e+08 9.14041e+08 9.14041e+08 9.14041e+08 9.14041e+08 9.14041e+08 9.14041e+08 9.14041e+08
_lepNum[7] 9.271517e+08 9.271517e+08 9.271517e+08 9.271517e+08 9.271517e+08 9.271517e+08 9.271517e+08 9.271517e+08 9.271517e+08 9.271517e+08
_lepeta[0] 1.140736e+243 1.140736e+243 1.140736e+243 1.140736e+243 1.140736e+243 1.140736e+243 1.140736e+243 1.140736e+243 1.140736e+243 1.140736e+243
_lepeta[1] 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52
_lepeta[2] 5.161725e-109 5.161725e-109 5.161725e-109 5.161725e-109 5.161725e-109 5.161725e-109 5.161725e-109 5.161725e-109 5.161725e-109 5.161725e-109
_lepeta[3] 5.265186e+170 5.265186e+170 5.265186e+170 5.265186e+170 5.265186e+170 5.265186e+170 5.265186e+170 5.265186e+170 5.265186e+170 5.265186e+170
_lepiso[0] 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52
_lepiso[1] 4.2613e+257 4.2613e+257 4.2613e+257 4.2613e+257 4.2613e+257 4.2613e+257 4.2613e+257 4.2613e+257 4.2613e+257 4.2613e+257
_lepiso[2] 3.984756e+252 3.984756e+252 3.984756e+252 3.984756e+252 3.984756e+252 3.984756e+252 3.984756e+252 3.984756e+252 3.984756e+252 3.984756e+252
_lepiso[3] 1.256258e-71 1.256258e-71 1.256258e-71 1.256258e-71 1.256258e-71 1.256258e-71 1.256258e-71 1.256258e-71 1.256258e-71 1.256258e-71
_leppdg[0] 1.887008e+09 1.887008e+09 1.887008e+09 1.887008e+09 1.887008e+09 1.887008e+09 1.887008e+09 1.887008e+09 1.887008e+09 1.887008e+09
_leppdg[1] 1.818191e+09 1.818191e+09 1.818191e+09 1.818191e+09 1.818191e+09 1.818191e+09 1.818191e+09 1.818191e+09 1.818191e+09 1.818191e+09
_leppdg[2] 9.430734e+08 9.430734e+08 9.430734e+08 9.430734e+08 9.430734e+08 9.430734e+08 9.430734e+08 9.430734e+08 9.430734e+08 9.430734e+08
_leppdg[3] 1.650538e+09 1.650538e+09 1.650538e+09 1.650538e+09 1.650538e+09 1.650538e+09 1.650538e+09 1.650538e+09 1.650538e+09 1.650538e+09
_leppdg[4] 8.92745e+08 8.92745e+08 8.92745e+08 8.92745e+08 8.92745e+08 8.92745e+08 8.92745e+08 8.92745e+08 8.92745e+08 8.92745e+08
_leppdg[5] 1.702126e+09 1.702126e+09 1.702126e+09 1.702126e+09 1.702126e+09 1.702126e+09 1.702126e+09 1.702126e+09 1.702126e+09 1.702126e+09
_leppdg[6] 1.768453e+09 1.768453e+09 1.768453e+09 1.768453e+09 1.768453e+09 1.768453e+09 1.768453e+09 1.768453e+09 1.768453e+09 1.768453e+09
_leppdg[7] 9.925561e+08 9.925561e+08 9.925561e+08 9.925561e+08 9.925561e+08 9.925561e+08 9.925561e+08 9.925561e+08 9.925561e+08 9.925561e+08
_lepphi[0] 5.398424e-62 5.398424e-62 5.398424e-62 5.398424e-62 5.398424e-62 5.398424e-62 5.398424e-62 5.398424e-62 5.398424e-62 5.398424e-62
_lepphi[1] 3.697225e-57 3.697225e-57 3.697225e-57 3.697225e-57 3.697225e-57 3.697225e-57 3.697225e-57 3.697225e-57 3.697225e-57 3.697225e-57
_lepphi[2] 3.377951e-57 3.377951e-57 3.377951e-57 3.377951e-57 3.377951e-57 3.377951e-57 3.377951e-57 3.377951e-57 3.377951e-57 3.377951e-57
_lepphi[3] 5.931601e+228 5.931601e+228 5.931601e+228 5.931601e+228 5.931601e+228 5.931601e+228 5.931601e+228 5.931601e+228 5.931601e+228 5.931601e+228
_leppt[0] 3.10212e+223 3.10212e+223 3.10212e+223 3.10212e+223 3.10212e+223 3.10212e+223 3.10212e+223 3.10212e+223 3.10212e+223 3.10212e+223
_leppt[1] 2.682873e+25 2.682873e+25 2.682873e+25 2.682873e+25 2.682873e+25 2.682873e+25 2.682873e+25 2.682873e+25 2.682873e+25 2.682873e+25
_leppt[2] 6.269648e+15 6.269648e+15 6.269648e+15 6.269648e+15 6.269648e+15 6.269648e+15 6.269648e+15 6.269648e+15 6.269648e+15 6.269648e+15
_leppt[3] 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52 2.213774e-52
_lepvx[0] 1.725073e-47 1.725073e-47 1.725073e-47 1.725073e-47 1.725073e-47 1.725073e-47 1.725073e-47 1.725073e-47 1.725073e-47 1.725073e-47
_lepvx[1] 8.916406e+252 8.916406e+252 8.916406e+252 8.916406e+252 8.916406e+252 8.916406e+252 8.916406e+252 8.916406e+252 8.916406e+252 8.916406e+252
_lepvx[2] 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214
_lepvx[3] 5.043588e+174 5.043588e+174 5.043588e+174 5.043588e+174 5.043588e+174 5.043588e+174 5.043588e+174 5.043588e+174 5.043588e+174 5.043588e+174
_lepvy[0] 2.317793e-52 2.317793e-52 2.317793e-52 2.317793e-52 2.317793e-52 2.317793e-52 2.317793e-52 2.317793e-52 2.317793e-52 2.317793e-52
_lepvy[1] 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214 1.439799e+214
_lepvy[2] 1.677757e-307 1.677757e-307 1.677757e-307 1.677757e-307 1.677757e-307 1.677757e-307 1.677757e-307 1.677757e-307 1.677757e-307 1.677757e-307
_lepvy[3] 8.47785e+175 8.47785e+175 8.47785e+175 8.47785e+175 8.47785e+175 8.47785e+175 8.47785e+175 8.47785e+175 8.47785e+175 8.47785e+175
_lepvz[0] 1.945983e-114 1.945983e-114 1.945983e-114 1.945983e-114 1.945983e-114 1.945983e-114 1.945983e-114 1.945983e-114 1.945983e-114 1.945983e-114
_lepvz[1] 0 0 0 0 0 0 0 0 0 0
_lepvz[2] 1.042568e-23 1.042568e-23 1.042568e-23 1.042568e-23 1.042568e-23 1.042568e-23 1.042568e-23 1.042568e-23 1.042568e-23 1.042568e-23
_lepvz[3] 2.429128e-52 2.429128e-52 2.429128e-52 2.429128e-52 2.429128e-52 2.429128e-52 2.429128e-52 2.429128e-52 2.429128e-52 2.429128e-52
_lumiBlock 291757 291757 291757 291757 291757 291757 291757 291757 291757 291758
_met 137.7307 32.32843 151.7995 130.0223 60.59154 65.21434 48.02085 70.61013 108.604 81.14087
_met_phi -2.011114 -1.244335 2.900481 2.167583 -2.563787 -0.8025349 -0.5175849 -1.559674 -2.467439 -0.331173
_mometa 2.302222 3.362761 -0.5652305 1.317709 -0.9114496 -1.355317 1.589987 -0.8598338 0.9503347 -1.870228
_mompdg 24 -24 -24 -24 24 24 24 24 24 -24
_momphi -2.810621 0.920126 2.871244 1.771453 1.737519 -0.4144169 -0.8429753 2.584181 -1.91425 0.3524473
_mompt 203.0457 13.15634 141.0195 63.64847 65.56386 172.7824 26.6573 74.56921 90.06423 31.15783
_mt 138.7174 56.89191 80.94471 28.10282 91.06537 60.26669 50.30944 128.1017 93.98092 77.41488
_nHard 0 0 0 0 0 0 0 0 0 0
_n_Jets 4 4 3 2 4 3 4 5 2 5
_n_PV 19 18 10 13 16 15 17 23 14 10
_n_bJets 0 2 0 1 1 1 2 1 1 0
_origin 0 0 0 0 0 0 0 0 0 0
_originReduced 0 0 0 0 0 0 0 0 0 0
_partonIdMatched 21 21 1 22 -4 21 -1 1 3 1
_ptRelAll 0.4232437 1.399887 0.3335256 0.3064592 1.098604 0.4686017 0.180344 1.067955 0.4089948 0.3206864
_runNb 1 1 1 1 1 1 1 1 1 1
_sameParton True True True True True True True True True True
hJet_JECUnc 0.01309479 0.01873907 0.05916677 0.02714176 0.01475246 0.01020982 0.03617719 0.01065434 0.01427092 0.03315894
hJet_SoftLeptId95 1 1 1 1 1 1 1 1 1 1
hJet_SoftLeptIdlooseMu 1 1 1 1 1 1 1 1 1 1
hJet_SoftLeptPt 121.6845 26.20365 11.9597 22.64491 40.97423 78.21811 19.99972 72.19781 43.21166 18.46622
hJet_SoftLeptdR 0.003483663 0.05333991 0.02775094 0.01354293 0.02688196 0.005994058 0.009054836 0.01477894 0.009432347 0.01733169
hJet_SoftLeptptRel 0.4232437 1.399887 0.3335256 0.3064592 1.098604 0.4686017 0.180344 1.067955 0.4089948 0.3206864
hJet_cef 0.01064075 0.7378582 0.01966804 0.8936263 0.8090361 0.07389928 0.8403727 0.0122704 0.01957046 0.7771393
hJet_e 584.4357 48.8008 18.09488 26.25952 49.26529 141.0035 63.57457 84.54328 75.57942 21.34508
hJet_eta 2.187937 0.8559079 -0.4669839 -0.1416299 0.2239791 -0.968832 1.593516 -0.3211527 0.9606606 0.2329774
hJet_genPt 13.77892 12.58294 5.780418 0.130225 2.502018 5.861141 6.678652 47.60433 8.751426 5.269415
hJet_nconstituents 8 14 11 12 13 22 14 13 17 10
hJet_phi 3.139925 2.27073 0.3784338 2.682592 1.425687 0.07440661 -2.41098 2.483732 -0.9537438 2.790544
hJet_pt 129.4514 34.94256 16.05413 25.75589 47.61356 93.20529 24.75818 80.05013 50.31358 20.63392
hJet_ptLeadTrack 0 0 0 0 0 0 0 0 0 0
hJet_ptRaw 125.2402 37.03648 16.97453 28.62181 51.52688 94.43665 26.7755 83.19693 50.53211 23.69128
hJet_vtx3dL 0 0 0 0 0 0 0 0 0 0
hJet_vtx3deL 0 0 0 0 0 0 0 0 0 0
hJet_vtxMass 0 0 0 0 0 0 0 0 0 0
hJet_vtxPt 0 0 0 0 0 0 0 0 0 0

Checking that there are no constant columns


In [8]:
# Take the first 1000 prompt entries of the columns we intend to read and compute
# the per-column standard deviation.
prompt_test = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=sorted(read_columns), stop=1000))
stds = prompt_test.std()
# Columns whose spread is numerically zero in this small sample are only *candidates*:
# they are re-checked on 100000 entries of both files in the next cells.
# NOTE(review): a column whose std evaluates to NaN fails `stds < 1e-8` and is not flagged here.
suspicious_columns = stds[stds < 1e-8].index

In [9]:
# Show the flagged column names as a plain list
list(suspicious_columns)


Out[9]:
['_closeJetPtAllstatus',
 '_csExercise',
 '_hardIeta[1]',
 '_hardIeta[2]',
 '_hardIphi[1]',
 '_hardIphi[3]',
 '_hardIpt[0]',
 '_isloose',
 '_nHard',
 '_originReduced',
 '_runNb',
 '_sameParton',
 'hJet_SoftLeptId95',
 'hJet_SoftLeptIdlooseMu',
 'hJet_ptLeadTrack',
 'hJet_vtx3dL',
 'hJet_vtx3deL',
 'hJet_vtxMass',
 'hJet_vtxPt']

In [10]:
# Re-check the flagged columns on the first 100000 prompt entries and tabulate mean and std.
# Rows with std == 0 are constant in this sample; very large means / NaN std in the output
# suggest that some of these branches hold unusable values — TODO confirm with the data producer.
prompt_check = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=suspicious_columns, stop=100000))
pandas.DataFrame({'mean': prompt_check.mean(), 'std': prompt_check.std() })


Out[10]:
mean std
_closeJetPtAllstatus 3.000000e+00 0.000000e+00
_csExercise 1.000000e+00 0.000000e+00
_hardIeta[1] 6.364745e+63 8.737390e+63
_hardIeta[2] 2.534294e+228 NaN
_hardIphi[1] 3.453504e+213 NaN
_hardIphi[3] 1.006585e-71 6.508192e-72
_hardIpt[0] 1.818479e+170 NaN
_isloose 1.000000e+00 0.000000e+00
_nHard 0.000000e+00 0.000000e+00
_originReduced 0.000000e+00 0.000000e+00
_runNb 1.000000e+00 0.000000e+00
_sameParton 1.000000e+00 0.000000e+00
hJet_SoftLeptId95 1.000000e+00 0.000000e+00
hJet_SoftLeptIdlooseMu 1.000000e+00 0.000000e+00
hJet_ptLeadTrack 0.000000e+00 0.000000e+00
hJet_vtx3dL 0.000000e+00 0.000000e+00
hJet_vtx3deL 0.000000e+00 0.000000e+00
hJet_vtxMass 0.000000e+00 0.000000e+00
hJet_vtxPt 0.000000e+00 0.000000e+00

In [11]:
# Same check on the first 100000 entries of the fakes file: a column that is constant for
# prompts may still vary for fakes (compare the two output tables).
fake_check = pandas.DataFrame(root_numpy.root2array(fakes_filename, treename=treename, branches=suspicious_columns, stop=100000))
pandas.DataFrame({'mean': fake_check.mean(), 'std': fake_check.std() })


Out[11]:
mean std
_closeJetPtAllstatus 3.000000e+00 0.000000e+00
_csExercise 8.109100e-01 3.915821e-01
_hardIeta[1] 1.456928e-52 7.035390e-53
_hardIeta[2] 4.809618e-62 2.322527e-62
_hardIphi[1] 1.409497e+247 NaN
_hardIphi[3] 1.030235e+242 NaN
_hardIpt[0] 3.781800e-01 7.831641e-01
_isloose 1.000000e+00 0.000000e+00
_nHard 0.000000e+00 0.000000e+00
_originReduced 1.192750e+00 4.877702e-01
_runNb 1.000000e+00 0.000000e+00
_sameParton 8.109100e-01 3.915821e-01
hJet_SoftLeptId95 1.000000e+00 0.000000e+00
hJet_SoftLeptIdlooseMu 1.000000e+00 0.000000e+00
hJet_ptLeadTrack 0.000000e+00 0.000000e+00
hJet_vtx3dL 0.000000e+00 0.000000e+00
hJet_vtx3deL 0.000000e+00 0.000000e+00
hJet_vtxMass 0.000000e+00 0.000000e+00
hJet_vtxPt 0.000000e+00 0.000000e+00

Reading the data


In [12]:
# Upper bound on the number of entries read from each of the two files
n_events = 10 ** 6

In [13]:
# Load at most n_events entries of the selected columns from each file into a DataFrame
prompt_data = pandas.DataFrame(
    root_numpy.root2array(prompt_filename, treename=treename, branches=read_columns, stop=n_events)
)

fake_data = pandas.DataFrame(
    root_numpy.root2array(fakes_filename, treename=treename, branches=read_columns, stop=n_events)
)

Plotting the distributions


In [14]:
# One panel per column, three panels per row: overlaid normalised histograms of prompt and fake.
# NOTE(review): figure/subplot/hist/legend/title and numpy are not imported in the visible
# cells — presumably the kernel runs in pylab mode; confirm before a Restart & Run All.
columns = sorted(prompt_data.columns)
figure(figsize=[18, 100])
for i, column in enumerate(columns, 1):
    subplot((len(columns) + 2) // 3, 3, i)
    # Common x-range for both samples: 1st to 99th percentile of their union
    concatenated = numpy.concatenate([prompt_data[column], fake_data[column]])
    limits = numpy.percentile(concatenated, [1, 99])
    for sample_label, sample in [('prompt', prompt_data), ('fake', fake_data)]:
        hist(sample[column], bins=30, normed=True, range=limits, alpha=0.3, label=sample_label)
    legend(loc='best')
    title(column)


Separate histograms for electrons and muons


In [15]:
# Prompt vs. fake distributions of two selected features, drawn separately for each
# lepton flavour (left/right panel follows the flavour code).
for column in ['hJet_cef', '_isolationComponents[3]']:
    figure(figsize=[17, 6])
    # The x-range is built from all leptons regardless of flavour, so it is computed
    # once per column instead of once per panel; both panels share the same range.
    concatenated = numpy.concatenate([prompt_data[column], fake_data[column]])
    limits = numpy.percentile(concatenated, [1, 99])
    for flavour, flavour_name in flavours.items():
        subplot(1, 2, flavour + 1)
        # .loc with a boolean row mask and a column label replaces the deprecated .ix indexer
        hist(prompt_data.loc[prompt_data._flavors == flavour, column].values,
             bins=30, normed=True, range=limits, alpha=0.3, label='prompt')
        hist(fake_data.loc[fake_data._flavors == flavour, column].values,
             bins=30, normed=True, range=limits, alpha=0.3, label='fake')
        legend()
        title(column + ", " + flavour_name)


Features we use in training


In [16]:
# Stack the prompt rows first and the fake rows after them in a single frame
data = pandas.concat([prompt_data, fake_data], ignore_index=True)
# Three IP features contain the events with huge values not convertible to float32
data = numpy.clip(data, -1e20, 1e20)
# Labels in the same row order as `data`: 1 = prompt, 0 = fake.
# Built directly as integer arrays instead of a multi-million element Python list.
labels = numpy.concatenate([numpy.ones(len(prompt_data), dtype=int),
                            numpy.zeros(len(fake_data), dtype=int)])

# 20% of the rows go to training, the rest to testing. The seed is fixed so that the
# split (and everything computed from it) is the same after a kernel restart.
trainX, testX, trainY, testY = train_test_split(data, labels, train_size=0.2, random_state=42)

In [17]:
# Sizes of the training and test samples
len(trainX), len(testX)


Out[17]:
(400000, 1600000)

Training classifiers


In [18]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier
from hep_ml.experiments import metaclassifiers

In [19]:
# Collection of classifiers to train and compare; it holds a single entry, 'gb'.
classifiers = hep_ml.ClassifiersDict()

base_gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.03,
    max_depth=8,
    max_features=8,
    min_samples_split=300,
    subsample=0.3)
# DumbSplitter is given the '_flavors' column: it keeps one model per flavour
# (they are accessed as `.classifiers[flavour]` in the feature-importance cell).
base_splitter = metaclassifiers.DumbSplitter(base_estimator=base_gb, feature_name='_flavors')
# NOTE(review): HidingClassifier presumably restricts the input to `train_variables` - confirm in hep_ml.
classifiers['gb'] = HidingClassifier(base_estimator=base_splitter, train_variables=train_variables)

In [20]:
# Train on the leptons of both flavours.
# An electrons-only mask (`trainX._flavors == 0`) used to be assigned here and was overwritten
# on the very next line by `trainX._flavors == trainX._flavors`, so it never had any effect.
# The dead assignment is removed and the "select everything" mask is written explicitly
# (identical to the self-comparison as long as `_flavors` contains no NaN).
train_mask = numpy.ones(len(trainX), dtype=bool)
classifiers.fit(trainX[train_mask], trainY[train_mask])
# Evaluation is always done on the full test part.
predictions = classifiers.test_on(testX, testY)


Classifier           gb is learnt in 154.48 seconds
Totally spent 154.48 seconds on training

In [21]:
# ROC of the trained classifier on the whole test part
predictions.roc()


Out[21]:
<hep_ml.reports.Predictions at 0xcf197d0>

In [22]:
# Learning curves on the test part.
# NOTE(review): step=10 presumably means "evaluate every 10th boosting stage" - confirm in hep_ml.reports.
predictions.learning_curves(step=10)


/mnt/w76/venv/py27/local/lib/python2.7/site-packages/matplotlib/axes.py:4747: UserWarning: No labeled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labeled objects found. "
Out[22]:
<hep_ml.reports.Predictions at 0xcf197d0>

Feature importances


In [32]:
# Training variables ranked by importance, printed separately for the model of each flavour.
splitter = classifiers['gb']._trained_estimator  # the fitted splitter wrapped by HidingClassifier
for flavour, flavour_name in flavours.items():
    gb = splitter.classifiers[flavour]
    importances = gb.feature_importances_
    feature_imps = pandas.Series(data=importances, index=train_variables)
    # in-place sort, most important variable first
    feature_imps.sort(ascending=False)
    print_header(flavour_name)
    print(feature_imps)


electrons

_lPt / _closeJetPtAll      0.230678
_isolationComponents[0]    0.198296
_isolation                 0.096916
_closeJetCSVAll            0.091087
_3dIPsig                   0.066135
_ptRelAll                  0.060856
_isolationComponents[2]    0.054387
hJet_cef                   0.037090
_3dIP                      0.032069
_closeJetAngAll            0.027564
_isolationComponents[3]    0.018690
hJet_nconstituents         0.017906
_ipPV                      0.013323
_PVerr[1]                  0.010525
_PVerr[0]                  0.007883
_closeJetEtaAll            0.007509
_3dIPerr                   0.005694
_PVerr[2]                  0.005445
_PVchi2                    0.004418
_isolationComponents[1]    0.003921
_ipPVerr                   0.002954
_ipZPVerr                  0.002418
_closeJetPhiAll            0.002157
_ipZPV                     0.002078
_flavors                   0.000000
hJet_vtx3deL               0.000000
hJet_vtxPt                 0.000000
hJet_vtx3dL                0.000000
hJet_vtxMass               0.000000
dtype: float64

muons

_isolation                 0.227140
_isolationComponents[0]    0.183779
_lPt / _closeJetPtAll      0.177757
hJet_cef                   0.075157
_isolationComponents[2]    0.072705
_3dIPsig                   0.062400
_closeJetCSVAll            0.046959
_ptRelAll                  0.037369
_ipPV                      0.026193
_3dIP                      0.025949
_closeJetAngAll            0.019523
hJet_nconstituents         0.008424
_PVerr[0]                  0.005689
_isolationComponents[1]    0.004995
_PVerr[1]                  0.003957
_ipPVerr                   0.003836
_PVchi2                    0.003507
_PVerr[2]                  0.003407
_isolationComponents[3]    0.003165
_ipZPV                     0.001931
_closeJetEtaAll            0.001779
_ipZPVerr                  0.001577
_closeJetPhiAll            0.001518
_3dIPerr                   0.001285
hJet_vtx3deL               0.000000
_flavors                   0.000000
hJet_vtx3dL                0.000000
hJet_vtxPt                 0.000000
hJet_vtxMass               0.000000
dtype: float64

By flavour


In [24]:
# For each flavour: ROC on all test leptons of that flavour, the istight working point,
# and one ROC per fake origin (prompt vs. fakes of that origin only).
for flavour, flavour_name in flavours.items():
    is_flavour = testX._flavors == flavour
    flavour_mask = numpy.array(is_flavour)
    classifiers.test_on(testX[flavour_mask], testY[flavour_mask]).roc()
    # istight is used as a score: the second ROC point is plotted as a single marker
    # (presumably the working point of the tight cut - depends on roc_curve's output layout).
    fpr, tpr, _ = roc_curve(testY[flavour_mask], testX._istight[flavour_mask])
    plot(tpr[1:2], 1 - fpr[1:2], 'o', label='tight')
    title(flavour_name)
    xlim(0.5, 1.)
    ylim(0.5, 1.)
    for origin, origin_name in origins.items():
        # keep prompt leptons (origin 0) plus fakes of the current origin
        prompt_or_origin = testX._originReduced.isin([0, origin])
        mask = numpy.array(is_flavour & prompt_or_origin)
        preds = classifiers['gb'].predict_proba(testX[mask])[:, 1]
        plot_roc(testY[mask], preds, classifier_name=origin_name)
    legend(loc='lower left')
    grid()


Measuring the improvement at the same signal efficiency (or the same background rejection) as the tight cut provides


In [25]:
# For each flavour, compare the classifier with the istight flag at istight's own working point:
# the tpr reached at istight's fpr, and the fpr reached at istight's tpr.
# The comparison is printed for all leptons of the flavour and then per fake origin.
# Each selection is predicted exactly once (the flavour-wide prediction used to be computed twice,
# and the second result was overwritten before being used).
for flavour, flavour_name in flavours.items():
    is_flavour = testX._flavors == flavour
    mask = numpy.array(is_flavour)
    # comparing with istight: the second ROC point is taken as its working point
    fpr_base, tpr_base, _ = roc_curve(testY[mask], testX._istight[mask])
    fpr_base = fpr_base[1]
    tpr_base = tpr_base[1]
    print('%s istight' % flavour_name)
    print('istight fpr %s' % fpr_base)
    print('istight tpr %s' % tpr_base)

    # 'all' = every lepton of this flavour; then prompt (origin 0) vs. fakes of one origin at a time
    selections = [('all', is_flavour)]
    for origin, origin_name in origins.items():
        prompt_or_origin = (testX._originReduced == 0) | (testX._originReduced == origin)
        selections.append((origin_name, is_flavour & prompt_or_origin))

    for selection_name, selection in selections:
        mask = numpy.array(selection)
        preds = classifiers['gb'].predict_proba(testX[mask])[:, 1]
        fpr, tpr, _ = roc_curve(testY[mask], preds)
        print('%s %s' % (flavour_name, selection_name))
        print('tpr at the same fpr %s' % numpy.interp(fpr_base, fpr, tpr))
        print('fpr at the same tpr %s' % numpy.interp(tpr_base, tpr, fpr))


electrons istight
istight fpr 0.0127384693734
istight tpr 0.586618233222
electrons all
tpr at the same fpr 0.753959839637
fpr at the same tpr 0.00568909021463
electrons b
tpr at the same fpr 0.877163862625
fpr at the same tpr 0.00103980986334
electrons c
tpr at the same fpr 0.768689747868
fpr at the same tpr 0.00411743644826
electrons uds
tpr at the same fpr 0.412795060397
fpr at the same tpr 0.0289732294138
muons istight
istight fpr 0.00449789396526
istight tpr 0.689271375236
muons all
tpr at the same fpr 0.884310095382
fpr at the same tpr 0.000588552575571
muons b
tpr at the same fpr 0.890798633251
fpr at the same tpr 0.000429385519283
muons c
tpr at the same fpr 0.87580749425
fpr at the same tpr 0.000833796553641
muons uds
tpr at the same fpr 0.524489760325
fpr at the same tpr 0.00831264169276

ROC for electrons / muons that passed tight cut + separately for origins


In [26]:
# Same ROC comparison as above, restricted to leptons that already pass the tight cut.
for flavour, flavour_name in flavours.items():
    tight_of_flavour = (testX._flavors == flavour) & (testX._istight == 1)
    mask = numpy.array(tight_of_flavour)
    classifiers.test_on(testX[mask], testY[mask]).roc()
    for origin, origin_name in origins.items():
        # prompt leptons (origin 0) plus fakes of the current origin
        prompt_or_origin = testX._originReduced.isin([0, origin])
        mask = numpy.array(tight_of_flavour & prompt_or_origin)
        preds = classifiers['gb'].predict_proba(testX[mask])[:, 1]
        plot_roc(testY[mask], preds, classifier_name=origin_name)

    legend(loc='lower left')
    title(flavour_name + ' (istight == 1)')
    grid()


Efficiencies for leptons


In [27]:
# Only leptons with _lPt < 250 are kept, otherwise there are many empty bins.
for flavour, flavour_name in flavours.items():
    low_pt_of_flavour = (testX._flavors == flavour) & (testX._lPt < 250)
    mask = numpy.array(low_pt_of_flavour)
    pred = classifiers.test_on(testX[mask], testY[mask])

    # each report call is followed by the title for the plot it has just drawn
    pred.hist(['_lPt'])
    title(flavour_name)

    pred.efficiency(['_lPt'], label=1)
    title(flavour_name + ' Signal efficiency')

    pred.efficiency(['_lPt'], label=0)
    title(flavour_name + ' Background rejection')


ROC for different values of _n_bJets


In [28]:
# One figure per flavour, one ROC curve per group of b-jet multiplicities:
# exactly 0, exactly 1, exactly 2, and 3..9 merged together.
bjet_groups = [[0], [1], [2], range(3, 10)]
for flavour, flavour_name in flavours.items():
    figure(figsize=[14, 10])
    is_flavour = testX._flavors == flavour
    for min_jets in bjet_groups:
        in_group = numpy.in1d(testX._n_bJets, min_jets)
        mask = numpy.array(is_flavour & in_group)
        preds = classifiers['gb'].predict_proba(testX[mask])[:, 1]
        plot_roc(testY[mask], preds, classifier_name="n_bjets in {}".format(min_jets))
        # zoom into the high-efficiency / high-rejection corner
        xlim(0.7, 1.)
        ylim(0.7, 1.)
    grid()
    title(flavour_name)
    legend(loc='lower left')


Selecting cut on classifier

separately for electrons and muons


In [90]:
def compute_cuts(filename, treename, train_variables, classifier, left_events_per_flavour=5000):
    """Find, for every lepton flavour, the classifier threshold that keeps a fixed number of events.

    The events are read from a ROOT file, scored with ``classifier`` and, separately for each
    flavour listed in the notebook-level ``flavours`` dict, the cut is placed so that
    ``left_events_per_flavour`` events have a prediction strictly above it (fewer if several
    events share the threshold value).

    :param filename: path of the ROOT file to read
    :param treename: name of the tree inside the file
    :param train_variables: branches / expressions passed to the classifier
    :param classifier: fitted classifier with ``predict_proba``; column 1 is used as the score
    :param left_events_per_flavour: number of events that should pass the cut in each flavour
    :return: dict {flavour code: cut value}; the cut is ``-inf`` (everything passes) for a flavour
        that has no more than ``left_events_per_flavour`` events
    """
    data = pandas.DataFrame(root_numpy.root2array(filename, treename=treename, branches=train_variables))
    # Same clipping as everywhere else data is fed to the classifier in this notebook:
    # some IP features contain huge values that are not convertible to float32.
    data = numpy.clip(data, -1e20, 1e20)
    data_flavours = pandas.DataFrame(
        root_numpy.root2array(filename, treename=treename, branches=['_flavors']))['_flavors']
    predictions = classifier.predict_proba(data)[:, 1]
    result = {}
    for flavour, flavour_name in flavours.items():
        mask = numpy.array(data_flavours == flavour)
        masked_prediction = predictions[mask]
        if len(masked_prediction) > left_events_per_flavour:
            # the (N+1)-th largest prediction: exactly N events lie strictly above it (barring ties)
            cut = numpy.sort(masked_prediction)[- left_events_per_flavour - 1]
        else:
            # not enough events to throw any away (this used to raise an IndexError)
            cut = -numpy.inf
        n_passed = numpy.sum(masked_prediction > cut)
        n_not_passed = numpy.sum(masked_prediction <= cut)
        print('%s %s' % (flavour_name, cut))
        print('passed %s not passed %s' % (n_passed, n_not_passed))
        result[flavour] = cut
    return result

In [91]:
# Thresholds are derived on the fakes file: per flavour, the cut that leaves the default
# number of fake leptons (left_events_per_flavour) above it.
cuts = compute_cuts(fakes_filename, treename, train_variables, classifiers['gb'])


electrons 0.935238759386 passed 5000
not passed 258365
muons 0.890498071719 passed 5000
not passed 1142962

In [135]:
# {flavour code: threshold on the classifier output}
cuts


Out[135]:
{0: 0.93523875938645662, 1: 0.89049807171856099}

Comparing the events that passed the cut with those that didn't

TODO: exclude the events that were used in training


In [83]:
# Training variables of all prompt leptons in the file (no limit on the number of events),
# clipped to [-1e20, 1e20] like the training sample.
prompt_train_vars = numpy.clip(
    pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=train_variables)),
    -1e20, 1e20)

In [86]:
# Classifier score of every prompt lepton (column 1 of predict_proba)
prompt_predictions = classifiers['gb'].predict_proba(prompt_train_vars)[:, 1]

In [88]:
# Flavour code of every prompt lepton, read from the same file and tree as prompt_train_vars
# (assumed to come back in the same row order - both reads take the whole tree).
prompt_flavours = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=['_flavors']))['_flavors']

In [93]:
# Fraction of prompt leptons of each flavour whose score is above that flavour's cut
for flavour, flavour_name in flavours.items():
    print(flavour_name)
    of_flavour = numpy.array(prompt_flavours == flavour)
    above_cut = prompt_predictions[of_flavour] > cuts[flavour]
    print(above_cut.mean())


electrons
0.818576245078
muons
0.883270625204

Comparing fakes that passed the cut with those that didn't


In [127]:
# All fake leptons from the file with every column in read_columns,
# clipped to [-1e20, 1e20] like the training sample.
fake_test_data = numpy.clip(
    pandas.DataFrame(root_numpy.root2array(fakes_filename, treename=treename, branches=read_columns)),
    -1e20, 1e20)

In [128]:
# Classifier score of every fake lepton (column 1 of predict_proba)
fake_predictions = classifiers['gb'].predict_proba(fake_test_data)[:, 1]

In [131]:
def compute_passed(predictions, flavours, cuts):
    """Flag the events whose prediction is strictly above the cut of their own flavour.

    Note that the ``flavours`` argument (per-event flavour codes) shadows the notebook-level
    ``flavours`` dict inside this function.

    :param predictions: array of classifier outputs, one per event
    :param flavours: array or pandas.Series of flavour codes (0 - electrons, 1 - muons)
    :param cuts: mapping with keys 0 and 1 holding the threshold of each flavour
    :return: boolean mask of passed events; events with any other flavour code never pass
    """
    muon_passed = (flavours == 1) & (predictions > cuts[1])
    electron_passed = (flavours == 0) & (predictions > cuts[0])
    return muon_passed | electron_passed

In [132]:
# Boolean numpy array, one entry per row of fake_test_data: True if the fake passes its flavour's cut
fake_passed = numpy.array(compute_passed(fake_predictions, fake_test_data._flavors, cuts=cuts))

In [146]:
def plot_for_passed(dataset, ispassed):
    """For each flavour, overlay the distributions of passed and not-passed events for every column.

    One tall figure (3 subplots per row) is drawn and shown per flavour of the notebook-level
    ``flavours`` dict. The x-range of every subplot is the 0.1..99.9 percentile interval of the
    column over the whole ``dataset`` (both flavours, passed and not passed together).

    :param dataset: pandas.DataFrame containing a '_flavors' column; all of its columns are plotted
    :param ispassed: boolean array-like with one entry per row of ``dataset`` (same row order),
        True for the events that passed the cut
    """
    # Plain boolean array: makes `~` a logical negation and the masks purely positional.
    ispassed = numpy.asarray(ispassed, dtype=bool)
    plotted_columns = sorted(dataset.columns)
    # The grid is sized from the dataset that is actually plotted. It used to be sized from the
    # notebook-level `columns` variable, which only exists if another cell was run before and
    # may have a different length.
    n_rows = (len(plotted_columns) + 2) // 3
    for flavour, flavour_name in flavours.items():
        print_header(flavour_name)
        flavour_mask = numpy.array(dataset['_flavors'] == flavour)
        figure(figsize=[18, 100])
        for i, column in enumerate(plotted_columns, 1):
            subplot(n_rows, 3, i)
            limits = numpy.percentile(dataset[column], [.1, 99.9])
            # .loc replaces the deprecated .ix indexer (boolean row mask + column label)
            hist(dataset.loc[flavour_mask & ispassed, column].values,
                 bins=30, normed=True, range=limits, alpha=0.3, label='passed')
            hist(dataset.loc[flavour_mask & ~ispassed, column].values,
                 bins=30, normed=True, range=limits, alpha=0.3, label='not passed')
            legend(loc='best')
            title(column)
        show()

In [148]:
# All fakes: passed vs. not passed, per flavour and per column
plot_for_passed(fake_test_data, fake_passed)


electrons

muons

Comparing fakes that passed the cut with those that didn't, restricted to _isolation < 1


In [147]:
# Same comparison, keeping only the fakes with _isolation < 1
mask = (fake_test_data._isolation < 1).values
plot_for_passed(fake_test_data[mask], fake_passed[mask])


electrons

muons


In [ ]: