In this notebook we distinguish prompt leptons from leptons of different (non-prompt) origins


In [1]:
import numpy
import pandas
import root_numpy
import hep_ml
from hep_ml.commonutils import train_test_split
from hep_ml.reports import plot_roc
from sklearn.metrics import roc_curve
from matplotlib.pyplot import plot, title, grid, legend, show

In [2]:
folder = '/mnt/w76/notebook/datasets/fromLesya/ver3/'
prompt_filename = folder + 'tt_prompts.root'
fakes_filename  = folder + 'tt_fakes.root'

treename = "fakeTree"

Correspondence of the integer codes used in the data


In [3]:
flavours = {0: 'electrons', 1: 'muons'}
origins = {1: 'b', 2: 'c', 3: 'uds'} # and 0: prompt
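
These dictionaries can also decode the codes back into names, e.g. with pandas.Series.map; a small illustration (not part of the original analysis):

# illustration only: decoding integer codes with pandas.Series.map
example_flavours = pandas.Series([0, 1, 1])
print(example_flavours.map(flavours).tolist())  # ['electrons', 'muons', 'muons']
example_origins = pandas.Series([0, 1, 3])
print(example_origins.map(lambda o: origins.get(o, 'prompt')).tolist())  # ['prompt', 'b', 'uds']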

Substring markers for features that are excluded from training


In [4]:
all_columns = root_numpy.list_branches(prompt_filename, treename=treename)

markers = ['_mom', 'gen', '_lep', 'AllMC', '_n_', '_met', 
           'HT', '_closeJetPtAllstatus', '_charges', '_ipPVmc', '_isloose', '_istight',
           '_eventNb', '_runNb', '_lumiBlock', '_origin', '_originReduced',
           '_isolationMC', '_partonIdMatched', '_sameParton', 'hJet_SoftLeptId']

markers += ['_lE', '_lEta', '_lPhi', '_lPt', '_mt', 'hJet_e', 'hJet_Soft',
            'hJet_JECUnc', 'hJet_phi', 'hJet_pt', '_closeJetPtAll']
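
These act as substring filters: a column is excluded if its name contains any of the markers (the actual selection happens below in In [8]). A tiny illustration with a few column names from this dataset:

# illustration: a column is dropped if its name contains any marker substring
example_columns = ['_3dIPsig', '_lPt', '_isolation', '_isolationMC']
print([col for col in example_columns if not any(marker in col for marker in markers)])
# ['_3dIPsig', '_isolation']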

In [5]:
flattened_columns = dict()
for column in all_columns:
    data = root_numpy.root2array(prompt_filename, treename=treename, branches=[column], stop=10)
    try:
        # array-valued branch: create one element expression per component
        for index in range(len(data[0][0])):
            flattened_columns["{column}[{index}]".format(column=column, index=index)] = column + str(index)
    except TypeError:
        # scalar branch: keep as-is
        flattened_columns[column] = column
# for some strange reason this column is also added
flattened_columns.pop('_PVerr[3]')


Out[5]:
'_PVerr3'
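
This works because root_numpy accepts element expressions such as '_PVerr[0]' as branch names, so flattened_columns maps each expression to a flat column name. A quick sanity check (assuming the same file is available):

# sanity check: element expressions are valid branch names for root2array
check = root_numpy.root2array(prompt_filename, treename=treename,
                              branches=['_PVerr[0]', '_PVerr[1]'], stop=5)
print(check.dtype.names)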

Reading the data and splitting it into train/test parts

signal: _origin = 0 (prompt),
bck: _origin = 1 (labels = 1 - _origin, so signal events get label 1)


In [6]:
n_events = 1000000

In [7]:
read_columns = sorted(flattened_columns.keys())

data = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=read_columns, stop=n_events))
data['ptRatio'] = data.eval('_lPt / _closeJetPtAll')
# Three IP features contain events with huge values that are not convertible to float32
data = numpy.clip(data, -1e20, 1e20)
labels = 1 - data._origin

trainX, testX, trainY, testY = train_test_split(data, labels, train_size=0.2)
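
With train_size=0.2 only 20% of the events go into the training part. A quick look at the split and the label balance (a sanity check added here, output not shown in the original run):

# sizes of the two parts and the label balance (1 = prompt signal)
print(len(trainX), len(testX))
print(labels.value_counts())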

Features used in training


In [8]:
train_variables = [col for col in data.columns if not any(marker in col for marker in markers)]
sorted(train_variables)


Out[8]:
['_3dIP',
 '_3dIPerr',
 '_3dIPsig',
 '_PVchi2',
 '_PVerr[0]',
 '_PVerr[1]',
 '_PVerr[2]',
 '_closeJetAngAll',
 '_closeJetCSVAll',
 '_closeJetEtaAll',
 '_closeJetPhiAll',
 '_flavors',
 '_ipPV',
 '_ipPVerr',
 '_ipZPV',
 '_ipZPVerr',
 '_isolation',
 '_isolationComponents[0]',
 '_isolationComponents[1]',
 '_isolationComponents[2]',
 '_isolationComponents[3]',
 '_ptRelAll',
 'hJet_cef',
 'hJet_nconstituents',
 'hJet_vtx3dL',
 'hJet_vtx3deL',
 'hJet_vtxMass',
 'hJet_vtxPt',
 'ptRatio']

Training the classifier


In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier

In [10]:
classifiers = hep_ml.ClassifiersDict()
base_gb = GradientBoostingClassifier(subsample=0.2, n_estimators=200, max_depth=8, min_samples_split=300, 
                                     max_features=8, learning_rate=0.05)

classifiers['gb'] = HidingClassifier(base_estimator=base_gb, train_variables=train_variables)
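
HidingClassifier simply restricts the base estimator to train_variables, which lets us pass the full DataFrame to fit and predict. Without hep_ml it would look roughly like this (a sketch, skipping the reporting machinery):

# rough plain-sklearn equivalent: select the training columns by hand
plain_gb = GradientBoostingClassifier(subsample=0.2, n_estimators=200, max_depth=8,
                                      min_samples_split=300, max_features=8,
                                      learning_rate=0.05)
plain_gb.fit(trainX[train_variables], trainY)
proba = plain_gb.predict_proba(testX[train_variables])[:, 1]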

In [11]:
classifiers.fit(trainX, trainY)


Classifier           gb is learnt in 109.62 seconds
Totally spent 109.62 seconds on training
Out[11]:
ClassifiersDict([('gb', HidingClassifier(base_estimator=GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=8, max_features=8, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=300, n_estimators=200,
              random_state=None, subsample=0.2, verbose=0,
              warm_start=False),
         train_variables=['_3dIP', '_3dIPerr', '_3dIPsig', '_PVchi2', '_PVerr[0]', '_PVerr[1]', '_PVerr[2]', '_closeJetAngAll', '_closeJetCSVAll', '_closeJetEtaAll', '_closeJetPhiAll', '_flavors', '_ipPV', '_ipPVerr', '_ipZPV', '_ipZPVerr', '_isolation', '_isolationComponents[0]', '_isolationComponents[1]', '_isolationComponents[2]', '_isolationComponents[3]', '_ptRelAll', 'hJet_cef', 'hJet_nconstituents', 'hJet_vtx3dL', 'hJet_vtx3deL', 'hJet_vtxMass', 'hJet_vtxPt', 'ptRatio']))])

In [15]:
for flavour, flavour_name in flavours.items():
    mask = testX._flavors == flavour
    predictions = classifiers.test_on(testX[mask], testY[mask]).roc()
    # compare with the cut-based 'tight' selection as a single working point
    fpr, tpr, _ = roc_curve(testY[mask], testX._istight[mask])
    plot(tpr[1:2], 1 - fpr[1:2], 'o', label='tight')
    title(flavour_name)
    grid()
    legend()
    show()



In [14]:
predictions.learning_curves()


Out[14]:
<hep_ml.reports.Predictions at 0x6e2dd50>
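
learning_curves plots the test quality after each boosting stage. The same can be done by hand with sklearn's staged_predict_proba (a sketch; assumes the labels are binary and that HidingClassifier passes the columns in train_variables order):

# by-hand learning curve: test ROC AUC after each boosting stage
from sklearn.metrics import roc_auc_score
gb_est = classifiers['gb']._trained_estimator
staged_aucs = [roc_auc_score(testY, staged[:, 1])
               for staged in gb_est.staged_predict_proba(testX[train_variables])]
plot(staged_aucs)
title('test ROC AUC vs number of trees')
show()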

Feature importances


In [18]:
gb = classifiers['gb']._trained_estimator
feature_imps = pandas.Series(data=gb.feature_importances_, index=train_variables)
feature_imps = feature_imps.sort_values(ascending=False)
feature_imps


Out[18]:
_3dIPsig                   0.122946
_closeJetAngAll            0.077345
ptRatio                    0.067477
_ptRelAll                  0.066567
_isolationComponents[3]    0.062813
_3dIP                      0.058037
_ipPV                      0.055088
_closeJetCSVAll            0.037188
_3dIPerr                   0.036935
_closeJetEtaAll            0.034482
hJet_cef                   0.033311
_PVchi2                    0.031673
_ipPVerr                   0.030341
_PVerr[2]                  0.030043
_PVerr[0]                  0.028383
_ipZPVerr                  0.028304
_PVerr[1]                  0.027201
_closeJetPhiAll            0.026685
_ipZPV                     0.025860
_isolation                 0.021323
_isolationComponents[2]    0.021231
hJet_nconstituents         0.020410
_isolationComponents[1]    0.016525
_isolationComponents[0]    0.015663
_flavors                   0.008665
hJet_vtxPt                 0.004793
hJet_vtxMass               0.004792
hJet_vtx3dL                0.003007
hJet_vtx3deL               0.002914
dtype: float64
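
The same table is easier to scan as a plot; a minimal sketch using the pandas plotting API:

# horizontal bar chart of the sorted importances
feature_imps.plot(kind='barh', figsize=(6, 10))
title('gb feature importances')
show()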