In this notebook we distinguish prompt leptons from leptons of different (non-prompt) origins


In [1]:
import numpy
import pandas
import root_numpy
import hep_ml
from hep_ml.commonutils import train_test_split
from hep_ml.reports import plot_roc
from sklearn.metrics import roc_curve
from matplotlib.pyplot import plot, title, grid, legend, show

In [2]:
folder = '/mnt/w76/notebook/datasets/fromLesya/ver3/'
prompt_filename = folder + 'tt_prompts.root'
fakes_filename  = folder + 'tt_fakes.root'

treename = "fakeTree"

Correspondence of the integer codes used in the data


In [3]:
flavours = {0: 'electrons', 1: 'muons'}
origins = {1: 'b', 2: 'c', 3: 'uds'} # and 0: prompt
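
These dictionaries can also decode the codes back into names, e.g. with pandas.Series.map; a small illustration (not part of the original analysis):

# illustration only: decoding integer codes with pandas.Series.map
example_flavours = pandas.Series([0, 1, 1])
print(example_flavours.map(flavours).tolist())  # ['electrons', 'muons', 'muons']
example_origins = pandas.Series([0, 1, 3])
print(example_origins.map(lambda o: origins.get(o, 'prompt')).tolist())  # ['prompt', 'b', 'uds']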

Substring markers for features that are excluded from training


In [4]:
all_columns = root_numpy.list_branches(prompt_filename, treename=treename)

markers = ['_mom', 'gen', '_lep', 'AllMC', '_n_', '_met', 
           'HT', '_closeJetPtAllstatus', '_charges', '_ipPVmc', '_isloose', '_istight',
           '_eventNb', '_runNb', '_lumiBlock', '_origin', '_originReduced',
           '_isolationMC', '_partonIdMatched', '_sameParton', 'hJet_SoftLeptId']

markers += ['_lE', '_lEta', '_lPhi', '_lPt', '_mt', 'hJet_e', 'hJet_Soft',
            'hJet_JECUnc', 'hJet_phi', 'hJet_pt', '_closeJetPtAll']
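
These act as substring filters: a column is excluded if its name contains any of the markers (the actual selection happens below in In [8]). A tiny illustration with a few column names from this dataset:

# illustration: a column is dropped if its name contains any marker substring
example_columns = ['_3dIPsig', '_lPt', '_isolation', '_isolationMC']
print([col for col in example_columns if not any(marker in col for marker in markers)])
# ['_3dIPsig', '_isolation']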

In [5]:
flattened_columns = dict()
for column in all_columns:
    data = root_numpy.root2array(prompt_filename, treename=treename, branches=[column], stop=10)
    try:
        # array-valued branch: create one element expression per component
        for index in range(len(data[0][0])):
            flattened_columns["{column}[{index}]".format(column=column, index=index)] = column + str(index)
    except TypeError:
        # scalar branch: keep as-is
        flattened_columns[column] = column
# for some strange reason this column is also added
flattened_columns.pop('_PVerr[3]')


Out[5]:
'_PVerr3'
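
This works because root_numpy accepts element expressions such as '_PVerr[0]' as branch names, so flattened_columns maps each expression to a flat column name. A quick sanity check (assuming the same file is available):

# sanity check: element expressions are valid branch names for root2array
check = root_numpy.root2array(prompt_filename, treename=treename,
                              branches=['_PVerr[0]', '_PVerr[1]'], stop=5)
print(check.dtype.names)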

Reading the data and splitting it into train/test parts

signal: _origin = 0 (prompt),
bck: _origin = 1 (labels = 1 - _origin, so signal events get label 1)


In [6]:
n_events = 1000000

In [7]:
read_columns = sorted(flattened_columns.keys())

data = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=read_columns, stop=n_events))
data['ptRatio'] = data.eval('_lPt / _closeJetPtAll')
# Three IP features contain events with huge values that are not convertible to float32
data = numpy.clip(data, -1e20, 1e20)
labels = 1 - data._origin

trainX, testX, trainY, testY = train_test_split(data, labels, train_size=0.2)
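
With train_size=0.2 only 20% of the events go into the training part. A quick look at the split and the label balance (a sanity check added here, output not shown in the original run):

# sizes of the two parts and the label balance (1 = prompt signal)
print(len(trainX), len(testX))
print(labels.value_counts())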

Features used in training


In [8]:
train_variables = [col for col in data.columns if not any(marker in col for marker in markers)]
sorted(train_variables)


Out[8]:
['_3dIP',
 '_3dIPerr',
 '_3dIPsig',
 '_PVchi2',
 '_PVerr[0]',
 '_PVerr[1]',
 '_PVerr[2]',
 '_closeJetAngAll',
 '_closeJetCSVAll',
 '_closeJetEtaAll',
 '_closeJetPhiAll',
 '_flavors',
 '_ipPV',
 '_ipPVerr',
 '_ipZPV',
 '_ipZPVerr',
 '_isolation',
 '_isolationComponents[0]',
 '_isolationComponents[1]',
 '_isolationComponents[2]',
 '_isolationComponents[3]',
 '_ptRelAll',
 'hJet_cef',
 'hJet_nconstituents',
 'hJet_vtx3dL',
 'hJet_vtx3deL',
 'hJet_vtxMass',
 'hJet_vtxPt',
 'ptRatio']

Training the classifier


In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier

In [10]:
classifiers = hep_ml.ClassifiersDict()
base_gb = GradientBoostingClassifier(subsample=0.2, n_estimators=200, max_depth=8, min_samples_split=300, 
                                     max_features=8, learning_rate=0.05)

classifiers['gb'] = HidingClassifier(base_estimator=base_gb, train_variables=train_variables)
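
HidingClassifier simply restricts the base estimator to train_variables, which lets us pass the full DataFrame to fit and predict. Without hep_ml it would look roughly like this (a sketch, skipping the reporting machinery):

# rough plain-sklearn equivalent: select the training columns by hand
plain_gb = GradientBoostingClassifier(subsample=0.2, n_estimators=200, max_depth=8,
                                      min_samples_split=300, max_features=8,
                                      learning_rate=0.05)
plain_gb.fit(trainX[train_variables], trainY)
proba = plain_gb.predict_proba(testX[train_variables])[:, 1]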

In [11]:
classifiers.fit(trainX, trainY)


Classifier           gb is learnt in 109.62 seconds
Totally spent 109.62 seconds on training
Out[11]:
ClassifiersDict([('gb', HidingClassifier(base_estimator=GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=8, max_features=8, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=300, n_estimators=200,
              random_state=None, subsample=0.2, verbose=0,
              warm_start=False),
         train_variables=['_3dIP', '_3dIPerr', '_3dIPsig', '_PVchi2', '_PVerr[0]', '_PVerr[1]', '_PVerr[2]', '_closeJetAngAll', '_closeJetCSVAll', '_closeJetEtaAll', '_closeJetPhiAll', '_flavors', '_ipPV', '_ipPVerr', '_ipZPV', '_ipZPVerr', '_isolation', '_isolationComponents[0]', '_isolationComponents[1]', '_isolationComponents[2]', '_isolationComponents[3]', '_ptRelAll', 'hJet_cef', 'hJet_nconstituents', 'hJet_vtx3dL', 'hJet_vtx3deL', 'hJet_vtxMass', 'hJet_vtxPt', 'ptRatio']))])

In [15]:
for flavour, flavour_name in flavours.items():
    mask = testX._flavors == flavour
    predictions = classifiers.test_on(testX[mask], testY[mask]).roc()
    # compare with the cut-based 'tight' selection as a single working point
    fpr, tpr, _ = roc_curve(testY[mask], testX._istight[mask])
    plot(tpr[1:2], 1 - fpr[1:2], 'o', label='tight')
    title(flavour_name)
    grid()
    legend()
    show()



In [14]:
predictions.learning_curves()


Out[14]:
<hep_ml.reports.Predictions at 0x6e2dd50>
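
learning_curves plots the test quality after each boosting stage. The same can be done by hand with sklearn's staged_predict_proba (a sketch; assumes the labels are binary and that HidingClassifier passes the columns in train_variables order):

# by-hand learning curve: test ROC AUC after each boosting stage
from sklearn.metrics import roc_auc_score
gb_est = classifiers['gb']._trained_estimator
staged_aucs = [roc_auc_score(testY, staged[:, 1])
               for staged in gb_est.staged_predict_proba(testX[train_variables])]
plot(staged_aucs)
title('test ROC AUC vs number of trees')
show()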

Feature importances


In [18]:
gb = classifiers['gb']._trained_estimator
feature_imps = pandas.Series(data=gb.feature_importances_, index=train_variables)
feature_imps = feature_imps.sort_values(ascending=False)
feature_imps


Out[18]:
_3dIPsig                   0.122946
_closeJetAngAll            0.077345
ptRatio                    0.067477
_ptRelAll                  0.066567
_isolationComponents[3]    0.062813
_3dIP                      0.058037
_ipPV                      0.055088
_closeJetCSVAll            0.037188
_3dIPerr                   0.036935
_closeJetEtaAll            0.034482
hJet_cef                   0.033311
_PVchi2                    0.031673
_ipPVerr                   0.030341
_PVerr[2]                  0.030043
_PVerr[0]                  0.028383
_ipZPVerr                  0.028304
_PVerr[1]                  0.027201
_closeJetPhiAll            0.026685
_ipZPV                     0.025860
_isolation                 0.021323
_isolationComponents[2]    0.021231
hJet_nconstituents         0.020410
_isolationComponents[1]    0.016525
_isolationComponents[0]    0.015663
_flavors                   0.008665
hJet_vtxPt                 0.004793
hJet_vtxMass               0.004792
hJet_vtx3dL                0.003007
hJet_vtx3deL               0.002914
dtype: float64
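
The same table is easier to scan as a plot; a minimal sketch using the pandas plotting API:

# horizontal bar chart of the sorted importances
feature_imps.plot(kind='barh', figsize=(6, 10))
title('gb feature importances')
show()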