In this notebook, we try simple classifiers.

The aim is to distinguish prompt leptons (those coming from a W boson) from the others.

The data are decays from $t \bar{t}$ production.


In [1]:
# %pylab inline provides numpy and the plotting helpers (figure, xlim, legend, title, ...) used below
%pylab inline
import root_numpy
import hep_ml
import pandas
from sklearn.metrics import roc_curve
from hep_ml import reports
from hep_ml.commonutils import train_test_split, print_header
from hep_ml.reports import plot_roc
from hep_ml.rootutilities import list_flat_branches, tree2pandas
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
# 13 TeV files (new)
folder = '/mnt/w76/notebook/datasets/fromLesya/ver5/'
prompt_filename = folder + 'tt13_prompt.root'
fakes_filename  = folder + 'tt13_fake.root'

treename = "FakeElectrons/fakeTree"

Correspondence of the integer codes used in the data:


In [3]:
flavours = {0: 'electrons', 1: 'muons'}
origins = {1: 'b', 2: 'c', 3: 'uds'} # and 0: prompt
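
A minimal sketch (hypothetical helper columns, not used anywhere below) of how these codes can be turned into readable labels once the data has been read into a DataFrame:

# illustrative only: map integer codes to names (assumes integer-typed columns;
# cast with .astype(int) first if they come out as floats)
data['flavour_name'] = data['_flavors'].map(flavours)
data['origin_name'] = data['_originReduced'].map(origins).fillna('prompt')  # code 0 means prompt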

Training variables


In [4]:
train_variables = [
    '_3dIP',
    '_3dIPerr',
    '_3dIPsig',
    '_PVchi2',
    '_PVerr[0]',
    '_PVerr[1]',
    '_PVerr[2]',
    '_closeJetAngAll',
    '_closeJetCSVAll',
    '_closeJetEtaAll',
    '_closeJetPhiAll',
    '_flavors',
    '_ipPV',
    '_ipPVerr',
    '_ipZPV',
    '_ipZPVerr',
    '_isolation',
    '_isolationComponents[0]',
    '_isolationComponents[1]',
    '_isolationComponents[2]',
    '_isolationComponents[3]',
    '_ptRelAll',
    'hJet_cef',
    'hJet_nconstituents',
    'hJet_vtx3dL',
    'hJet_vtx3deL',
    'hJet_vtxMass',
    'hJet_vtxNt',
    '_lPt / _closeJetPtAll'
]
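
The last entry, _lPt / _closeJetPtAll, is an expression rather than a plain branch; root_numpy can evaluate such expressions when reading the tree. If that were not available, the same column could be derived by hand; a pandas sketch (the clip against zero is an assumption added here to avoid division by zero):

# hypothetical equivalent of the expression branch, computed in pandas
data['_lPt / _closeJetPtAll'] = data['_lPt'] / data['_closeJetPtAll'].clip(lower=1e-6)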

Columns to read from file


In [5]:
flat_columns = list_flat_branches(prompt_filename, treename=treename, use_dtype=False)
# This column is added for some strange reason
flat_columns.remove('_PVerr[3]')
read_columns = set(flat_columns + train_variables)
def isneeded(column):
    return all([marker not in column for marker in ['_lep', '_jet', '_bTagged']])
read_columns = sorted([col for col in read_columns if isneeded(col)])

read_columns


Out[5]:
['HT',
 '_3dIP',
 '_3dIPerr',
 '_3dIPsig',
 '_PVchi2',
 '_PVerr[0]',
 '_PVerr[1]',
 '_PVerr[2]',
 '_charges',
 '_closeJetAngAll',
 '_closeJetCSVAll',
 '_closeJetEtaAll',
 '_closeJetEtaAllMC',
 '_closeJetPhiAll',
 '_closeJetPhiAllMC',
 '_closeJetPtAll',
 '_closeJetPtAllMC',
 '_closeJetPtAllstatus',
 '_csExercise',
 '_csJetIndex',
 '_csv[0]',
 '_csv[1]',
 '_csv[2]',
 '_csv[3]',
 '_eventNb',
 '_flavors',
 '_hardIeta[0]',
 '_hardIeta[1]',
 '_hardIeta[2]',
 '_hardIeta[3]',
 '_hardIpdg[0]',
 '_hardIpdg[1]',
 '_hardIpdg[2]',
 '_hardIpdg[3]',
 '_hardIpdg[4]',
 '_hardIpdg[5]',
 '_hardIpdg[6]',
 '_hardIpdg[7]',
 '_hardIphi[0]',
 '_hardIphi[1]',
 '_hardIphi[2]',
 '_hardIphi[3]',
 '_hardIpt[0]',
 '_hardIpt[1]',
 '_hardIpt[2]',
 '_hardIpt[3]',
 '_ipPV',
 '_ipPVerr',
 '_ipPVmc',
 '_ipZPV',
 '_ipZPVerr',
 '_isloose',
 '_isolation',
 '_isolationComponents[0]',
 '_isolationComponents[1]',
 '_isolationComponents[2]',
 '_isolationComponents[3]',
 '_isolationMC[0]',
 '_isolationMC[1]',
 '_isolationMC[2]',
 '_isolationMC[3]',
 '_istight',
 '_lE',
 '_lEta',
 '_lPhi',
 '_lPt',
 '_lPt / _closeJetPtAll',
 '_lumiBlock',
 '_met',
 '_met_phi',
 '_mometa',
 '_mompdg',
 '_momphi',
 '_mompt',
 '_mt',
 '_nHard',
 '_n_Jets',
 '_n_PV',
 '_n_bJets',
 '_origin',
 '_originReduced',
 '_partonIdMatched',
 '_ptRelAll',
 '_runNb',
 '_sameParton',
 'hJet_JECUnc',
 'hJet_SoftLeptId95',
 'hJet_SoftLeptIdlooseMu',
 'hJet_SoftLeptPt',
 'hJet_SoftLeptdR',
 'hJet_SoftLeptptRel',
 'hJet_cef',
 'hJet_e',
 'hJet_eta',
 'hJet_genPt',
 'hJet_nconstituents',
 'hJet_phi',
 'hJet_pt',
 'hJet_ptLeadTrack',
 'hJet_ptRaw',
 'hJet_vtx3dL',
 'hJet_vtx3deL',
 'hJet_vtxMass',
 'hJet_vtxNt']

Reading the data


In [6]:
# number of events we will use (first events from file)
n_train_events = 200000

In [7]:
def read_events(columns, start=None, stop=None):
    prompt_data = root_numpy.root2array(prompt_filename, treename=treename, branches=columns, start=start, stop=stop)
    prompt_data = pandas.DataFrame(prompt_data)

    fake_data = root_numpy.root2array(fakes_filename, treename=treename, branches=columns, start=start, stop=stop)
    fake_data = pandas.DataFrame(fake_data)
    
    data = pandas.concat([prompt_data, fake_data], ignore_index=True)
    # Three IP features contain events with huge values that are not convertible to float32
    data = numpy.clip(data, -1e20, 1e20)
    labels = numpy.array([1] * len(prompt_data) + [0] * len(fake_data))
    
    return data, labels

Reading the events and splitting them into train and test parts


In [8]:
data, labels = read_events(read_columns, stop=n_train_events)
trainX, testX, trainY, testY = train_test_split(data, labels, train_size=0.5)

In [9]:
len(trainX), len(testX)


Out[9]:
(200000, 200000)

Plotting the distributions


In [10]:
figure(figsize=[18, 100])
reports.plot_features_pdf(data, labels, n_bins=30, sig_label='prompt', bck_label='fake')


Training classifiers


In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier
from hep_ml import ugradientboosting as ugb
from hep_ml.experiments import metaclassifiers

In [12]:
# base_gb = GradientBoostingClassifier(subsample=0.3, n_estimators=100, max_depth=8, min_samples_split=300, 
#                                      max_features=8, learning_rate=0.03)
# base_splitter = metaclassifiers.DumbSplitter(base_estimator=base_gb, feature_name='_flavors')
# classifiers['gb'] = HidingClassifier(base_estimator=base_splitter, train_variables=train_variables)

In [13]:
classifiers = hep_ml.ClassifiersDict()

fl_loss = ugb.BinFlatnessLossFunction(['_mt'], uniform_label=0, ada_coefficient=0.05)
base_gb = ugb.uGradientBoostingClassifier(loss=fl_loss, n_estimators=100, subsample=0.15, max_depth=8, min_samples_split=300, 
                                          max_features=8, learning_rate=0.03, train_variables=train_variables)
classifiers['gb'] = metaclassifiers.DumbSplitter(base_estimator=base_gb, feature_name='_flavors')
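
The BinFlatnessLossFunction penalises non-uniformity of the selection efficiency for the uniform_label=0 class (fakes) across bins of _mt, on top of the usual exponential boosting loss. Roughly, following the uGB+FL convention (an approximation of what is optimised, not the exact implementation):

$$\mathcal{L} \;\approx\; \mathcal{L}_{\text{flatness}} \;+\; \alpha \, \mathcal{L}_{\text{AdaLoss}},$$

where $\alpha$ is the ada_coefficient (0.05 here). DumbSplitter then simply trains one such model per value of _flavors, i.e. electrons and muons separately.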

In [14]:
classifiers.fit(trainX, trainY)
predictions = classifiers.test_on(testX, testY)


Classifier           gb is learnt in 41.74 seconds
Totally spent 41.74 seconds on training

Feature importances


In [15]:
# splitter = classifiers['gb']._trained_estimator
# for flavour, flavour_name in flavours.items(): 
#     gb = splitter.classifiers[flavour]
#     feature_imps = pandas.Series(data=gb.feature_importances_, index=train_variables)
#     feature_imps.sort(ascending=False)
#     print_header(flavour_name)
#     print feature_imps

ROC curves by flavour (all leptons), with breakdown by origin


In [16]:
gb_preds = classifiers['gb'].predict_proba(testX)[:, 1]
for flavour, flavour_name in flavours.items():
    figure(figsize=[13, 9])
    xlim(0.5, 1.), ylim(0.5, 1.), grid()    
    mask = numpy.array(testX._flavors == flavour)
    plot_roc(testY, gb_preds, classifier_name='all', mask=mask)
    plot_roc(testY, testX._istight, classifier_name='tight', mask=mask, is_cut=True)
    for origin, origin_name in origins.items():
        mask = (testX._flavors == flavour) & ( (testX._originReduced == 0) | (testX._originReduced == origin) )
        mask = numpy.array(mask)
        plot_roc(testY, gb_preds, classifier_name=origin_name, mask=mask)
    legend(loc='lower left')


/mnt/w76/venv/py27/local/lib/python2.7/site-packages/numpy/lib/function_base.py:1114: DeprecationWarning: numpy boolean subtract (the binary `-` operator) is deprecated, use the bitwise_xor (the `^` operator) or the logical_xor function instead.
  return a[slice1]-a[slice2]

ROC for electrons / muons that passed the tight cut, plus separately for each origin


In [17]:
for flavour, flavour_name in flavours.items():
    figure(figsize=[13, 9]), grid()
    
    mask = (testX._flavors == flavour) & (testX._istight == 1)
    plot_roc(testY, gb_preds, classifier_name='all', mask=numpy.array(mask))
    for origin, origin_name in origins.items():
        mask = (testX._flavors == flavour) & (testX._istight == 1) & numpy.in1d(testX._originReduced, [0, origin]) 
        plot_roc(testY, gb_preds, classifier_name=origin_name, mask=numpy.array(mask))
        
    legend(loc='lower left')
    title(flavour_name + ' (istight == 1)')


Efficiencies for leptons


In [18]:
# added _lPt < 250, otherwise many empty bins
# for flavour, flavour_name in flavours.items():
#     mask = (testX._flavors == flavour) & (testX._lPt < 250)
#     mask = numpy.array(mask)
#     pred = classifiers.test_on(testX[mask], testY[mask])
#     pred.hist(['_lPt']), title(flavour_name)
#     pred.efficiency(['_lPt'], label=1), title(flavour_name + ' Signal efficiency')
#     pred.efficiency(['_lPt'], label=0), title(flavour_name + ' Background rejection')

ROC for different _n_bJets


In [19]:
for flavour, flavour_name in flavours.items():
    figure(figsize=[14, 10])
    for min_jets in [[0], [1], [2], range(3, 10)]:
        mask = numpy.array((testX._flavors == flavour) & (numpy.in1d(testX._n_bJets, min_jets)))
        plot_roc(testY, gb_preds, classifier_name="n_bjets in {}".format(min_jets), mask=mask)
        xlim(0.7, 1.)
        ylim(0.7, 1.)
    grid()
    title(flavour_name)
    legend(loc='lower left')


Selecting a cut on the classifier output

separately for electrons and muons


In [20]:
def compute_cuts(filename, treename, train_variables, classifier, left_events_per_flavour=5000):
    data = tree2pandas(filename, treename, branches=train_variables, start=n_train_events)
    data_flavours = tree2pandas(filename, treename=treename, branches=['_flavors'], start=n_train_events)['_flavors']
    predictions = classifier.predict_proba(data)[:, 1]
    result = {}
    for flavour, flavour_name in flavours.items():
        mask = numpy.array(data_flavours == flavour)
        masked_prediction = predictions[mask]
        cut = numpy.sort(masked_prediction)[- left_events_per_flavour - 1]
        print flavour_name, cut 
        print 'passed', numpy.sum(masked_prediction > cut),
        print 'not passed', numpy.sum(masked_prediction <= cut)
        result[flavour] = cut
    return result
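
The threshold is the (left_events_per_flavour + 1)-th largest prediction, so (ties aside) exactly left_events_per_flavour fake leptons of each flavour end up above the cut. A toy check of this rule with made-up numbers:

# hypothetical values, just to illustrate the indexing above
preds = numpy.array([0.1, 0.4, 0.2, 0.5, 0.3])
cut = numpy.sort(preds)[-2 - 1]    # keep the top 2 predictions, cut = 0.3
print numpy.sum(preds > cut)       # prints 2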

In [21]:
cuts = compute_cuts(fakes_filename, treename, train_variables, classifiers['gb'])


electrons 0.50403444538
passed 5000 not passed 68825
muons 0.502879806193
passed 5000 not passed 287253

In [22]:
cuts


Out[22]:
{0: 0.50403444538026243, 1: 0.5028798061926717}

How many prompt events pass the cut


In [23]:
prompt_train_vars = tree2pandas(prompt_filename, treename=treename, branches=train_variables, start=n_train_events)
prompt_predictions = classifiers['gb'].predict_proba(prompt_train_vars)[:, 1]
prompt_flavours = tree2pandas(prompt_filename, treename=treename, branches=['_flavors'], start=n_train_events)['_flavors']

In [24]:
for flavour, flavour_name in flavours.items():
    print 'prompt passed,', flavour_name
    mask = numpy.array(prompt_flavours == flavour)
    print numpy.mean(prompt_predictions[mask] > cuts[flavour])


prompt passed, electrons
0.940074395025
prompt passed, muons
0.945268814459

Comparing fakes that passed the cut with those that didn't


In [25]:
fake_test_data = tree2pandas(fakes_filename, treename=treename, branches=read_columns)

In [26]:
fake_predictions = classifiers['gb'].predict_proba(fake_test_data)[:, 1]

In [27]:
def compute_passed(predictions, flavours, cuts):
    result = (flavours == 1) & (predictions > cuts[1])
    result |= (flavours == 0) & (predictions > cuts[0])
    return result
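
The function above hardcodes the two flavour codes; an equivalent sketch driven directly by the cuts dictionary (same logic, but it returns a plain boolean array rather than a pandas Series):

def compute_passed_generic(predictions, flavours, cuts):
    # OR together the per-flavour selections, one entry of the cuts dict at a time
    result = numpy.zeros(len(predictions), dtype=bool)
    for flavour, cut in cuts.items():
        result |= numpy.array(flavours == flavour) & (predictions > cut)
    return result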

In [28]:
fake_passed = numpy.array(compute_passed(fake_predictions, fake_test_data._flavors, cuts=cuts))

In [29]:
figure(figsize=[18, 80])
reports.plot_features_pdf(fake_test_data, fake_passed, sig_label='passed', bck_label='not passed')


Comparing fakes that passed the cut with those that didn't, for _isolation < 1


In [30]:
mask = fake_test_data._isolation < 1
figure(figsize=[18, 80])
reports.plot_features_pdf(fake_test_data, fake_passed, mask=mask, sig_label='passed', bck_label='not passed')


Looking at efficiency


In [31]:
for feature in ['_met', '_mt', 'HT', '_n_Jets', '_n_bJets', '_lPt']:
    predictions.rcp(feature, label=0, ignored_sidebands=0.001)
    title('Efficiency for ' + feature + ', Fake')

    predictions.rcp(feature, label=1, ignored_sidebands=0.001)
    title('Rejection for ' + feature + ', Prompt')


Looking at efficiency for _isolation < 1


In [32]:
mask = numpy.array(testX._isolation < 1)
for feature in ['_met', '_mt', 'HT', '_n_Jets', '_n_bJets', '_lPt']:
    predictions.rcp(feature, label=0, ignored_sidebands=0.001, mask=mask)
    title('Efficiency for ' + feature + ', Fake')

    predictions.rcp(feature, label=1, ignored_sidebands=0.001, mask=mask)
    title('Rejection for ' + feature + ', Prompt')


Determining the features most correlated with background rejection

Current hypothesis: the major dependence is on the ratio _lPt / _closeJetPtAll


In [107]:
main_feature = '_lPt / _closeJetPtAll'
parts = 5
bin_limits = numpy.percentile( fake_test_data[main_feature], numpy.linspace(0, 100, parts + 1)[1:-1]) 
splitter_feature = numpy.searchsorted(bin_limits, fake_test_data[main_feature])
fake_test_data['splitter'] = splitter_feature
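
The percentile + searchsorted construction above assigns each event a quantile-bin index (0 to parts-1) in the main feature. The same binning can be obtained more directly with pandas; a sketch (equivalent up to how events exactly on a bin edge are assigned):

# quantile binning via pandas.qcut; labels=False returns the integer bin index
splitter_alt = pandas.qcut(fake_test_data[main_feature], parts, labels=False)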

In [132]:
base_forest = HidingClassifier(read_columns, RandomForestClassifier(n_estimators=30, max_depth=3, min_samples_leaf=50, max_features=10))
fake_clf = metaclassifiers.DumbSplitter(feature_name='splitter', base_estimator=base_forest)
fake_clf.fit(fake_test_data, fake_passed)


Out[132]:
DumbSplitter(base_estimator=HidingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=3, max_features=10,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=50,
            min_samples_split=2, n_estimators=30, n_jobs...pt', 'hJet_ptLeadTrack', 'hJet_ptRaw', 'hJet_vtx3dL', 'hJet_vtx3deL', 'hJet_vtxMass', 'hJet_vtxNt']),
       feature_name='splitter')

In [133]:
feature_importances = sum(cl._trained_estimator.feature_importances_  for cl in fake_clf.classifiers.values()) 
x = pandas.Series(data=feature_importances, index=read_columns)
x.sort(ascending=False)
x[:40]


Out[133]:
hJet_SoftLeptptRel         0.511936
_3dIPsig                   0.502259
_istight                   0.465176
_ptRelAll                  0.381617
_isolation                 0.374023
_isolationComponents[0]    0.373593
_3dIP                      0.361758
_ipPV                      0.258294
_closeJetAngAll            0.235199
hJet_SoftLeptdR            0.203992
_ipPVmc                    0.144796
_closeJetCSVAll            0.139400
hJet_vtx3dL                0.073490
_isolationComponents[2]    0.071567
hJet_vtxMass               0.069872
hJet_nconstituents         0.069393
_lPt / _closeJetPtAll      0.064117
hJet_pt                    0.057968
hJet_vtx3deL               0.052519
hJet_cef                   0.051076
_lPt                       0.042377
_closeJetPtAllMC           0.041427
_closeJetPtAll             0.041121
_origin                    0.033740
hJet_ptRaw                 0.033384
_lE                        0.029531
hJet_JECUnc                0.027745
hJet_SoftLeptPt            0.026490
_isolationMC[3]            0.025534
_originReduced             0.025394
_ipZPV                     0.023055
hJet_genPt                 0.021107
_isolationMC[0]            0.020090
hJet_vtxNt                 0.017641
_3dIPerr                   0.013814
_isolationComponents[3]    0.013430
HT                         0.011848
_mompt                     0.009683
_isolationMC[1]            0.009075
_PVerr[1]                  0.008922
dtype: float64

The same is done for the 40000 highest-ranked fake leptons (20000 per flavour, selected with a looser cut),

comparing isolation > 0.15 with isolation < 0.15


In [115]:
fake_loose_cuts = compute_cuts(fakes_filename, treename, train_variables, classifiers['gb'], left_events_per_flavour=20000)
fakes_flavours = fake_test_data._flavors   # flavour codes of the fake leptons
fake_passed_loose_cut = compute_passed(fake_predictions, fakes_flavours, fake_loose_cuts).values


electrons 0.479209886156
passed 20000 not passed 53825
muons 0.479214174501
passed 20000 not passed 272253

In [126]:
fake_test_data_passed_loose = fake_test_data.ix[fake_passed_loose_cut, :].copy()
fakes_flavours_passed_loose = fakes_flavours[fake_passed_loose_cut]
isolations_passed_loose = fake_test_data_passed_loose._isolation > 0.15
_ = hist(isolations_passed_loose.values)



In [127]:
main_feature = '_lPt / _closeJetPtAll'
parts = 5
bin_limits = numpy.percentile(fake_test_data_passed_loose[main_feature], numpy.linspace(0, 100, parts + 1)[1:-1]) 
splitter_feature_passed_loose = numpy.searchsorted(bin_limits, fake_test_data_passed_loose[main_feature])
fake_test_data_passed_loose['splitter'] = splitter_feature_passed_loose

In [137]:
columns_without_isolation = list(set(read_columns) - set(['_isolation']))
base_forest = HidingClassifier(columns_without_isolation, RandomForestClassifier(n_estimators=30, max_depth=3, min_samples_leaf=50, max_features=10))
fake_clf = metaclassifiers.DumbSplitter(feature_name='splitter', base_estimator=base_forest)
fake_clf.fit(fake_test_data_passed_loose, isolations_passed_loose)


Out[137]:
DumbSplitter(base_estimator=HidingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=3, max_features=10,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=50,
            min_samples_split=2, n_estimators=30, n_jobs...rdIpt[1]', '_met_phi', '_closeJetPtAllMC', '_3dIPsig', '_csv[3]', '_hardIpdg[5]', '_closeJetPtAll']),
       feature_name='splitter')

In [139]:
feature_importances = sum(cl._trained_estimator.feature_importances_  for cl in fake_clf.classifiers.values()) 
x = pandas.Series(data=feature_importances, index=columns_without_isolation)
x.sort(ascending=False)
x[:40]


Out[139]:
_istight                   1.167863
_isolationComponents[0]    0.802970
_isolationComponents[3]    0.417814
_isolationComponents[2]    0.381333
_3dIP                      0.379600
_ipPV                      0.328047
_3dIPsig                   0.312408
_ipPVmc                    0.090935
_closeJetCSVAll            0.089105
hJet_SoftLeptptRel         0.083299
_closeJetAngAll            0.078341
_lPt / _closeJetPtAll      0.077436
hJet_JECUnc                0.055534
hJet_vtx3deL               0.050712
hJet_vtx3dL                0.050277
hJet_pt                    0.043474
hJet_cef                   0.038949
_isolationComponents[1]    0.034132
_closeJetPtAll             0.033748
hJet_SoftLeptdR            0.032239
hJet_ptRaw                 0.032078
_PVerr[0]                  0.031403
_PVerr[2]                  0.028846
hJet_nconstituents         0.028641
hJet_SoftLeptPt            0.025635
_ipZPV                     0.024042
_ptRelAll                  0.022223
_flavors                   0.020593
_PVchi2                    0.019274
_closeJetPtAllMC           0.018621
_PVerr[1]                  0.018171
hJet_genPt                 0.016671
_origin                    0.015718
_isolationMC[3]            0.013407
_closeJetEtaAllMC          0.013084
_lPt                       0.012719
hJet_vtxMass               0.012640
_isolationMC[0]            0.011451
_mompt                     0.009522
hJet_vtxNt                 0.009127
dtype: float64