In this notebook, we select the features across which the fake rate should be computed


In [1]:
# %pylab inline provides numpy and the bare plotting names (figure, hist, legend, ...) used throughout
%pylab inline
import root_numpy
import hep_ml
import pandas
from sklearn.metrics import roc_curve
from hep_ml import reports
from hep_ml.commonutils import train_test_split, print_header
from hep_ml.reports import plot_roc
from hep_ml.rootutilities import list_flat_branches, tree2pandas, print_root_structure
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
# 8 TeV files 
folder = '/mnt/w76/notebook/datasets/fromLesya/ver3/'
prompt_filename = folder + 'tt_prompts.root'
fakes_filename  = folder + 'tt_fakes.root'

treename = "fakeTree"

Encoding of lepton flavours and fake origins in the data:


In [3]:
flavours = {0: 'electrons', 1: 'muons'}
origins = {1: 'b', 2: 'c', 3: 'uds'} # and 0: prompt

Training variables


In [4]:
train_variables = [
    '_3dIP',
    '_3dIPerr',
    '_3dIPsig',
    '_PVchi2',
    '_PVerr[0]',
    '_PVerr[1]',
    '_PVerr[2]',
    '_closeJetAngAll',
    '_closeJetCSVAll',
    '_closeJetEtaAll',
    '_closeJetPhiAll',
    '_flavors',
    '_ipPV',
    '_ipPVerr',
    '_ipZPV',
    '_ipZPVerr',
    '_isolation',
    '_isolationComponents[0]',
    '_isolationComponents[1]',
    '_isolationComponents[2]',
    '_isolationComponents[3]',
    '_ptRelAll',
#     'hJet_cef',
#     'hJet_nconstituents',
#     'hJet_vtx3dL',
#     'hJet_vtx3deL',
#     'hJet_vtxMass',
#     'hJet_vtxPt',
    '_lPt / _closeJetPtAll'  # an expression, not a branch: evaluated on the fly (see the note after this cell)
]

uniform_variables = ['_mt']

read_columns = train_variables + uniform_variables + ['_istight', '_originReduced' ]
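
The last training variable is an arithmetic expression rather than a plain branch; root_numpy evaluates such expressions while reading the tree. A minimal sanity check (a sketch, just peeking at the first few events):

root_numpy.root2array(prompt_filename, treename=treename,
                      branches=['_lPt / _closeJetPtAll'], stop=5)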

Reading the data


In [5]:
# number of events we will use (first events from file)
n_train_events = 100000
n_test_events  = 1000000
n_used_events  = n_train_events + n_test_events

In [6]:
def read_events(columns, start=None, stop=None):
    prompt_data = root_numpy.root2array(prompt_filename, treename=treename, branches=columns, start=start, stop=stop)
    prompt_data = pandas.DataFrame(prompt_data)

    fake_data = root_numpy.root2array(fakes_filename, treename=treename, branches=columns, start=start, stop=stop)
    fake_data = pandas.DataFrame(fake_data)
    
    data = pandas.concat([prompt_data, fake_data], ignore_index=True)
    # Three IP features contain events with huge values that are not convertible to float32, so we clip them
    data = numpy.clip(data, -1e20, 1e20)
    labels = numpy.array([1] * len(prompt_data) + [0] * len(fake_data))
    
    return data, labels

In [7]:
trainX, trainY = read_events(read_columns, stop=n_train_events)
testX, testY = read_events(read_columns, start=n_train_events, stop=n_used_events)

In [8]:
len(trainX), len(testX)


Out[8]:
(200000, 2000000)

Plotting the distributions


In [10]:
figure(figsize=[18, 40])
reports.plot_features_pdf(trainX, trainY, n_bins=30, sig_label='prompt', bck_label='fake')


Training classifiers


In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier
from hep_ml import ugradientboosting as ugb
from hep_ml.experiments import metaclassifiers

In [12]:
classifiers = hep_ml.ClassifiersDict()
fl_loss = ugb.BinFlatnessLossFunction(uniform_variables, uniform_label=0, ada_coefficient=0.05)
base_gb = ugb.uGradientBoostingClassifier(loss=fl_loss, n_estimators=100, subsample=0.15, max_depth=8, min_samples_split=300, 
                                          max_features=8, learning_rate=0.03, train_variables=train_variables)
classifiers['gb'] = metaclassifiers.DumbSplitter(base_estimator=base_gb, feature_name='_flavors')
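
As used here, DumbSplitter presumably trains one copy of the base estimator per value of the split feature, so electrons and muons each get their own flatness-boosted classifier; BinFlatnessLossFunction penalizes non-uniformity of the predictions along _mt for the class with uniform_label=0 (the fakes), with ada_coefficient weighting the usual boosting loss against the flatness penalty. A rough sketch of the split (assuming the base estimator is sklearn-clonable):

from sklearn.base import clone

models = {}
for flavour in flavours:
    subset = numpy.array(trainX['_flavors'] == flavour)
    models[flavour] = clone(base_gb).fit(trainX[subset], trainY[subset])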

In [13]:
classifiers.fit(trainX, trainY)
predictions = classifiers.test_on(testX, testY)


Classifier           gb is learnt in 44.22 seconds
Totally spent 44.22 seconds on training

By flavour


In [14]:
gb_preds = classifiers['gb'].predict_proba(testX)[:, 1]
for flavour, flavour_name in flavours.items():
    figure(figsize=[13, 9])
    xlim(0.5, 1.), ylim(0.5, 1.), grid()    
    mask = numpy.array(testX._flavors == flavour)
    plot_roc(testY, gb_preds, classifier_name='all', mask=mask)
    plot_roc(testY, testX._istight, classifier_name='tight', mask=mask, is_cut=True)
    for origin, origin_name in origins.items():
        mask = (testX._flavors == flavour) & ( (testX._originReduced == 0) | (testX._originReduced == origin) )
        mask = numpy.array(mask)
        plot_roc(testY, gb_preds, classifier_name=origin_name, mask=mask)
    legend(loc='lower left')


/mnt/w76/venv/py27/local/lib/python2.7/site-packages/numpy/lib/function_base.py:1114: DeprecationWarning: numpy boolean subtract (the binary `-` operator) is deprecated, use the bitwise_xor (the `^` operator) or the logical_xor function instead.
  return a[slice1]-a[slice2]

Selecting a cut on the classifier output

separately for electrons and muons


In [15]:
def compute_cuts(filename, treename, train_variables, classifier, left_events_per_flavour=5000):
    # stop at n_used_events so that data and data_flavours cover exactly the same events
    data = tree2pandas(filename, treename, branches=train_variables, start=n_train_events, stop=n_used_events)
    data_flavours = tree2pandas(filename, treename=treename, branches=['_flavors'], 
                                start=n_train_events, stop=n_used_events)['_flavors']
    predictions = classifier.predict_proba(data)[:, 1]
    result = {}
    for flavour, flavour_name in flavours.items():
        mask = numpy.array(data_flavours == flavour)
        masked_prediction = predictions[mask]
        cut = numpy.sort(masked_prediction)[- left_events_per_flavour - 1]
        print flavour_name, cut 
        print 'passed', numpy.sum(masked_prediction > cut),
        print 'not passed', numpy.sum(masked_prediction <= cut)
        result[flavour] = cut
    return result
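
The threshold is the (left_events_per_flavour + 1)-th largest prediction, so exactly left_events_per_flavour fakes per flavour end up above it. A toy check of that indexing:

preds = numpy.arange(10) / 10.    # 0.0, 0.1, ..., 0.9
cut = numpy.sort(preds)[-3 - 1]   # keep 3 events: cut is the 4th-largest value, 0.6
assert numpy.sum(preds > cut) == 3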

In [16]:
cuts = compute_cuts(fakes_filename, treename, train_variables, classifiers['gb'])


electrons 0.522692751064
passed 5000 not passed 158493
muons 0.515872535489
passed 5000 not passed 831507

In [17]:
cuts


Out[17]:
{0: 0.52269275106405844, 1: 0.51587253548911116}

How many prompt events pass the cut


In [18]:
prompt_train_vars = tree2pandas(prompt_filename, treename=treename, branches=train_variables, 
                                start=n_train_events, stop=n_used_events)
prompt_flavours = tree2pandas(prompt_filename, treename=treename, branches=['_flavors'], 
                              start=n_train_events, stop=n_used_events)['_flavors']

prompt_predictions = classifiers['gb'].predict_proba(prompt_train_vars)[:, 1]

In [19]:
for flavour, flavour_name in flavours.items():
    print 'prompt passed,', flavour_name
    mask = numpy.array(prompt_flavours == flavour)
    print numpy.mean(prompt_predictions[mask] > cuts[flavour])


prompt passed, electrons
0.899934773256
prompt passed, muons
0.90356761594

Comparing fakes that passed the cut with those that didn't


In [20]:
fake_test_data = tree2pandas(fakes_filename, treename=treename, branches=read_columns, start=n_train_events, stop=n_used_events)

In [21]:
fake_predictions = classifiers['gb'].predict_proba(fake_test_data)[:, 1]

In [22]:
def compute_passed(predictions, flavours, cuts):
    result = (flavours == 1) & (predictions > cuts[1])
    result |= (flavours == 0) & (predictions > cuts[0])
    return result
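
compute_passed hard-codes the two flavour codes; an equivalent flavour-generic sketch (a hypothetical helper, same logic for any keys in cuts):

def compute_passed_generic(predictions, flavors, cuts):
    result = numpy.zeros(len(predictions), dtype=bool)
    for flavour, cut in cuts.items():
        result |= (numpy.asarray(flavors) == flavour) & (predictions > cut)
    return result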

In [23]:
fake_passed = numpy.array(compute_passed(fake_predictions, fake_test_data._flavors, cuts=cuts))

In [24]:
figure(figsize=[18, 50])
reports.plot_features_pdf(fake_test_data, fake_passed, sig_label='passed', bck_label='not passed')


Comparing fakes that passed the cut with those that didn't, restricted to _isolation < 1


In [25]:
# mask = fake_test_data._isolation < 1
# figure(figsize=[18, 80])
# reports.plot_features_pdf(fake_test_data, fake_passed, mask=mask, sig_label='passed', bck_label='not passed')

Looking at different decays


In [26]:
check_folder = '/mnt/w76/notebook/datasets/fromLesya/ver6/8TeV/'
from os import listdir
files = listdir(check_folder)
check_params = {'stop': 10000, 
                'selection': '(_flavors == 0) || (_lPt > 20)', 
                'treename': 'FakeElectrons/fakeTree' }

Predicting fakes in the check samples


In [33]:
check_predictions = {}
check_passed = {}
check_flavors = {}
for name in files:
    data = tree2pandas(check_folder + name, branches=train_variables, **check_params)
    preds = classifiers['gb'].predict_proba(data)[:, 1]
    check_predictions[name] = preds
    check_passed[name] = compute_passed(preds, flavours=data._flavors, cuts=cuts)
    check_flavors[name] = data._flavors.values
    print name, ' avg passed:', check_passed[name].mean()


QCD.root  avg passed: 0.0654205607477
bb1a.root  avg passed: 0.0187095226673
tt1l.root  avg passed: 0.0148148148148
Wjets1.root  avg passed: 0.0402724904829
Wjets2.root  avg passed: 0.0363239380707
Wjets4.root  avg passed: 0.0248470239199
bb4a.root  avg passed: 0.0032154340836
bb3a.root  avg passed: 0.00434293067396
bb2a.root  avg passed: 0.00596957442711
Wjets3.root  avg passed: 0.028731199383

In [48]:
def plot_hist_and_fakerate(n_bins=10):
    for column in ['log(_lE)', '_closeJetPtAll', '_closeJetPtAllMC', 'log(_ipPV)' , 'log(abs(_ipZPV))', 'log(_lPt)']:
        column_data = {name: tree2pandas(check_folder + name, branches=[column], **check_params)[column].values 
                       for name in files}
        for flavour, flavour_name in flavours.items():
            print_header(column + " " + flavour_name)
            figure(figsize=[18, 8])
            subplot(121) 
            
            title('Distribution for ' + column + " " + flavour_name)
            conc = numpy.concatenate(column_data.values())
            # _, edges = numpy.histogram(conc, bins=n_bins)
            edges = numpy.percentile(conc, numpy.linspace(1, 99, n_bins + 1))
            for i, (name, data) in enumerate(column_data.items()):
                mask = (check_flavors[name] == flavour) * 1
                if i == 0:
                    hist(data, bins=edges, weights=mask, label=name, fill=False, edgecolor='b', normed=True)
                else:
                    hist(data, bins=edges, weights=mask, label=name, histtype='step', normed=True)

            legend(loc='best')
            subplot(122)
            title('Passed fraction for ' + column + " " + flavour_name)
            for i, (name, data) in enumerate(column_data.items()):
                mask = check_flavors[name] == flavour
                bins = numpy.searchsorted(edges, data) - 1
                bins = numpy.clip(bins, 0, n_bins - 1)
                passed = check_passed[name]
                mean_passed = numpy.bincount(bins, weights=passed & mask, minlength=n_bins) 
                mean_passed /= numpy.bincount(bins, weights=mask, minlength=n_bins) + 1e-10
                centers = (edges[:-1] + edges[1:]) / 2
                errorbar(centers, mean_passed, label=name)
            ylim(0, 1)
            legend(loc='best')
            show()
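
The passed fraction per bin above relies on the searchsorted + bincount trick; a toy illustration of just that piece, with three bins:

values = numpy.array([0.5, 1.5, 2.5, 2.6])
edges = numpy.array([0., 1., 2., 3.])
passed = numpy.array([True, False, True, True])
bins = numpy.clip(numpy.searchsorted(edges, values) - 1, 0, 2)
frac = numpy.bincount(bins, weights=passed, minlength=3)
frac /= numpy.bincount(bins, minlength=3) + 1e-10
# frac is now approximately [1.0, 0.0, 1.0]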

In [49]:
plot_hist_and_fakerate()


[Figures: for each of log(_lE), _closeJetPtAll, _closeJetPtAllMC, log(_ipPV), log(abs(_ipZPV)) and log(_lPt), a pair of plots (distribution per sample, passed fraction per sample), shown separately for electrons and muons.]

