In [1]:
# NOTE: numpy was used throughout but never imported -- added below.
# Plotting names (figure, title, legend, ...) assume the notebook was started
# with %pylab inline.
import numpy
import pandas
import root_numpy

import hep_ml
from hep_ml import reports
from hep_ml.commonutils import train_test_split, print_header
from hep_ml.reports import plot_roc
from hep_ml.rootutilities import list_flat_branches, tree2pandas

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_curve
In [2]:
# Input samples: 13 TeV ttbar files (new production) with prompt and fake leptons.
folder = '/mnt/w76/notebook/datasets/fromLesya/ver5/'
prompt_filename = '{}tt13_prompt.root'.format(folder)
fakes_filename = '{}tt13_fake.root'.format(folder)
treename = "FakeElectrons/fakeTree"
In [3]:
# Lepton flavour codes and reduced-origin codes of fake leptons
# (origin code 0 means "prompt" and is deliberately not listed here).
flavours = dict(enumerate(['electrons', 'muons']))
origins = dict(zip([1, 2, 3], ['b', 'c', 'uds']))  # and 0: prompt
In [4]:
# Features used for training. The last entry is a derived expression
# (evaluated by the ROOT reading layer), not a plain branch name.
train_variables = [line.strip() for line in """
_3dIP
_3dIPerr
_3dIPsig
_PVchi2
_PVerr[0]
_PVerr[1]
_PVerr[2]
_closeJetAngAll
_closeJetCSVAll
_closeJetEtaAll
_closeJetPhiAll
_flavors
_ipPV
_ipPVerr
_ipZPV
_ipZPVerr
_isolation
_isolationComponents[0]
_isolationComponents[1]
_isolationComponents[2]
_isolationComponents[3]
_ptRelAll
hJet_cef
hJet_nconstituents
hJet_vtx3dL
hJet_vtx3deL
hJet_vtxMass
hJet_vtxNt
_lPt / _closeJetPtAll
""".strip().splitlines()]
In [5]:
# Collect every flat branch of the tree, then restrict to the ones we need.
flat_columns = list_flat_branches(prompt_filename, treename=treename, use_dtype=False)
# '_PVerr[3]' shows up in the flat branch list for some strange reason -- drop it.
flat_columns.remove('_PVerr[3]')

def isneeded(column):
    """True if the branch is not a lepton/jet duplicate nor a b-tag flag."""
    return not any(marker in column for marker in ('_lep', '_jet', '_bTagged'))

read_columns = sorted(col for col in set(flat_columns + train_variables) if isneeded(col))
read_columns
Out[5]:
In [6]:
# number of events we will use (first events from file);
# events after this index are reserved for computing cuts on an independent sample
n_train_events = 200000
In [7]:
def read_events(columns, start=None, stop=None):
    """Read the requested branches from the prompt and fake files.

    Returns (data, labels): a concatenated DataFrame and an array of
    labels, 1 for prompt events and 0 for fake events.
    """
    frames = []
    for filename in [prompt_filename, fakes_filename]:
        raw = root_numpy.root2array(filename, treename=treename, branches=columns,
                                    start=start, stop=stop)
        frames.append(pandas.DataFrame(raw))
    prompt_data, fake_data = frames
    data = pandas.concat(frames, ignore_index=True)
    # Three IP features contain events with huge values not convertible to float32
    data = numpy.clip(data, -1e20, 1e20)
    labels = numpy.array([1] * len(prompt_data) + [0] * len(fake_data))
    return data, labels
In [8]:
# Read the first n_train_events of each file and split 50/50 into train/test.
data, labels = read_events(read_columns, stop=n_train_events)
trainX, testX, trainY, testY = train_test_split(data, labels, train_size=0.5)
In [9]:
len(trainX), len(testX)
Out[9]:
In [10]:
# Per-feature signal/background distributions for a first look at separation.
# NOTE(review): figure() is a pylab name -- assumes %pylab inline was run.
figure(figsize=[18, 100])
reports.plot_features_pdf(data, labels, n_bins=30, sig_label='prompt', bck_label='fake')
In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier
from hep_ml import ugradientboosting as ugb
from hep_ml.experiments import metaclassifiers
In [12]:
# base_gb = GradientBoostingClassifier(subsample=0.3, n_estimators=100, max_depth=8, min_samples_split=300,
# max_features=8, learning_rate=0.03)
# base_splitter = metaclassifiers.DumbSplitter(base_estimator=base_gb, feature_name='_flavors')
# classifiers['gb'] = HidingClassifier(base_estimator=base_splitter, train_variables=train_variables)
In [13]:
# One flatness-boosted gradient boosting per lepton flavour:
# DumbSplitter trains a separate copy of base_gb for each value of '_flavors'.
classifiers = hep_ml.ClassifiersDict()
# Flatness loss: keep efficiency for label 0 flat in bins of _mt.
fl_loss = ugb.BinFlatnessLossFunction(['_mt'], uniform_label=0, ada_coefficient=0.05)
base_gb = ugb.uGradientBoostingClassifier(loss=fl_loss, n_estimators=100, subsample=0.15, max_depth=8, min_samples_split=300,
                                          max_features=8, learning_rate=0.03, train_variables=train_variables)
classifiers['gb'] = metaclassifiers.DumbSplitter(base_estimator=base_gb, feature_name='_flavors')
In [14]:
# Fit all classifiers in the dict, then collect predictions on the held-out half.
classifiers.fit(trainX, trainY)
predictions = classifiers.test_on(testX, testY)
In [15]:
# splitter = classifiers['gb']._trained_estimator
# for flavour, flavour_name in flavours.items():
# gb = splitter.classifiers[flavour]
# feature_imps = pandas.Series(data=gb.feature_importances_, index=train_variables)
# feature_imps.sort(ascending=False)
# print_header(flavour_name)
# print feature_imps
In [16]:
# ROC curves per flavour: the full sample, the pre-existing 'tight' selection
# shown as a working point, and one curve per fake origin (each origin is
# compared against prompt leptons only, _originReduced == 0).
gb_preds = classifiers['gb'].predict_proba(testX)[:, 1]
for flavour, flavour_name in flavours.items():
    figure(figsize=[13, 9])
    xlim(0.5, 1.), ylim(0.5, 1.), grid()
    mask = numpy.array(testX._flavors == flavour)
    plot_roc(testY, gb_preds, classifier_name='all', mask=mask)
    plot_roc(testY, testX._istight, classifier_name='tight', mask=mask, is_cut=True)
    for origin, origin_name in origins.items():
        # keep prompt (origin 0) plus the one fake origin under study
        mask = (testX._flavors == flavour) & ( (testX._originReduced == 0) | (testX._originReduced == origin) )
        mask = numpy.array(mask)
        plot_roc(testY, gb_preds, classifier_name=origin_name, mask=mask)
    legend(loc='lower left')
In [17]:
# Same per-origin ROC curves, but restricted to events already passing
# the 'tight' selection (_istight == 1).
for flavour, flavour_name in flavours.items():
    figure(figsize=[13, 9]), grid()
    mask = (testX._flavors == flavour) & (testX._istight == 1)
    plot_roc(testY, gb_preds, classifier_name='all', mask=numpy.array(mask))
    for origin, origin_name in origins.items():
        mask = (testX._flavors == flavour) & (testX._istight == 1) & numpy.in1d(testX._originReduced, [0, origin])
        plot_roc(testY, gb_preds, classifier_name=origin_name, mask=numpy.array(mask))
    legend(loc='lower left')
    title(flavour_name + ' (istight == 1)')
In [18]:
# added _lPt < 250, otherwise many empty bins
# for flavour, flavour_name in flavours.items():
# mask = (testX._flavors == flavour) & (testX._lPt < 250)
# mask = numpy.array(mask)
# pred = classifiers.test_on(testX[mask], testY[mask])
# pred.hist(['_lPt']), title(flavour_name)
# pred.efficiency(['_lPt'], label=1), title(flavour_name + ' Signal efficiency')
# pred.efficiency(['_lPt'], label=0), title(flavour_name + ' Background rejection')
In [19]:
# ROC quality as a function of the number of b-tagged jets in the event;
# the last bucket merges all events with 3..9 b-jets.
for flavour, flavour_name in flavours.items():
    figure(figsize=[14, 10])
    for min_jets in [[0], [1], [2], range(3, 10)]:
        mask = numpy.array((testX._flavors == flavour) & (numpy.in1d(testX._n_bJets, min_jets)))
        plot_roc(testY, gb_preds, classifier_name="n_bjets in {}".format(min_jets), mask=mask)
    xlim(0.7, 1.)
    ylim(0.7, 1.)
    grid()
    title(flavour_name)
    legend(loc='lower left')
In [20]:
def compute_cuts(filename, treename, train_variables, classifier, left_events_per_flavour=5000):
    """Choose a per-flavour threshold on the classifier output such that
    roughly `left_events_per_flavour` events of each flavour survive.

    Events are read starting at n_train_events, i.e. disjoint from the
    training sample. Returns a dict {flavour_code: cut_value}.
    """
    data = tree2pandas(filename, treename, branches=train_variables, start=n_train_events)
    data_flavours = tree2pandas(filename, treename=treename, branches=['_flavors'], start=n_train_events)['_flavors']
    predictions = classifier.predict_proba(data)[:, 1]
    result = {}
    for flavour, flavour_name in flavours.items():
        mask = numpy.array(data_flavours == flavour)
        masked_prediction = predictions[mask]
        # The (left_events_per_flavour + 1)-th largest prediction, so the
        # strict '>' comparison keeps ~left_events_per_flavour events.
        # NOTE(review): raises IndexError if fewer events than that are available.
        cut = numpy.sort(masked_prediction)[- left_events_per_flavour - 1]
        print flavour_name, cut
        print 'passed', numpy.sum(masked_prediction > cut),
        print 'not passed', numpy.sum(masked_prediction <= cut)
        result[flavour] = cut
    return result
In [21]:
cuts = compute_cuts(fakes_filename, treename, train_variables, classifiers['gb'])
In [22]:
cuts
Out[22]:
In [23]:
# Prompt events outside the training range, used to measure prompt efficiency.
prompt_train_vars = tree2pandas(prompt_filename, treename=treename, branches=train_variables, start=n_train_events)
prompt_predictions = classifiers['gb'].predict_proba(prompt_train_vars)[:, 1]
prompt_flavours = tree2pandas(prompt_filename, treename=treename, branches=['_flavors'], start=n_train_events)['_flavors']
In [24]:
# Fraction of prompt leptons (per flavour) passing the fake-derived cut.
for flavour, flavour_name in flavours.items():
    print 'prompt passed,', flavour_name
    mask = numpy.array(prompt_flavours == flavour)
    print numpy.mean(prompt_predictions[mask] > cuts[flavour])
In [25]:
fake_test_data = tree2pandas(fakes_filename, treename=treename, branches=read_columns)
In [26]:
fake_predictions = classifiers['gb'].predict_proba(fake_test_data)[:, 1]
In [27]:
def compute_passed(predictions, flavours, cuts):
    """Boolean mask of events passing their flavour-specific threshold.

    Parameters
    ----------
    predictions : array-like
        Classifier outputs, aligned with `flavours`.
    flavours : array-like
        Flavour code of each event.
    cuts : dict
        Maps flavour code -> threshold; an event passes when its prediction
        is strictly greater than the cut for its flavour.

    Generalized from the original hard-coded flavours {0, 1} to any set of
    flavour codes present in `cuts`; behaviour is identical for {0, 1}.
    """
    result = None
    for flavour, cut in cuts.items():
        passed = (flavours == flavour) & (predictions > cut)
        # preserve the original return type (e.g. pandas Series in, Series out)
        result = passed if result is None else (result | passed)
    return result
In [28]:
fake_passed = numpy.array(compute_passed(fake_predictions, fake_test_data._flavors, cuts=cuts))
In [29]:
# Compare feature distributions of passing vs non-passing fakes.
figure(figsize=[18, 80])
reports.plot_features_pdf(fake_test_data, fake_passed, sig_label='passed', bck_label='not passed')
In [30]:
# Same comparison, restricted to fakes with isolation below 1.
mask = fake_test_data._isolation < 1
figure(figsize=[18, 80])
reports.plot_features_pdf(fake_test_data, fake_passed, mask=mask, sig_label='passed', bck_label='not passed')
In [31]:
# Efficiency/rejection curves vs event-level variables.
# NOTE(review): titles assume label=0 is fake and label=1 prompt -- confirm
# the convention of rcp() in hep_ml before relying on the captions.
for feature in ['_met', '_mt', 'HT', '_n_Jets', '_n_bJets', '_lPt']:
    predictions.rcp(feature, label=0, ignored_sidebands=0.001)
    title('Efficiency for ' + feature + ', Fake')
    predictions.rcp(feature, label=1, ignored_sidebands=0.001)
    title('Rejection for ' + feature + ', Prompt')
In [32]:
# Same efficiency/rejection curves, restricted to isolation < 1 on the test half.
mask = numpy.array(testX._isolation < 1)
for feature in ['_met', '_mt', 'HT', '_n_Jets', '_n_bJets', '_lPt']:
    predictions.rcp(feature, label=0, ignored_sidebands=0.001, mask=mask)
    title('Efficiency for ' + feature + ', Fake')
    predictions.rcp(feature, label=1, ignored_sidebands=0.001, mask=mask)
    title('Rejection for ' + feature + ', Prompt')
In [107]:
# Split fakes into `parts` equal-population bins of pT / closest-jet pT;
# the bin index is stored as 'splitter' and consumed by DumbSplitter below.
main_feature = '_lPt / _closeJetPtAll'
parts = 5
# interior percentiles only ([1:-1] drops 0% and 100%), giving parts-1 bin edges
bin_limits = numpy.percentile( fake_test_data[main_feature], numpy.linspace(0, 100, parts + 1)[1:-1])
splitter_feature = numpy.searchsorted(bin_limits, fake_test_data[main_feature])
fake_test_data['splitter'] = splitter_feature
In [132]:
# Interpretation model: which features explain passing the tight cut?
# One small random forest per 'splitter' bin; HidingClassifier restricts
# each forest to read_columns.
base_forest = HidingClassifier(read_columns, RandomForestClassifier(n_estimators=30, max_depth=3, min_samples_leaf=50, max_features=10))
fake_clf = metaclassifiers.DumbSplitter(feature_name='splitter', base_estimator=base_forest)
fake_clf.fit(fake_test_data, fake_passed)
Out[132]:
In [133]:
# Sum feature importances over the per-bin forests and show the top 40.
feature_importances = sum(cl._trained_estimator.feature_importances_ for cl in fake_clf.classifiers.values())
x = pandas.Series(data=feature_importances, index=read_columns)
# NOTE(review): Series.sort(ascending=False) is the old in-place API, removed
# in modern pandas -- switch to x = x.sort_values(ascending=False) on upgrade.
x.sort(ascending=False)
x[:40]
Out[133]:
In [115]:
# Looser thresholds: keep ~20000 fakes per flavour.
fake_loose_cuts = compute_cuts(fakes_filename, treename, train_variables, classifiers['gb'], left_events_per_flavour=20000)
# BUG FIX: `fakes_flavours` was used here (and in later cells) but never
# defined anywhere in the notebook -- derive it from the fake sample so a
# Restart-&-Run-All succeeds. It is aligned with fake_predictions, which
# were computed on fake_test_data.
fakes_flavours = fake_test_data['_flavors']
fake_passed_loose_cut = compute_passed(fake_predictions, fakes_flavours, fake_loose_cuts).values
In [126]:
# Keep only fakes passing the loose cut; "is isolated" (_isolation > 0.15)
# becomes the target for the next interpretation fit.
# NOTE(review): .ix is deprecated in modern pandas -- use .loc on upgrade.
fake_test_data_passed_loose = fake_test_data.ix[fake_passed_loose_cut, :].copy()
fakes_flavours_passed_loose = fakes_flavours[fake_passed_loose_cut]
isolations_passed_loose = fake_test_data_passed_loose._isolation > 0.15
_ = hist(isolations_passed_loose.values)
In [127]:
# Re-bin the loose-passing fakes in the same feature (5 equal-population bins).
# Duplicates the logic of cell In[107] -- a shared helper would avoid drift.
main_feature = '_lPt / _closeJetPtAll'
parts = 5
bin_limits = numpy.percentile(fake_test_data_passed_loose[main_feature], numpy.linspace(0, 100, parts + 1)[1:-1])
splitter_feature_passed_loose = numpy.searchsorted(bin_limits, fake_test_data_passed_loose[main_feature])
fake_test_data_passed_loose['splitter'] = splitter_feature_passed_loose
In [137]:
# Same interpretation fit, but with _isolation itself hidden, so the forests
# must explain isolation through the remaining features.
# Note: list(set(...)) makes the column order arbitrary -- only the
# Series index pairing below depends on it, and that pairing is consistent.
columns_without_isolation = list(set(read_columns) - set(['_isolation']))
base_forest = HidingClassifier(columns_without_isolation, RandomForestClassifier(n_estimators=30, max_depth=3, min_samples_leaf=50, max_features=10))
fake_clf = metaclassifiers.DumbSplitter(feature_name='splitter', base_estimator=base_forest)
fake_clf.fit(fake_test_data_passed_loose, isolations_passed_loose)
Out[137]:
In [139]:
# Top features explaining isolation among loose-passing fakes.
feature_importances = sum(cl._trained_estimator.feature_importances_ for cl in fake_clf.classifiers.values())
x = pandas.Series(data=feature_importances, index=columns_without_isolation)
# NOTE(review): deprecated in-place Series.sort -- sort_values on upgrade.
x.sort(ascending=False)
x[:40]
Out[139]: