In [1]:
import numpy  # used throughout (clip, array, sort, bincount, ...) but was never imported
import pandas
import root_numpy
import hep_ml
from hep_ml import reports
from hep_ml.commonutils import train_test_split, print_header
from hep_ml.reports import plot_roc
from hep_ml.rootutilities import list_flat_branches, tree2pandas, print_root_structure
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_curve
In [2]:
# 8 TeV files: prompt-lepton and fake-lepton ttbar samples, same tree in both
# NOTE(review): absolute local path — assumes this mount exists on the host
folder = '/mnt/w76/notebook/datasets/fromLesya/ver3/'
prompt_filename = folder + 'tt_prompts.root'
fakes_filename = folder + 'tt_fakes.root'
treename = "fakeTree"
In [3]:
# integer codes used in the trees: _flavors branch and _originReduced branch
flavours = {0: 'electrons', 1: 'muons'}
origins = {1: 'b', 2: 'c', 3: 'uds'}  # and 0: prompt
In [4]:
# Features the classifier is trained on (ROOT branch names / expressions;
# the last entry is an on-the-fly ratio of two branches)
train_variables = [
    '_3dIP',
    '_3dIPerr',
    '_3dIPsig',
    '_PVchi2',
    '_PVerr[0]',
    '_PVerr[1]',
    '_PVerr[2]',
    '_closeJetAngAll',
    '_closeJetCSVAll',
    '_closeJetEtaAll',
    '_closeJetPhiAll',
    '_flavors',
    '_ipPV',
    '_ipPVerr',
    '_ipZPV',
    '_ipZPVerr',
    '_isolation',
    '_isolationComponents[0]',
    '_isolationComponents[1]',
    '_isolationComponents[2]',
    '_isolationComponents[3]',
    '_ptRelAll',
    # 'hJet_cef',
    # 'hJet_nconstituents',
    # 'hJet_vtx3dL',
    # 'hJet_vtx3deL',
    # 'hJet_vtxMass',
    # 'hJet_vtxPt',
    '_lPt / _closeJetPtAll'
]
# variable the prediction must stay flat in (used by the flatness loss below)
uniform_variables = ['_mt']
# everything read from the trees: features + uniform variable + label/bookkeeping columns
read_columns = train_variables + uniform_variables + ['_istight', '_originReduced' ]
In [5]:
# number of events we will use (first events from file)
# number of events we will use (first events from file)
# train = first n_train_events events; test = the following n_test_events events
n_train_events = 100000
n_test_events = 1000000
n_used_events = n_train_events + n_test_events
In [6]:
def read_events(columns, start=None, stop=None):
    """Read `columns` from the prompt and fake trees and stack them into one frame.

    Returns (data, labels): labels are 1 for prompt events, 0 for fake events,
    in the same row order as the concatenated frame.
    """
    frames = []
    for filename in [prompt_filename, fakes_filename]:
        tree = root_numpy.root2array(filename, treename=treename, branches=columns,
                                     start=start, stop=stop)
        frames.append(pandas.DataFrame(tree))
    n_prompt, n_fake = len(frames[0]), len(frames[1])
    data = pandas.concat(frames, ignore_index=True)
    # Three IP features contain the events with huge values not convertible to float32
    data = numpy.clip(data, -1e20, 1e20)
    labels = numpy.array([1] * n_prompt + [0] * n_fake)
    return data, labels
In [7]:
# train on the first n_train_events events, test on the following window
trainX, trainY = read_events(read_columns, stop=n_train_events)
testX, testY = read_events(read_columns, start=n_train_events, stop=n_used_events)
In [8]:
# sanity check: number of (prompt + fake) events in the train and test samples
len(trainX), len(testX)
Out[8]:
In [10]:
# pylab figure; per-feature prompt/fake pdfs for a quick look at separation power
figure(figsize=[18, 40])
reports.plot_features_pdf(trainX, trainY, n_bins=30, sig_label='prompt', bck_label='fake')
In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier
from hep_ml import ugradientboosting as ugb
from hep_ml.experiments import metaclassifiers
In [12]:
# Uniform-boosting setup: the flatness loss keeps the prediction flat in _mt
# (uniform_label=0 means flatness is enforced on the background / fakes)
classifiers = hep_ml.ClassifiersDict()
fl_loss = ugb.BinFlatnessLossFunction(uniform_variables, uniform_label=0, ada_coefficient=0.05)
base_gb = ugb.uGradientBoostingClassifier(loss=fl_loss, n_estimators=100, subsample=0.15, max_depth=8, min_samples_split=300,
                                          max_features=8, learning_rate=0.03, train_variables=train_variables)
# train one independent copy of the model per lepton flavour (split on _flavors)
classifiers['gb'] = metaclassifiers.DumbSplitter(base_estimator=base_gb, feature_name='_flavors')
In [13]:
# fit every classifier in the dict, then evaluate all of them on the test sample
classifiers.fit(trainX, trainY)
predictions = classifiers.test_on(testX, testY)
In [14]:
# ROC curves per flavour; the 'tight' selection is shown as a single cut point
gb_preds = classifiers['gb'].predict_proba(testX)[:, 1]
for flavour, flavour_name in flavours.items():
    figure(figsize=[13, 9])
    xlim(0.5, 1.), ylim(0.5, 1.), grid()
    mask = numpy.array(testX._flavors == flavour)
    plot_roc(testY, gb_preds, classifier_name='all', mask=mask)
    plot_roc(testY, testX._istight, classifier_name='tight', mask=mask, is_cut=True)
    for origin, origin_name in origins.items():
        # restrict to prompt events (origin 0) plus fakes of one specific origin
        mask = (testX._flavors == flavour) & ( (testX._originReduced == 0) | (testX._originReduced == origin) )
        mask = numpy.array(mask)
        plot_roc(testY, gb_preds, classifier_name=origin_name, mask=mask)
    legend(loc='lower left')
In [15]:
def compute_cuts(filename, treename, train_variables, classifier, left_events_per_flavour=5000):
data = tree2pandas(filename, treename, branches=train_variables, start=n_train_events)
data_flavours = tree2pandas(filename, treename=treename, branches=['_flavors'],
start=n_train_events, stop=n_used_events)['_flavors']
predictions = classifier.predict_proba(data)[:, 1]
result = {}
for flavour, flavour_name in flavours.items():
mask = numpy.array(data_flavours == flavour)
masked_prediction = predictions[mask]
cut = numpy.sort(masked_prediction)[- left_events_per_flavour - 1]
print flavour_name, cut
print 'passed', numpy.sum(masked_prediction > cut),
print 'not passed', numpy.sum(masked_prediction <= cut)
result[flavour] = cut
return result
In [16]:
# per-flavour thresholds derived from the fake sample's test window
cuts = compute_cuts(fakes_filename, treename, train_variables, classifiers['gb'])
In [17]:
cuts
Out[17]:
In [18]:
# prompt-sample test window: predictions used to measure prompt efficiency at the cuts
prompt_train_vars = tree2pandas(prompt_filename, treename=treename, branches=train_variables,
                                start=n_train_events, stop=n_used_events)
prompt_flavours = tree2pandas(prompt_filename, treename=treename, branches=['_flavors'],
                              start=n_train_events, stop=n_used_events)['_flavors']
prompt_predictions = classifiers['gb'].predict_proba(prompt_train_vars)[:, 1]
In [19]:
# fraction of prompt test events surviving each flavour's cut (prompt efficiency)
for flavour, flavour_name in flavours.items():
    print 'prompt passed,', flavour_name
    mask = numpy.array(prompt_flavours == flavour)
    print numpy.mean(prompt_predictions[mask] > cuts[flavour])
In [20]:
# fake-sample test window, with all columns needed for the feature pdfs below
fake_test_data = tree2pandas(fakes_filename, treename=treename, branches=read_columns, start=n_train_events, stop=n_used_events)
In [21]:
# classifier response for the fake test events
fake_predictions = classifiers['gb'].predict_proba(fake_test_data)[:, 1]
In [22]:
def compute_passed(predictions, flavours, cuts):
    """Boolean mask: True where an event's prediction exceeds the cut
    for its own flavour (0 = electrons, 1 = muons)."""
    passed_muons = (flavours == 1) & (predictions > cuts[1])
    passed_electrons = (flavours == 0) & (predictions > cuts[0])
    return passed_electrons | passed_muons
In [23]:
# boolean mask over fake test events: True if the event passed its flavour's cut
fake_passed = numpy.array(compute_passed(fake_predictions, fake_test_data._flavors, cuts=cuts))
In [24]:
# compare feature distributions of fake events that pass vs fail the cut
figure(figsize=[18, 50])
reports.plot_features_pdf(fake_test_data, fake_passed, sig_label='passed', bck_label='not passed')
In [25]:
# mask = fake_test_data._isolation < 1
# figure(figsize=[18, 80])
# reports.plot_features_pdf(fake_test_data, fake_passed, mask=mask, sig_label='passed', bck_label='not passed')
In [26]:
# Cross-check on an independent set of 8 TeV ntuples (ver6)
check_folder = '/mnt/w76/notebook/datasets/fromLesya/ver6/8TeV/'
from os import listdir
files = [filename for filename in listdir(check_folder)]
# common tree2pandas options: first 10k events, TTree selection, nested tree name
check_params = {'stop': 10000,
                'selection': '(_flavors == 0) || (_lPt > 20)',
                'treename': 'FakeElectrons/fakeTree' }
In [33]:
# per-file predictions / pass masks / flavour arrays, reused by the plots below
check_predictions = {}
check_passed = {}
check_flavors = {}
for name in files:
    data = tree2pandas(check_folder + name, branches=train_variables, **check_params)
    preds = classifiers['gb'].predict_proba(data)[:, 1]
    check_predictions[name] = preds
    check_passed[name] = compute_passed(preds, flavours=data._flavors, cuts=cuts)
    check_flavors[name] = data._flavors.values
    print name, ' avg passed:', check_passed[name].mean()
In [48]:
def plot_hist_and_fakerate(n_bins=10):
    """For several branch expressions and each flavour, draw side by side:
      (left)  per-file normed distribution of the variable,
      (right) per-file fraction of events passing the cut, in the same bins.

    Relies on notebook globals: check_folder, files, check_params, flavours,
    check_flavors, check_passed, and the pylab plotting namespace.
    """
    for column in ['log(_lE)', '_closeJetPtAll', '_closeJetPtAllMC', 'log(_ipPV)' , 'log(abs(_ipZPV))', 'log(_lPt)']:
        # read the (possibly computed) branch expression from every checked file
        column_data = {name: tree2pandas(check_folder + name, branches=[column], **check_params)[column].values
                       for name in files}
        for flavour, flavour_name in flavours.items():
            print_header(column + " " + flavour_name)
            figure(figsize=[18, 8])
            subplot(121)
            title('Distribution for ' + column + " " + flavour_name)
            conc = numpy.concatenate(column_data.values())
            # _, edges = numpy.histogram(conc, bins=n_bins)
            # percentile-based edges: roughly equal statistics per bin over the pooled data
            edges = numpy.percentile(conc, numpy.linspace(1, 99, n_bins + 1))
            for i, (name, data) in enumerate(column_data.items()):
                # 0/1 weights select only events of the current flavour
                mask = (check_flavors[name] == flavour) * 1
                if i == 0:
                    hist(data, bins=edges, weights=mask, label=name, fill=False, edgecolor='b', normed=True)
                else:
                    hist(data, bins=edges, weights=mask, label=name, histtype='step', normed=True)
            legend(loc='best')
            subplot(122)
            title('Passed fraction for ' + column + " " + flavour_name)
            for i, (name, data) in enumerate(column_data.items()):
                mask = check_flavors[name] == flavour
                # map each event to its bin index; clip under/overflow into the edge bins
                bins = numpy.searchsorted(edges, data) - 1
                bins = numpy.clip(bins, 0, n_bins - 1)
                passed = check_passed[name]
                # per-bin passed fraction for this flavour; +1e-10 avoids division by zero
                mean_passed = numpy.bincount(bins, weights=passed & mask, minlength=n_bins)
                mean_passed /= numpy.bincount(bins, weights=mask, minlength=n_bins) + 1e-10
                centers = (edges[:-1] + edges[1:]) / 2
                errorbar(centers, mean_passed, label=name)
            ylim(0, 1)
            legend(loc='best')
            show()
In [49]:
# draw the distributions and per-bin passed fractions for all checked files
plot_hist_and_fakerate()
In [ ]: