In [1]:
# plotting names used below (figure, hist, subplot, ...) and numpy come from the pylab namespace
%pylab inline
import root_numpy
import hep_ml
import pandas
from sklearn.metrics import roc_curve
from hep_ml.commonutils import train_test_split, print_header
from hep_ml.reports import plot_roc
from hep_ml.rootutilities import list_flat_branches
In [2]:
# 13 TeV files
folder = '/mnt/w76/notebook/datasets/fromLesya/ver4/'
prompt_filename = folder + 'tt_prompts13.root'
fakes_filename = folder + 'tt_fakes13.root'
treename = "FakeElectrons/fakeTree"
In [3]:
flavours = {0: 'electrons', 1: 'muons'}
origins = {1: 'b', 2: 'c', 3: 'uds'} # and 0: prompt
In [4]:
all_columns = root_numpy.list_branches(prompt_filename, treename=treename)
sorted(all_columns)
Out[4]:
In [5]:
train_variables = [
    '_3dIP',
    '_3dIPerr',
    '_3dIPsig',
    '_PVchi2',
    '_PVerr[0]',
    '_PVerr[1]',
    '_PVerr[2]',
    '_closeJetAngAll',
    '_closeJetCSVAll',
    '_closeJetEtaAll',
    '_closeJetPhiAll',
    '_flavors',
    '_ipPV',
    '_ipPVerr',
    '_ipZPV',
    '_ipZPVerr',
    '_isolation',
    '_isolationComponents[0]',
    '_isolationComponents[1]',
    '_isolationComponents[2]',
    '_isolationComponents[3]',
    '_ptRelAll',
    'hJet_cef',
    'hJet_nconstituents',
    'hJet_vtx3dL',
    'hJet_vtx3deL',
    'hJet_vtxMass',
    'hJet_vtxPt',
    '_lPt / _closeJetPtAll'
]
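The last entry, '_lPt / _closeJetPtAll', is not a plain branch but a formula: root_numpy evaluates such expressions (through ROOT's TTreeFormula) while reading, so the pt ratio arrives as a ready-made feature. A quick sanity check, just a sketch printing the first few evaluated values:

    ratio = root_numpy.root2array(prompt_filename, treename=treename,
                                  branches=['_lPt / _closeJetPtAll'], stop=5)
    print(pandas.DataFrame(ratio))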
In [6]:
flat_columns = list_flat_branches(prompt_filename, treename=treename, use_dtype=False)
# list_flat_branches unexpectedly yields a fourth _PVerr component; drop it
flat_columns.remove('_PVerr[3]')
read_columns = set(flat_columns + train_variables)
def isneeded(column):
    # keep only branches that carry none of the per-lepton / per-jet markers
    return all(marker not in column for marker in ['_lep', '_jet', '_bTagged'])
read_columns = sorted([col for col in read_columns if isneeded(col)])
read_columns
Out[6]:
In [7]:
prompt_test = root_numpy.root2array(prompt_filename, treename=treename, branches=sorted(flat_columns), stop=10)
prompt_test = pandas.DataFrame(prompt_test)
pandas.set_option('display.max_rows', 300)
prompt_test.transpose()
Out[7]:
In [8]:
prompt_test = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=sorted(read_columns), stop=1000))
stds = prompt_test.std()
suspicious_columns = stds[stds < 1e-8].index
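The same near-constant check can be wrapped in a small helper for reuse; a sketch (the name `constant_branches` and its defaults are mine, not part of hep_ml):

    def constant_branches(filename, treename, branches, stop=1000, tol=1e-8):
        # flag branches whose standard deviation over the first `stop` events is ~zero
        df = pandas.DataFrame(root_numpy.root2array(filename, treename=treename,
                                                    branches=branches, stop=stop))
        stds = df.std()
        return list(stds[stds < tol].index)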
In [9]:
list(suspicious_columns)
Out[9]:
In [10]:
prompt_check = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=suspicious_columns, stop=100000))
pandas.DataFrame({'mean': prompt_check.mean(), 'std': prompt_check.std() })
Out[10]:
In [11]:
fake_check = pandas.DataFrame(root_numpy.root2array(fakes_filename, treename=treename, branches=suspicious_columns, stop=100000))
pandas.DataFrame({'mean': fake_check.mean(), 'std': fake_check.std() })
Out[11]:
In [12]:
# number of events we will use
n_events = 1000000
In [13]:
prompt_data = root_numpy.root2array(prompt_filename, treename=treename, branches=read_columns, stop=n_events)
prompt_data = pandas.DataFrame(prompt_data)
fake_data = root_numpy.root2array(fakes_filename, treename=treename, branches=read_columns, stop=n_events)
fake_data = pandas.DataFrame(fake_data)
In [14]:
columns = sorted(prompt_data.columns)
figure(figsize=[18, 100])
for i, column in enumerate(columns, 1):
    subplot((len(columns) + 2) // 3, 3, i)
    concatenated = numpy.concatenate([prompt_data[column], fake_data[column]])
    limits = numpy.percentile(concatenated, [1, 99])
    hist(prompt_data[column], bins=30, normed=True, range=limits, alpha=0.3, label='prompt')
    hist(fake_data[column], bins=30, normed=True, range=limits, alpha=0.3, label='fake')
    legend(loc='best')
    title(column)
In [15]:
for column in ['hJet_cef', '_isolationComponents[3]']:
    figure(figsize=[17, 6])
    for flavour, flavour_name in flavours.items():
        subplot(1, 2, flavour + 1)
        concatenated = numpy.concatenate([prompt_data[column], fake_data[column]])
        limits = numpy.percentile(concatenated, [1, 99])
        hist((prompt_data.ix[prompt_data._flavors == flavour, column]).values,
             bins=30, normed=True, range=limits, alpha=0.3, label='prompt')
        hist((fake_data.ix[fake_data._flavors == flavour, column]).values,
             bins=30, normed=True, range=limits, alpha=0.3, label='fake')
        legend()
        title(column + ", " + flavour_name)
In [16]:
data = pandas.concat([prompt_data, fake_data], ignore_index=True)
# three IP features contain events with values too large to fit into float32, so clip them
data = numpy.clip(data, -1e20, 1e20)
labels = numpy.array([1] * len(prompt_data) + [0] * len(fake_data))
trainX, testX, trainY, testY = train_test_split(data, labels, train_size=0.2)
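The clipping matters because float32 tops out near 3.4e38: anything larger becomes inf after the cast, and sklearn rejects non-finite inputs. A two-line illustration:

    print(numpy.float32(1e39))                           # inf: overflows float32
    print(numpy.float32(numpy.clip(1e39, -1e20, 1e20)))  # 1e+20: finite after clipping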
In [17]:
len(trainX), len(testX)
Out[17]:
In [18]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier
from hep_ml.experiments import metaclassifiers
In [19]:
classifiers = hep_ml.ClassifiersDict()
base_gb = GradientBoostingClassifier(subsample=0.3, n_estimators=100, max_depth=8, min_samples_split=300,
                                     max_features=8, learning_rate=0.03)
base_splitter = metaclassifiers.DumbSplitter(base_estimator=base_gb, feature_name='_flavors')
classifiers['gb'] = HidingClassifier(base_estimator=base_splitter, train_variables=train_variables)
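The stack here: DumbSplitter trains an independent copy of the base GB for each value of `_flavors` (exposed later as `splitter.classifiers`), and HidingClassifier restricts whatever it wraps to `train_variables`. A minimal sketch of the split-by-feature idea, assuming a binary problem; this is not hep_ml's actual implementation:

    from sklearn.base import clone

    class PerValueSplitter(object):
        # sketch of the idea behind DumbSplitter (not hep_ml's code)
        def __init__(self, base_estimator, feature_name):
            self.base_estimator = base_estimator
            self.feature_name = feature_name

        def fit(self, X, y):
            self.classifiers = {}
            for value in numpy.unique(X[self.feature_name]):
                mask = numpy.array(X[self.feature_name] == value)
                self.classifiers[value] = clone(self.base_estimator).fit(X[mask], y[mask])
            return self

        def predict_proba(self, X):
            result = numpy.zeros([len(X), 2])  # binary: two probability columns
            for value, clf in self.classifiers.items():
                mask = numpy.array(X[self.feature_name] == value)
                result[mask] = clf.predict_proba(X[mask])
            return result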
In [20]:
# train on all flavours; the commented-out mask would restrict training to electrons only
# train_mask = numpy.array(trainX._flavors == 0)
train_mask = numpy.array(trainX._flavors == trainX._flavors)  # all True
classifiers.fit(trainX[train_mask], trainY[train_mask])
predictions = classifiers.test_on(testX, testY)
In [21]:
predictions.roc()
Out[21]:
In [22]:
predictions.learning_curves(step=10)
Out[22]:
In [32]:
splitter = classifiers['gb']._trained_estimator
for flavour, flavour_name in flavours.items():
    gb = splitter.classifiers[flavour]
    feature_imps = pandas.Series(data=gb.feature_importances_, index=train_variables)
    feature_imps.sort(ascending=False)
    print_header(flavour_name)
    print feature_imps
In [24]:
for flavour, flavour_name in flavours.items():
    mask = numpy.array(testX._flavors == flavour)
    classifiers.test_on(testX[mask], testY[mask]).roc()
    # comparing with istight
    fpr, tpr, _ = roc_curve(testY[mask], testX._istight[mask])
    plot(tpr[1:2], 1 - fpr[1:2], 'o', label='tight')
    title(flavour_name)
    xlim(0.5, 1.)
    ylim(0.5, 1.)
    for origin, origin_name in origins.items():
        mask = (testX._flavors == flavour) & ((testX._originReduced == 0) | (testX._originReduced == origin))
        mask = numpy.array(mask)
        preds = classifiers['gb'].predict_proba(testX[mask])[:, 1]
        plot_roc(testY[mask], preds, classifier_name=origin_name)
    legend(loc='lower left')
    grid()
In [25]:
for flavour, flavour_name in flavours.items():
    mask = numpy.array(testX._flavors == flavour)
    # comparing with istight
    fpr_base, tpr_base, _ = roc_curve(testY[mask], testX._istight[mask])
    fpr_base = fpr_base[1]
    tpr_base = tpr_base[1]
    print flavour_name, 'istight'
    print 'istight fpr', fpr_base
    print 'istight tpr', tpr_base
    print flavour_name, 'all'
    fpr, tpr, _ = roc_curve(testY[mask], classifiers['gb'].predict_proba(testX[mask])[:, 1])
    print 'tpr at the same fpr', numpy.interp(fpr_base, fpr, tpr)
    print 'fpr at the same tpr', numpy.interp(tpr_base, tpr, fpr)
    for origin, origin_name in origins.items():
        mask = (testX._flavors == flavour) & ((testX._originReduced == 0) | (testX._originReduced == origin))
        mask = numpy.array(mask)
        fpr, tpr, _ = roc_curve(testY[mask], classifiers['gb'].predict_proba(testX[mask])[:, 1])
        print flavour_name, origin_name
        print 'tpr at the same fpr', numpy.interp(fpr_base, fpr, tpr)
        print 'fpr at the same tpr', numpy.interp(tpr_base, tpr, fpr)
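numpy.interp is what reads one ROC curve off at the other's working point: roc_curve returns fpr and tpr in increasing order, so interpolating tpr at the reference fpr (and fpr at the reference tpr) yields the numbers printed above. A toy example with made-up points:

    fpr = numpy.array([0.0, 0.1, 0.5, 1.0])
    tpr = numpy.array([0.0, 0.6, 0.9, 1.0])
    print(numpy.interp(0.2, fpr, tpr))  # 0.675: this curve's tpr at fpr = 0.2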
In [26]:
for flavour, flavour_name in flavours.items():
    mask = (testX._flavors == flavour) & (testX._istight == 1)
    mask = numpy.array(mask)
    classifiers.test_on(testX[mask], testY[mask]).roc()
    for origin, origin_name in origins.items():
        mask = (testX._flavors == flavour) & (testX._istight == 1) & ((testX._originReduced == 0) | (testX._originReduced == origin))
        mask = numpy.array(mask)
        preds = classifiers['gb'].predict_proba(testX[mask])[:, 1]
        plot_roc(testY[mask], preds, classifier_name=origin_name)
    legend(loc='lower left')
    title(flavour_name + ' (istight == 1)')
    grid()
In [27]:
# restrict to _lPt < 250, otherwise many bins are empty
for flavour, flavour_name in flavours.items():
    mask = (testX._flavors == flavour) & (testX._lPt < 250)
    mask = numpy.array(mask)
    pred = classifiers.test_on(testX[mask], testY[mask])
    pred.hist(['_lPt']), title(flavour_name)
    pred.efficiency(['_lPt'], label=1), title(flavour_name + ' Signal efficiency')
    pred.efficiency(['_lPt'], label=0), title(flavour_name + ' Background rejection')
In [28]:
for flavour, flavour_name in flavours.items():
    figure(figsize=[14, 10])
    for min_jets in [[0], [1], [2], range(3, 10)]:
        mask = numpy.array((testX._flavors == flavour) & (numpy.in1d(testX._n_bJets, min_jets)))
        preds = classifiers['gb'].predict_proba(testX[mask])[:, 1]
        plot_roc(testY[mask], preds, classifier_name="n_bjets in {}".format(min_jets))
    xlim(0.7, 1.)
    ylim(0.7, 1.)
    grid()
    title(flavour_name)
    legend(loc='lower left')
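numpy.in1d does the grouping above: it marks which entries fall in the given list, so range(3, 10) collapses all multiplicities of three or more b-jets into a single curve:

    print(numpy.in1d([0, 1, 2, 3, 5], range(3, 10)))  # [False False False  True  True]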
In [90]:
def compute_cuts(filename, treename, train_variables, classifier, left_events_per_flavour=5000):
    # choose, per flavour, the threshold that keeps ~left_events_per_flavour fakes above the cut
    data = pandas.DataFrame(root_numpy.root2array(filename, treename=treename, branches=train_variables))
    data_flavours = pandas.DataFrame(root_numpy.root2array(filename, treename=treename, branches=['_flavors']))['_flavors']
    predictions = classifier.predict_proba(data)[:, 1]
    result = {}
    for flavour, flavour_name in flavours.items():
        mask = numpy.array(data_flavours == flavour)
        masked_prediction = predictions[mask]
        cut = numpy.sort(masked_prediction)[-left_events_per_flavour - 1]
        print flavour_name, cut
        print 'passed', numpy.sum(masked_prediction > cut),
        print 'not passed', numpy.sum(masked_prediction <= cut)
        result[flavour] = cut
    return result
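Taking the (N+1)-th highest score as the threshold leaves exactly N entries strictly above it, up to ties; a toy check with random scores:

    scores = numpy.random.rand(20)
    cut = numpy.sort(scores)[-6]    # N = 5: the sixth-highest score
    print(numpy.sum(scores > cut))  # 5 (assuming no exact ties)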
In [91]:
cuts = compute_cuts(fakes_filename, treename, train_variables, classifiers['gb'])
In [135]:
cuts
Out[135]:
In [83]:
prompt_train_vars = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=train_variables))
prompt_train_vars = numpy.clip(prompt_train_vars, -1e20, 1e20)
In [86]:
prompt_predictions = classifiers['gb'].predict_proba(prompt_train_vars)[:, 1]
In [88]:
prompt_flavours = pandas.DataFrame(root_numpy.root2array(prompt_filename, treename=treename, branches=['_flavors']))['_flavors']
In [93]:
for flavour, flavour_name in flavours.items():
    print flavour_name
    mask = numpy.array(prompt_flavours == flavour)
    print numpy.mean(prompt_predictions[mask] > cuts[flavour])
In [127]:
fake_test_data = pandas.DataFrame(root_numpy.root2array(fakes_filename, treename=treename, branches=read_columns))
fake_test_data = numpy.clip(fake_test_data, -1e20, 1e20)
In [128]:
fake_predictions = classifiers['gb'].predict_proba(fake_test_data)[:, 1]
In [131]:
def compute_passed(predictions, flavour_column, cuts):
    # an event passes if its prediction exceeds the cut for its own flavour
    result = (flavour_column == 1) & (predictions > cuts[1])
    result |= (flavour_column == 0) & (predictions > cuts[0])
    return result
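The function hard-codes the two flavour codes; an equivalent loop over whatever keys appear in `cuts` (a sketch, not used below) would be:

    def compute_passed_any(predictions, flavour_column, cuts):
        # OR together the per-flavour pass decisions
        result = numpy.zeros(len(predictions), dtype=bool)
        for flavour, cut in cuts.items():
            result |= numpy.array(flavour_column == flavour) & (predictions > cut)
        return result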
In [132]:
fake_passed = numpy.array(compute_passed(fake_predictions, fake_test_data._flavors, cuts=cuts))
In [146]:
def plot_for_passed(dataset, ispassed):
    for flavour, flavour_name in flavours.items():
        print_header(flavour_name)
        flavour_mask = numpy.array(dataset['_flavors'] == flavour)
        figure(figsize=[18, 100])
        for i, column in enumerate(sorted(dataset.columns), 1):
            subplot((len(dataset.columns) + 2) // 3, 3, i)
            limits = numpy.percentile(dataset[column], [.1, 99.9])
            hist(dataset.ix[flavour_mask & ispassed, column].values,
                 bins=30, normed=True, range=limits, alpha=0.3, label='passed')
            hist(dataset.ix[flavour_mask & ~ispassed, column].values,
                 bins=30, normed=True, range=limits, alpha=0.3, label='not passed')
            legend(loc='best')
            title(column)
        show()
In [148]:
plot_for_passed(fake_test_data, fake_passed)
In [147]:
mask = numpy.array(fake_test_data._isolation < 1)
plot_for_passed(fake_test_data[mask], fake_passed[mask])