In [1]:
import root_numpy
import hep_ml
import numpy
import pandas
from hep_ml.commonutils import train_test_split
from hep_ml.reports import plot_roc
from sklearn.metrics import roc_curve
In [2]:
# Location of the input ROOT files and the name of the tree read from them.
folder = '/mnt/w76/notebook/datasets/fromLesya/ver3/'
treename = "fakeTree"
# Both file paths are the shared folder prefix followed by the file's base name.
prompt_filename, fakes_filename = [folder + basename
                                   for basename in ('tt_prompts.root', 'tt_fakes.root')]
In [3]:
# Integer code -> human-readable name for the lepton flavour
# (the per-flavour evaluation loop compares these codes against the `_flavors` column).
flavours = dict(enumerate(['electrons', 'muons']))
# Integer code -> name of a non-prompt origin; code 0 (not listed here) means prompt.
origins = dict(enumerate(['b', 'c', 'uds'], 1))
In [4]:
# Names of all branches stored in the tree of the prompt file.
all_columns = root_numpy.list_branches(prompt_filename, treename=treename)

# Substrings that disqualify a column from training: when `train_variables`
# is built, every column whose name contains any of these is left out.
markers = [
    '_mom',
    'gen',
    '_lep',
    'AllMC',
    '_n_',
    '_met',
    'HT',
    '_closeJetPtAllstatus',
    '_charges',
    '_ipPVmc',
    '_isloose',
    '_istight',
    '_eventNb',
    '_runNb',
    '_lumiBlock',
    '_origin',
    '_originReduced',
    '_isolationMC',
    '_partonIdMatched',
    '_sameParton',
    'hJet_SoftLeptId',
]
# Second batch of excluded name fragments, appended to the same list.
markers.extend([
    '_lE',
    '_lEta',
    '_lPhi',
    '_lPt',
    '_mt',
    'hJet_e',
    'hJet_Soft',
    'hJet_JECUnc',
    'hJet_phi',
    'hJet_pt',
    '_closeJetPtAll',
])
In [5]:
# Map every readable branch expression to a flat column name.
#   * array-valued branch `x` with k components -> "x[0]" .. "x[k-1]" mapped to "x0" .. "x(k-1)"
#   * scalar branch `x`                         -> "x" mapped to "x"
flattened_columns = dict()
for column in all_columns:
    # Read only the first few events: enough to see whether the branch holds arrays.
    # (Named `sample` so it is not confused with the `data` DataFrame built later.)
    sample = root_numpy.root2array(prompt_filename, treename=treename, branches=[column], stop=10)
    try:
        n_components = len(sample[0][0])
    except (TypeError, IndexError):
        # TypeError: the value has no len(), i.e. the branch is scalar.
        # IndexError: nothing to index (e.g. no events were read) - treat as scalar too.
        # Catching only these keeps KeyboardInterrupt and real errors visible,
        # unlike the previous bare `except:`.
        flattened_columns[column] = column
        continue
    for index in range(n_components):
        flattened_columns["{column}[{index}]".format(column=column, index=index)] = column + str(index)
# for some strange reason this column is also added
# (a default is passed so that the cell does not fail with KeyError when the column is absent)
flattened_columns.pop('_PVerr[3]', None)
Out[5]:
In [6]:
# Upper bound on the number of events read from the tree (passed as `stop=` to root2array).
n_events = 10 ** 6
In [7]:
# Load at most `n_events` events of the prompt file into a DataFrame, add the
# derived `ptRatio` feature, build the labels and split into train / test parts.
# NOTE(review): only the keys of `flattened_columns` (branch expressions) are used here;
# the flat names stored as its values are never applied - confirm this is intended.
read_columns = sorted(flattened_columns.keys())
data = pandas.DataFrame(
    root_numpy.root2array(prompt_filename, treename=treename, branches=read_columns, stop=n_events)
)
data['ptRatio'] = data.eval('_lPt / _closeJetPtAll')
# Three IP features contain events with huge values that are not convertible to float32
data = numpy.clip(data, -1e20, 1e20)
# `_origin == 0` gives label 1; any other origin code gives a label <= 0.
# NOTE(review): presumably `_origin` is binary here (0 = prompt) - confirm.
labels = 1 - data._origin
# 20% of the events go to training, the rest to testing.  The seed is fixed so that
# a re-run of the notebook produces the same split (it was random on every run before).
trainX, testX, trainY, testY = train_test_split(data, labels, train_size=0.2, random_state=42)
In [8]:
# Keep only the columns whose name contains none of the excluded fragments in `markers`.
train_variables = []
for col in data.columns:
    if any(marker in col for marker in markers):
        continue
    train_variables.append(col)
# Display the selected features alphabetically (the list itself keeps column order).
sorted(train_variables)
Out[8]:
In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml import HidingClassifier
In [10]:
# Collection of classifiers to be trained and compared; only gradient boosting is registered.
classifiers = hep_ml.ClassifiersDict()
# `subsample=0.2` and `max_features=8` make the boosting stochastic, so the seed is
# fixed to make the trained model (and every plot derived from it) reproducible.
base_gb = GradientBoostingClassifier(subsample=0.2, n_estimators=200, max_depth=8, min_samples_split=300,
                                     max_features=8, learning_rate=0.05, random_state=42)
# The wrapper is given the full DataFrame but is told which columns to train on.
classifiers['gb'] = HidingClassifier(base_estimator=base_gb, train_variables=train_variables)
In [11]:
# Fit the classifier collection on the training split
# (presumably trains every registered classifier - confirm against hep_ml).
classifiers.fit(trainX, trainY)
Out[11]:
In [15]:
# One ROC report per lepton flavour, with the `_istight` flag overlaid as a single point.
for flavour_id in flavours:
    flavour_name = flavours[flavour_id]
    # Restrict the test sample to leptons of the current flavour.
    selected = testX['_flavors'] == flavour_id
    flavour_testX = testX[selected]
    flavour_testY = testY[selected]
    # The result is kept in `predictions`; after the loop it refers to the last flavour.
    predictions = classifiers.test_on(flavour_testX, flavour_testY).roc()
    # ROC of the binary `_istight` flag; the second ROC point is drawn
    # (presumably the working point of the tight selection - confirm).
    tight_fpr, tight_tpr, _ = roc_curve(flavour_testY, testX['_istight'][selected])
    plot(tight_tpr[1:2], 1 - tight_fpr[1:2], 'o', label='tight')
    title(flavour_name)
    grid()
In [14]:
# `predictions` is the object left behind by the last iteration of the per-flavour
# loop above, so the curves shown here cover that flavour's test subset only.
predictions.learning_curves()
Out[14]:
In [18]:
# Feature importances of the trained gradient boosting model, most important first.
# `_trained_estimator` is a private attribute of the wrapper stored under 'gb'.
gb = classifiers['gb']._trained_estimator
# Importances are paired with `train_variables` by position
# (assumes the estimator saw the features in exactly this order - TODO confirm).
feature_imps = pandas.Series(data=gb.feature_importances_, index=train_variables)
# The result of sort() is not assigned, so this relies on Series.sort working in place.
# NOTE(review): that is old pandas API; newer versions use sort_values - confirm installed version.
feature_imps.sort(ascending=False)
feature_imps
Out[18]: