In physics applications we frequently need a classifier's predictions to be uniform along some features. For instance, when testing for the existence of a new particle, the classifier should be uniform in background along the mass (otherwise one can get a false discovery due to a so-called peaking background).
This notebook compares several classifiers. The target is to obtain a flat signal efficiency in the Dalitz features, without significantly losing classification quality.
The classifiers compared are
- plain gradient boosting (labelled 'AdaBoost' below), with no uniformity constraint, as a baseline;
- uGB+knnAda: gradient boosting with knn-based AdaLoss;
- uBoost;
- uGB+FL: gradient boosting with FlatnessLoss.
We use the dataset from the uBoost paper for demonstration purposes.
There is plenty of data here, so the results are quite stable.
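To make "flat efficiency" concrete: for a fixed global threshold on the classifier output, the fraction of signal events passing the cut should be roughly the same in every region of the Dalitz plane. Below is a minimal numpy sketch of that check; `scores` and `m2ab` are hypothetical stand-ins for classifier predictions and one Dalitz variable, not part of this notebook's data.
In [ ]:
import numpy

# hypothetical inputs: classifier scores and one Dalitz variable for signal events
scores = numpy.random.uniform(size=10000)   # stand-in for predict_proba(...)[:, 1]
m2ab = numpy.random.uniform(size=10000)     # stand-in for the M2AB feature

threshold = numpy.percentile(scores, 50)    # cut keeping ~50% of signal globally
edges = numpy.linspace(m2ab.min(), m2ab.max(), 11)
bin_index = numpy.digitize(m2ab, edges[1:-1])

# per-bin signal efficiency; for a "flat" classifier these are all close to 0.5
for b in range(10):
    mask = bin_index == b
    print(b, (scores[mask] > threshold).mean())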
In [1]:
# downloading data
!wget -O ../data/dalitzdata.root -nc https://github.com/arogozhnikov/hep_ml/blob/data/data_to_download/dalitzdata.root?raw=true
In [2]:
# install rep, which we use for nice plots and reports
!pip install rep --no-dependencies
In [3]:
import pandas, numpy
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# this wrapper makes it possible to train on subset of features
from rep.estimators import SklearnClassifier
from matplotlib import pyplot as plt
from hep_ml.commonutils import train_test_split
from hep_ml import uboost, gradientboosting as ugb, losses
In [4]:
import root_numpy
used_columns = ["Y1", "Y2", "Y3", "M2AB", "M2AC"]
# read only the branches we need, plus the labels
data = pandas.DataFrame(root_numpy.root2array('../data/dalitzdata.root', treename='tree',
                                              branches=used_columns + ['labels']))
labels = data['labels']
data = data.drop('labels', axis=1)
In [5]:
def plot_distribution(data_frame, var_name1='M2AB', var_name2='M2AC', bins=40):
"""The function to plot 2D distribution histograms"""
plt.hist2d(data_frame[var_name1], data_frame[var_name2], bins=bins, cmap=plt.cm.Blues)
plt.xlabel(var_name1)
plt.ylabel(var_name2)
plt.colorbar()
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1), plt.title("signal"), plot_distribution(data[labels==1])
plt.subplot(1, 2, 2), plt.title("background"), plot_distribution(data[labels==0])
pass
In [6]:
trainX, testX, trainY, testY = train_test_split(data, labels, random_state=42)
In [7]:
uniform_features = ["M2AB", "M2AC"]
train_features = ["Y1", "Y2", "Y3"]
n_estimators = 150
base_estimator = DecisionTreeClassifier(max_depth=4)
uBoost training takes a long time, so we reduce the number of efficiency_steps, use prediction smoothing, and run uBoost in several threads.
In [8]:
from rep.metaml import ClassifiersFactory
classifiers = ClassifiersFactory()

# plain gradient boosting with no uniformity constraint (the baseline)
base_ada = GradientBoostingClassifier(max_depth=4, n_estimators=n_estimators, learning_rate=0.1)
classifiers['AdaBoost'] = SklearnClassifier(base_ada, features=train_features)

# gradient boosting with knn-based AdaLoss, which penalizes non-uniformity
knnloss = ugb.KnnAdaLossFunction(uniform_features, knn=10, uniform_label=1)
ugbKnn = ugb.UGradientBoostingClassifier(loss=knnloss, max_depth=4, n_estimators=n_estimators,
                                         learning_rate=0.4, train_features=train_features)
classifiers['uGB+knnAda'] = SklearnClassifier(ugbKnn)

# uBoost trains one boosted classifier per target global efficiency (efficiency_steps of them)
uboost_clf = uboost.uBoostClassifier(uniform_features=uniform_features, uniform_label=1,
                                     base_estimator=base_estimator,
                                     n_estimators=n_estimators, train_features=train_features,
                                     efficiency_steps=12, n_threads=4)
classifiers['uBoost'] = SklearnClassifier(uboost_clf)

# gradient boosting with FlatnessLoss (uGB+FL)
flatnessloss = ugb.KnnFlatnessLossFunction(uniform_features, fl_coefficient=3., power=1.3, uniform_label=1)
ugbFL = ugb.UGradientBoostingClassifier(loss=flatnessloss, max_depth=4,
                                        n_estimators=n_estimators,
                                        learning_rate=0.1, train_features=train_features)
classifiers['uGB+FL'] = SklearnClassifier(ugbFL)

classifiers.fit(trainX, trainY, parallel_profile='threads-4')
pass
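As a quick sanity check before the full report, each trained model in the factory can be queried directly; this sketch assumes REP's SklearnClassifier exposes sklearn's usual predict_proba (it wraps an sklearn-style estimator) and uses sklearn's roc_auc_score.
In [ ]:
from sklearn.metrics import roc_auc_score

# overall test-set ROC AUC per classifier, computed from predicted probabilities
for name, clf in classifiers.items():
    proba = clf.predict_proba(testX)
    print(name, roc_auc_score(testY, proba[:, 1]))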
In [9]:
from rep.report.metrics import RocAuc
report = classifiers.test_on(testX, testY)
plt.ylim(0.88, 0.94)
report.learning_curve(RocAuc(), steps=1)
Out[9]:
In [10]:
from hep_ml.metrics import BinBasedSDE, KnnBasedCvM
report.learning_curve(BinBasedSDE(uniform_features, uniform_label=1))
Out[10]:
In [11]:
report.learning_curve(KnnBasedCvM(uniform_features, uniform_label=1))
Out[11]:
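The learning curves above track the uniformity metrics over boosting iterations; final values can also be computed directly. A sketch based on hep_ml's metric interface as documented (fit the metric on data first, then call it with labels and predicted probabilities); treat the exact call signature as an assumption.
In [ ]:
# direct evaluation of a uniformity metric on the test set
# (assumed hep_ml API: metric.fit(X, y), then metric(y, proba, sample_weight))
cvm = KnnBasedCvM(uniform_features, uniform_label=1)
cvm.fit(testX, testY)
proba = classifiers['uGB+FL'].predict_proba(testX)
print('CvM for uGB+FL:', cvm(testY, proba, numpy.ones(len(testY))))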
In [12]:
report.roc().plot(new_plot=True, figsize=[10, 9])
In [13]:
report.efficiencies_2d(uniform_features, efficiency=0.5, signal_label=1, n_bins=15,
labels_dict={1: 'signal'})
Out[13]:
The same picture for a global efficiency of 0.7:
In [14]:
report.efficiencies_2d(uniform_features, efficiency=0.7, signal_label=1, n_bins=15,
labels_dict={1: 'signal'})
Out[14]: