This notebook compares several classifiers. The goal is to obtain a flat signal efficiency (without significantly losing classification quality).
The classifiers compared are uBoost, AdaBoost over decision trees (from sklearn), and uniform gradient boosting (uGB) with different loss functions: a kNN-based AdaBoost loss and a bin-based flatness loss. This dataset was used for demonstration purposes in the uBoost paper.
We have plenty of data here, so the results are very stable.
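To make "flat efficiency" concrete, here is a minimal sketch (not part of the original notebook; the helper name binned_signal_efficiency and the threshold value are illustrative) that computes the signal efficiency in bins of a Dalitz variable at a fixed global cut. A uniform classifier should give roughly the same value in every bin.

import numpy
import pandas

def binned_signal_efficiency(signal_df, signal_proba, threshold, var='M2AB', n_bins=10):
    """Fraction of signal events with predicted probability above `threshold`, in bins of `var`.
    The flatter this profile across bins, the more uniform the selection efficiency."""
    passed = pandas.Series(numpy.asarray(signal_proba) > threshold, index=signal_df.index)
    bin_index = pandas.cut(signal_df[var], bins=n_bins)
    return passed.groupby(bin_index).mean()

# Usage (hypothetical classifier `clf`):
# eff = binned_signal_efficiency(signalDF, clf.predict_proba(signalDF)[:, 1], threshold=0.5)
# print(eff.std())  # smaller spread across bins -> flatter efficiency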
In [1]:
import pandas, numpy
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from hep_ml.commonutils import train_test_split
from hep_ml import uboost, ugradientboosting as ugb
from hep_ml import ClassifiersDict, HidingClassifier
# ipc_profile is used for parallel computations on an IPython cluster.
# If you don't have a cluster, this will be None and computations will be done on the same machine.
from hep_ml.config import ipc_profile
In [2]:
used_columns = ["Y1", "Y2", "Y3", "M2AB", "M2AC"]
folder = '../hep_ml/datasets/dalitzplot/'
signalDF = pandas.read_csv(folder + 'signal.csv', sep='\t', usecols=used_columns)
# a denser signal sample, used later to check the flatness of the signal efficiency
signal5DF = pandas.read_csv(folder + 'signal5e5.csv', sep='\t', usecols=used_columns)
bgDF = pandas.read_csv(folder + 'bkgd.csv', sep='\t', usecols=used_columns)
In [3]:
def plot_distribution(data_frame, var_name1='M2AB', var_name2='M2AC', bins=40):
    """Plot a 2D distribution histogram of two variables."""
    pylab.hist2d(data_frame[var_name1], data_frame[var_name2], bins=bins, cmap=cm.Blues)
    pylab.xlabel(var_name1)
    pylab.ylabel(var_name2)
    pylab.colorbar()

pylab.figure(figsize=(18, 6))
subplot(1, 3, 1), pylab.title("signal"), plot_distribution(signalDF)
subplot(1, 3, 2), pylab.title("background"), plot_distribution(bgDF)
subplot(1, 3, 3), pylab.title("dense signal"), plot_distribution(signal5DF)
pass
In [4]:
data = pandas.concat([signalDF, bgDF])
labels = numpy.array([1] * len(signalDF) + [0] * len(bgDF))
trainX, testX, trainY, testY = train_test_split(data, labels)
base_estimator = DecisionTreeClassifier(max_depth=4)
# variables in which the signal efficiency should be uniform (flat)
uniform_variables = ["M2AB", "M2AC"]
# variables used as training features by the classifiers
train_variables = ["Y1", "Y2", "Y3"]
In [5]:
n_estimators = 150 + 1
classifiers = ClassifiersDict()
# plain AdaBoost over decision trees (no uniformity), trained on the same variables
base_ada = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=0.1)
classifiers['AdaBoost'] = HidingClassifier(train_variables=train_variables, base_estimator=base_ada)
# uniform gradient boosting with a kNN-based AdaBoost loss
knnloss = ugb.SimpleKnnLossFunction(uniform_variables, knn=7)
classifiers['uGB+knnAda'] = ugb.uGradientBoostingClassifier(loss=knnloss, max_depth=4, n_estimators=n_estimators,
                                                            learning_rate=0.4, train_variables=train_variables)
classifiers['uBoost'] = uboost.uBoostClassifier(uniform_variables=uniform_variables,
                                                base_estimator=base_estimator,
                                                n_estimators=n_estimators,
                                                train_variables=train_variables,
                                                efficiency_steps=12)
# uniform gradient boosting with a bin-based flatness loss (FL)
flatnessloss = ugb.BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.35, power=1.3)
classifiers['uGB+FL'] = ugb.uGradientBoostingClassifier(loss=flatnessloss, max_depth=4, n_estimators=n_estimators,
                                                        learning_rate=0.5, train_variables=train_variables)
classifiers.fit(trainX, trainY, ipc_profile=ipc_profile)
pass
In [6]:
pred = classifiers.test_on(testX, testY, low_memory=True)
ylim(0.88, 0.94)
pred.learning_curves()
Out[6]:
In [7]:
pred.sde_curves(uniform_variables)
We can also look at the ROC curves at different stages of boosting.
In [8]:
pred.roc(stages=[n_estimators//2, n_estimators - 1])
Out[8]:
In [9]:
# evaluate on the dense signal-only sample (all labels equal 1)
pred5e5 = classifiers.test_on(signal5DF, numpy.ones(len(signal5DF)))
pred5e5.sde_curves(uniform_variables, step=10)
In [10]:
pred5e5.efficiency(uniform_variables, target_efficiencies=[0.6, 0.7, 0.8, 0.9], n_bins=30)
Out[10]:
In [11]:
pred5e5.efficiency(uniform_variables, target_efficiencies=[0.7], stages=[50, 100], n_bins=30)
Out[11]: