This notebook is devoted to uniformGradientBoosting, i.e. gradient boosting over decision trees with a custom loss function:
$$ L_\text{general} = \sum_i \exp \left[- \sum_j a_{ij}\, \textrm{score}_j\, y_j \right] $$
Here $y_j \in \{+1, -1\}$ and $\textrm{score}_j \in \mathbb{R}$ are the true class and the predicted score of the $j$-th event in the train set.
If we take $A = (a_{ij})$ to be the identity matrix, this is simply AdaLoss: $$ L_\text{ada} = \sum_i \exp \left[- \textrm{score}_i\, y_i \right] $$
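To make the formula concrete, here is a minimal numpy sketch (not the hep_ml implementation) that evaluates $L_\text{general}$ for a given matrix $A$ and checks that the identity matrix reproduces AdaLoss:

import numpy

def general_exp_loss(A, score, y):
    # L = sum_i exp(-sum_j a_ij * score_j * y_j), with labels y_j in {+1, -1}
    margins = A.dot(score * y)
    return numpy.exp(-margins).sum()

score = numpy.array([0.3, -1.2, 0.8])
y = numpy.array([1, -1, 1])
assert numpy.isclose(general_exp_loss(numpy.eye(3), score, y),
                     numpy.exp(-score * y).sum())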
SimpleKnnLossFunction(knn) is a particular case where, in each row, we put ones at the knn closest events of the same class and zeros everywhere else. The matrix is square; with knn=1 it reduces to AdaLoss.
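A minimal sketch of how such a square matrix could be built with sklearn's NearestNeighbors (the helper name simple_knn_matrix is illustrative, not the library's internals). Neighbours are searched among events of the same class only; with knn=1 each event is its own nearest neighbour, so A becomes the identity:

import numpy
from sklearn.neighbors import NearestNeighbors

def simple_knn_matrix(uniform_features, labels, knn=10):
    # uniform_features: numpy array of the uniform variables, one row per event
    n = len(labels)
    A = numpy.zeros((n, n))
    for cls in numpy.unique(labels):
        idx = numpy.where(labels == cls)[0]
        nn = NearestNeighbors(n_neighbors=knn).fit(uniform_features[idx])
        _, neigh = nn.kneighbors(uniform_features[idx])
        for row, cols in zip(idx, idx[neigh]):
            A[row, cols] = 1   # ones at the knn closest same-class events
    return A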
PairwiseKnnLossFunction(knn): we take the knn neighbours of each event and create a separate row in the matrix for each (event, neighbour) pair, placing ones in the two columns corresponding to the events of the pair (so each row contains only two 1's). This loss gives poor uniformity and doesn't seem to have any advantages. With knn=1 it is again equivalent to AdaLoss.
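A sketch of the pairwise construction under the same assumptions (one row per (event, neighbour) pair; with knn=1 the only neighbour is the event itself, so the matrix reduces to the identity):

import numpy
from sklearn.neighbors import NearestNeighbors

def pairwise_knn_matrix(uniform_features, knn=5):
    nn = NearestNeighbors(n_neighbors=knn).fit(uniform_features)
    _, neigh = nn.kneighbors(uniform_features)
    n = len(uniform_features)
    rows = []
    for i in range(n):
        for j in neigh[i]:
            row = numpy.zeros(n)
            row[i] = row[j] = 1   # the two columns of the pair
            rows.append(row)
    return numpy.vstack(rows)     # n * knn rows in total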
RandomKnnLossFunction(nrows, knn, knnfactor=3): the resulting A matrix will have nrows rows, each generated as follows: we take a random event from the train dataset, randomly pick knn out of its knn * knnfactor closest neighbours, and place ones in the corresponding columns. Each row therefore contains exactly knn 1's.
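A sketch of this row generation (function and argument names are illustrative, not the library API):

import numpy
from sklearn.neighbors import NearestNeighbors

def random_knn_rows(uniform_features, n_rows, knn=10, knnfactor=3, seed=None):
    rng = numpy.random.RandomState(seed)
    nn = NearestNeighbors(n_neighbors=knn * knnfactor).fit(uniform_features)
    _, neigh = nn.kneighbors(uniform_features)
    n = len(uniform_features)
    A = numpy.zeros((n_rows, n))
    for row in range(n_rows):
        event = rng.randint(n)                                    # random event from the train set
        cols = rng.choice(neigh[event], size=knn, replace=False)  # knn out of knn * knnfactor neighbours
        A[row, cols] = 1                                          # exactly knn ones per row
    return A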
The classifier should give good classification quality (measured by ROC), and at the same time uniformity over the Dalitz plot is desired. The hard part is to achieve uniformity without losing quality in the corners of the Dalitz plot.
In [1]:
import pandas, numpy
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from IPython.display import display_html
from hep_ml import ugradientboosting as ugb
from hep_ml import commonutils, reports, ClassifiersDict, HidingClassifier
from hep_ml.uboost import uBoostBDT, uBoostClassifier
from hep_ml.config import ipc_profile
In [2]:
from hep_ml.experiments.losses import PairwiseKnnLossFunction, RandomKnnLossFunction, DistanceBasedKnnFunction
from hep_ml.experiments.losses import ExperimentalSimpleKnnLossFunction
In [3]:
used_columns = ["Y1", "Y2", "Y3", "M2AB", "M2AC"]
folder = '../hep_ml/datasets/dalitzplot/'
signalDF = pandas.read_csv(folder + 'signal.csv', sep='\t', usecols=used_columns)
signal5DF = pandas.read_csv(folder + 'signal5e5.csv', sep='\t', usecols=used_columns)
bgDF = pandas.read_csv(folder + 'bkgd.csv', sep='\t', usecols=used_columns)
In [4]:
def plot_distribution(data_frame, var_name1='M2AB', var_name2='M2AC', bins=40):
    """Plot a 2D distribution histogram of two variables."""
    pylab.hist2d(data_frame[var_name1], data_frame[var_name2], bins=bins, cmap=cm.Blues)
    pylab.xlabel(var_name1)
    pylab.ylabel(var_name2)
    pylab.colorbar()

pylab.figure(figsize=(18, 6))
subplot(1, 3, 1), pylab.title("signal"), plot_distribution(signalDF)
subplot(1, 3, 2), pylab.title("background"), plot_distribution(bgDF)
subplot(1, 3, 3), pylab.title("dense signal"), plot_distribution(signal5DF)
pass
In [5]:
def test_classifiers(classifiers, roc_stages=None, sde_stages=None, parallelize=True):
    """Fit all classifiers, then plot learning curves, staged SDE and, optionally, ROC and efficiency plots."""
    used_ipc = ipc_profile if parallelize else None
    test_preds = classifiers.fit(trainX, trainY, ipc_profile=used_ipc).test_on(testX, testY)
    pylab.figure(figsize=(17, 7))
    pylab.subplot(121), pylab.title('Learning curves'), test_preds.learning_curves()
    pylab.subplot(122), pylab.title('Staged SDE'), test_preds.sde_curves(uniform_variables)
    show()
    if roc_stages is not None:
        test_preds.roc(stages=roc_stages).show()
    if sde_stages is not None:
        classifiers.test_on(signal5DF, numpy.ones(len(signal5DF))) \
            .efficiency(uniform_variables, stages=sde_stages, target_efficiencies=[0.7])
In [6]:
data = pandas.concat([signalDF, bgDF])
labels = numpy.array([1] * len(signalDF) + [0] * len(bgDF))
trainX, testX, trainY, testY = commonutils.train_test_split(data, labels, train_size=0.5)
base_tree = DecisionTreeClassifier(max_depth=4)
uniform_variables = ["M2AB", "M2AC"]
train_variables = ["Y1", "Y2", "Y3"]
In [7]:
params = {
    'max_depth': 5,
    'n_estimators': 300,
    'subsample': 0.7,
    'update_tree': True,
    'learning_rate': 0.1,
    'train_variables': train_variables,
}
In [8]:
sknn_classifiers = ClassifiersDict()
for knn in [1, 5, 10, 20, 30, 60]:
    knnloss = ugb.SimpleKnnLossFunction(uniform_variables, knn=knn)
    sknn_classifiers["sknn=%i" % knn] = ugb.uGradientBoostingClassifier(loss=knnloss, **params)
test_classifiers(sknn_classifiers)
In [9]:
sknn2_classifiers = ClassifiersDict()
for diagonal in [0, 0.5, 1, 2]:
    knnloss = ExperimentalSimpleKnnLossFunction(uniform_variables, knn=25, diagonal=diagonal)
    sknn2_classifiers["diag=%.1f" % diagonal] = ugb.uGradientBoostingClassifier(loss=knnloss, **params)
test_classifiers(sknn2_classifiers)
In [10]:
pw_classifiers = ClassifiersDict()
for knn in [5, 15, 30]:
    pw_loss = PairwiseKnnLossFunction(uniform_variables, knn=knn)
    pw_classifiers["pw_knn=%i" % knn] = ugb.uGradientBoostingClassifier(loss=pw_loss, **params)
test_classifiers(pw_classifiers)
In [11]:
rknn_classifiers = ClassifiersDict()
for knn in [1, 5, 10]:
    rknn_loss = RandomKnnLossFunction(uniform_variables, knn=knn, n_rows=len(trainX) * 3, large_preds_penalty=0.)
    rknn_classifiers["rknn=%i" % knn] = ugb.uGradientBoostingClassifier(loss=rknn_loss, **params)
test_classifiers(rknn_classifiers)
In [12]:
rknn2_classifiers = ClassifiersDict()
for factor in [0.5, 1, 2, 4, 8]:
    n_rows = int(factor * len(trainX))
    rknn2_loss = RandomKnnLossFunction(uniform_variables, knn=20, n_rows=n_rows)
    rknn2_classifiers["rknn2=%1.1f" % factor] = ugb.uGradientBoostingClassifier(loss=rknn2_loss, **params)
test_classifiers(rknn2_classifiers)
In [13]:
ss_classifiers = ClassifiersDict()
for subsample in [0.5, 0.7, 0.8, 1.]:
    knnloss = ugb.SimpleKnnLossFunction(uniform_variables, knn=10)
    new_params = params.copy()
    new_params['subsample'] = subsample
    ss_classifiers["subsample=%1.2f" % subsample] = ugb.uGradientBoostingClassifier(loss=knnloss, **new_params)
test_classifiers(ss_classifiers)
In [14]:
sknn3_classifiers = ClassifiersDict()
for update in [True, False]:
    knnloss = ugb.SimpleKnnLossFunction(uniform_variables, knn=15)
    new_params = params.copy()
    new_params['update_tree'] = update  # override the common setting so the two classifiers actually differ
    sknn3_classifiers["update=%s" % str(update)] = \
        ugb.uGradientBoostingClassifier(loss=knnloss, **new_params)
test_classifiers(sknn3_classifiers)
In [15]:
sknn4_classifiers = ClassifiersDict()
for label in [0, 1, [0, 1]]:
    knnloss = ugb.SimpleKnnLossFunction(uniform_variables, knn=10, uniform_label=label)
    sknn4_classifiers["label=%s" % str(label)] = \
        ugb.uGradientBoostingClassifier(loss=knnloss, **params)
test_classifiers(sknn4_classifiers)
In [16]:
full_used_columns = ["M2AB", "M2AC", "Y1", "Y2", "Y3", "Y4", "XA", "XB", "XC"]
full_signalDF = pandas.read_csv(folder + 'signal.csv', sep='\t', usecols=full_used_columns)
full_signal5e5DF = pandas.read_csv(folder + 'signal5e5.csv', sep='\t', usecols=full_used_columns)
full_bgDF = pandas.read_csv(folder + 'bkgd.csv', sep='\t', usecols=full_used_columns)
full_data = pandas.concat([full_signalDF, full_bgDF])
full_labels = numpy.array([1] * len(full_signalDF) + [0] * len(full_bgDF))
full_trainX, full_testX, full_trainY, full_testY = commonutils.train_test_split(full_data, full_labels, train_size=0.5)
In [17]:
base_estimator = DecisionTreeClassifier(max_depth=4)
uniform_variables = ["M2AB", "M2AC"]
n_estimators = 101
full_train_variables = ["Y1", "Y2", "Y3", "Y4", "XA", "XB", "XC"]
base_ada = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=0.1)
full_ada_classifiers = ClassifiersDict()
for n_features in [3, 4, 5, 6]:
    full_ada_classifiers['Ada_Feat=%i' % n_features] = \
        HidingClassifier(train_variables=full_train_variables[:n_features], base_estimator=base_ada)
full_preds = full_ada_classifiers.fit(full_trainX, full_trainY, ipc_profile=ipc_profile).test_on(full_testX, full_testY)
figure(figsize=(17, 7))
subplot(121), full_preds.learning_curves()
subplot(122), full_preds.sde_curves(uniform_variables)
The square matrix $A$ is constructed as $$a_{ij} = \begin{cases} 0, & \text{if class}_i \neq \text{class}_j \\ 0, & \text{if the $j$-th event is not among the knn of the $i$-th event} \\ f(r), & \text{otherwise, where $r$ is distance(i, j)} \end{cases}$$
where $f(r)$ is a function we are free to choose.
The results of these experiments are unstable and not reliable: after rebuilding (or reshuffling the datasets) the results may change significantly.
More experiments are needed here (or, better, a good theoretical argument for why some particular function should be preferred); there are too many things to play with.
NB: clip(x, a, b) is a function that restricts x to the range [a, b] (used here to avoid singularities from log, 1/r and so on).
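As an illustration of the definition above, a minimal sketch of such a matrix; the helper name distance_based_matrix is hypothetical, though the row_normalize switch mirrors the argument used in the cells below:

import numpy
from sklearn.neighbors import NearestNeighbors

def distance_based_matrix(uniform_features, labels, f, knn=50, row_normalize=False):
    n = len(labels)
    A = numpy.zeros((n, n))
    for cls in numpy.unique(labels):
        idx = numpy.where(labels == cls)[0]
        nn = NearestNeighbors(n_neighbors=knn).fit(uniform_features[idx])
        dist, neigh = nn.kneighbors(uniform_features[idx])
        for row, cols, r in zip(idx, idx[neigh], dist):
            A[row, cols] = f(r)   # a_ij = f(distance(i, j)) for same-class knn, zero elsewhere
    if row_normalize:
        A /= A.sum(axis=1, keepdims=True)   # each row's weights sum to one
    return A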
In [18]:
functions = {'exp': lambda r: numpy.exp(-50 * r),
             'exp2': lambda r: numpy.exp(-1000 * r * r),
             'exp3': lambda r: numpy.exp(-2000 * r * r),
             'log': lambda r: numpy.clip(-numpy.log(r), 0, 7),
             '1/r': lambda r: numpy.clip(1 / r, 0, 100),
             '1/sqrt(r)': lambda r: numpy.clip(r ** -0.5, 0, 10)
             }
In [19]:
dist_classifiers = ClassifiersDict()
for name, func in functions.iteritems():
    loss = DistanceBasedKnnFunction(uniform_variables, knn=150, distance_dependence=func, row_normalize=False)
    dist_classifiers[name] = ugb.uGradientBoostingClassifier(loss=loss, **params)
test_classifiers(dist_classifiers, parallelize=False)
In [20]:
dist2_classifiers = ClassifiersDict()
for name, func in functions.iteritems():
    loss = DistanceBasedKnnFunction(uniform_variables, knn=50, distance_dependence=func, row_normalize=True)
    dist2_classifiers[name + "+norm"] = ugb.uGradientBoostingClassifier(loss=loss, **params)
test_classifiers(dist2_classifiers, parallelize=False)
In [21]:
dist4_classifiers = ClassifiersDict()
for knn in [1, 10, 20, 30]:
    loss = DistanceBasedKnnFunction(uniform_variables, knn=knn, distance_dependence=lambda r: (r + 1e5) ** 0, row_normalize=True)
    dist4_classifiers['knn=%i' % knn] = ugb.uGradientBoostingClassifier(loss=loss, **params)
test_classifiers(dist4_classifiers, parallelize=False)