In [1]:
%pylab inline
In [2]:
import numpy, pandas
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import root_numpy
from hep_ml import uboost, reports
from hep_ml import HidingClassifier
from hep_ml.commonutils import train_test_split
from hep_ml.meanadaboost import MeanAdaBoostClassifier
from hep_ml.ugradientboosting import uGradientBoostingClassifier as uGB
# profile of IPython cluster used to parallelize computations (None = no cluster)
from hep_ml.config import ipc_profile
import hep_ml.ugradientboosting as ugb
In [3]:
numpy.random.seed(42)
In [4]:
# we want to have predictions flat on the mass
uniform_variables = ['D_MM']
# the variables that will be used in training
train_variables = ['D_P', 'D_BKGCAT', 'D_FD_OWNPV',
                   'D_TAU', 'D_BPVIPCHI2',
                   'D_DIRA_OWNPV',
                   'D_DOCA', 'D_DOCA12', 'D_DOCA13', 'D_DOCA23',
                   'D_vchi2', 'D_MINIP', 'D_PT', 'D_MMERR',
                   'p1_PT', 'p2_PT', 'p3_PT']
helping_variables = ['D_BKGCAT'] + ['p' + str(i) + '_P' + letter for i in [1, 2, 3] for letter in "EXYZ"] + \
                    ['p' + str(i) + '_TRUEP_' + letter for i in [1, 2, 3] for letter in "EXYZ"]
In [5]:
all_branches = uniform_variables + train_variables + helping_variables
all_branches = list(set(all_branches) - set(['D_TAU']))
In [6]:
def load_data(name):
    return pandas.DataFrame(root_numpy.root2array('../hep_ml/datasets/D23P/' + name,
                                                   treename='DecayTree', branches=all_branches))
In [7]:
sig1 = load_data('D2PPP_Signal_Stripped_MD_filtered.root')
sig2 = load_data('D2PPP_Signal_Stripped_MU_filtered.root')
bck1 = load_data('bbbar_bkg_ppp_wide_filtered.root').query('D_BKGCAT > 20')
bck2 = load_data('ccbar_bkg_ppp_wide_filtered.root').query('D_BKGCAT > 20')
Concatenating the data into one dataframe:
In [8]:
data = pandas.concat([bck1, bck2, sig1, sig2], ignore_index=True)
labels = numpy.concatenate([numpy.zeros(len(bck1) + len(bck2)), numpy.ones(len(sig1) + len(sig2))]).astype(int)
In [9]:
hist(data.D_MM[labels == 1].values, bins=40, label='signal')
hist(data.D_MM[labels == 0].values, bins=40, label='bg')
legend()
Out[9]:
In [10]:
# 300 is approximately c in mm/ns, so this gives the proper time: tau = flight distance * mass / momentum / c
data['D_TAU'] = data.eval('D_FD_OWNPV * D_MM / D_P / 300')

# squared invariant mass of a two-particle combination: m^2 = E^2 - px^2 - py^2 - pz^2
def compute_square_inv_mass(df, name1, name2):
    pe = df[name1 + 'E'] + df[name2 + 'E']
    px = df[name1 + 'X'] + df[name2 + 'X']
    py = df[name1 + 'Y'] + df[name2 + 'Y']
    pz = df[name1 + 'Z'] + df[name2 + 'Z']
    return pe * pe - px * px - py * py - pz * pz

# data['m12'] = compute_square_inv_mass(data, 'p1_P', 'p2_P')
# data['m13'] = compute_square_inv_mass(data, 'p1_P', 'p3_P')
# data['m23'] = compute_square_inv_mass(data, 'p2_P', 'p3_P')
# using true information for computing uniformity / plots
data['m12'] = compute_square_inv_mass(data, 'p1_TRUEP_', 'p2_TRUEP_')
data['m13'] = compute_square_inv_mass(data, 'p1_TRUEP_', 'p3_TRUEP_')
data['m23'] = compute_square_inv_mass(data, 'p2_TRUEP_', 'p3_TRUEP_')
# distance of each two-body mass squared from its kinematic maximum (M - m_pi)^2;
# 1.969 GeV is (approximately) the parent mass, 0.1396 GeV the charged-pion mass,
# and the factor 1e-6 converts m_ij^2 from MeV^2 to GeV^2
d1 = (1.969 - 0.1396) ** 2 - data.m23 * 1e-6
d2 = (1.969 - 0.1396) ** 2 - data.m13 * 1e-6
d3 = (1.969 - 0.1396) ** 2 - data.m12 * 1e-6
data['min_dist'] = numpy.minimum(numpy.minimum(d1, d2), d3)
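As a quick cross-check of the kinematics above (a hypothetical sketch, assuming the TRUEP_ branches are filled in MeV and are meaningful for signal only), one can rebuild the three-body invariant mass from the true daughter momenta and compare its peak with the reconstructed D_MM distribution:
In [ ]:
# hypothetical sanity check: three-body invariant mass from TRUE momenta (signal events)
pe = data.p1_TRUEP_E + data.p2_TRUEP_E + data.p3_TRUEP_E
px = data.p1_TRUEP_X + data.p2_TRUEP_X + data.p3_TRUEP_X
py = data.p1_TRUEP_Y + data.p2_TRUEP_Y + data.p3_TRUEP_Y
pz = data.p1_TRUEP_Z + data.p2_TRUEP_Z + data.p3_TRUEP_Z
m3body = numpy.sqrt(numpy.maximum(pe ** 2 - px ** 2 - py ** 2 - pz ** 2, 0.))
hist(m3body[labels == 1].values, bins=50, label='true 3-body mass, signal')
legend()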
In [11]:
mask = numpy.array(data.min_dist > 0)
data = data.ix[mask, :]
labels = labels[mask]
In [12]:
def prepare_classifiers(train_variables, uniform_variables, n_estimators=200, max_depth=4, min_samples_leaf=30, uniform_label=0):
    """
    This function prepares the classifiers with proper parameters;
    the parameters of the base decision tree are the same for all classifiers.
    uniform_variables: list of variables along which uniformity of predictions is desired
    train_variables: list of variables used by classifiers
        (should not include uniform variables)
    uniform_label: 1 if we want flatness in signal predictions,
                   0 if we want flatness in background
    """
    # parameters for gradient boosting
    ugb_params = {'max_depth': max_depth,
                  'min_samples_leaf': min_samples_leaf,
                  'train_variables': train_variables,
                  'subsample': 0.5,
                  'n_estimators': n_estimators}
    # parameters for the other classifiers
    common_params = {'uniform_variables': uniform_variables,
                     'train_variables': train_variables,
                     'n_estimators': n_estimators}
    base_tree = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    classifiers = reports.ClassifiersDict()
    base_ada = AdaBoostClassifier(base_estimator=base_tree, n_estimators=n_estimators, learning_rate=0.2)
    classifiers['ada'] = HidingClassifier(train_variables=train_variables, base_estimator=base_ada)
    classifiers['knnAda'] = MeanAdaBoostClassifier(base_estimator=base_tree, learning_rate=0.1, uniform_label=[0, 1], **common_params)
    classifiers['uGB+nn'] = uGB(loss=ugb.SimpleKnnLossFunction(uniform_variables, knn=10, uniform_label=[0, 1]),
                                learning_rate=0.2, **ugb_params)
    binflatnessloss = ugb.BinFlatnessLossFunction(uniform_variables, uniform_label=uniform_label, ada_coefficient=0.03)
    classifiers['uGB+binFL'] = uGB(loss=binflatnessloss, learning_rate=0.1, **ugb_params)
    knnflatnessloss = ugb.KnnFlatnessLossFunction(uniform_variables, n_neighbours=300,
                                                  uniform_label=uniform_label, ada_coefficient=0.03)
    classifiers['uGB+knnFL'] = uGB(loss=knnflatnessloss, learning_rate=0.1, **ugb_params)
    # classifiers['uBDT'] = uboost.uBoostBDT(base_estimator=base_tree, uniform_label=uniform_label, **common_params)
    classifiers['uBoost'] = uboost.uBoostClassifier(base_estimator=base_tree, uniform_label=uniform_label,
                                                    efficiency_steps=7, **common_params)
    return classifiers
In [13]:
# test mode
# mask = numpy.random.random(len(data)) > 0.9
# data = data.ix[mask, :]
# labels = labels[mask]
In [14]:
trainX, testX, trainY, testY = train_test_split(data, labels, train_size=0.5)
In [15]:
mask = (trainY == 1) | (trainX.D_MM < 1850)
trainX_bck = trainX.ix[mask, :]
trainY_bck = trainY[numpy.array(mask)]
In [16]:
hist(numpy.array(testX.D_MM[testY == 0]), label='test bck', bins=30)
hist(numpy.array(trainX_bck.D_MM[trainY_bck == 0]), label='train bck')
legend()
Out[16]:
In [17]:
classifiers = prepare_classifiers(train_variables, uniform_variables, uniform_label=0)
classifiers.fit(trainX_bck, trainY_bck, ipc_profile=ipc_profile)
preds = classifiers.test_on(testX, testY)
In [18]:
figure(figsize=(18, 7))
subplot(131), title('Learning curves'), preds.learning_curves()
subplot(132), title('SDE curves:sig'), preds.sde_curves(uniform_variables=uniform_variables, step=3, label=1)
subplot(133), title('SDE curves:bg'), preds.sde_curves(uniform_variables=uniform_variables, step=3, label=0)
show()
In [19]:
preds.efficiency(uniform_variables=uniform_variables, label=1)
Out[19]:
In [20]:
preds.prediction_pdf(bins=50)
In [21]:
pr = preds.predictions['uGB+binFL'][testY == 1, 1]
hist(pr, bins=100)
numpy.unique(pr), len(numpy.unique(pr))
Out[21]:
In [22]:
preds.roc()
Out[22]:
In [23]:
preds.efficiency(uniform_variables=uniform_variables, label=0)
Out[23]:
In [24]:
preds.roc()
Out[24]:
In [25]:
# preds.root_roc().SaveAs('datasets/D23P/plots/roc_flat_in_bck.root')
# preds.root_roc()
In [26]:
preds.correlation_curves(var_name=uniform_variables[0], label=0)
Out[26]:
In [27]:
figure(figsize=(20, 7))
subplot(141), preds.sde_curves(uniform_variables=uniform_variables, step=5, label=0), title('SDE')
subplot(142), preds.cvm_curves(uniform_variables=uniform_variables, step=5, label=0), title('Cramer-von Mises')
subplot(143), preds.theil_curves(uniform_variables=uniform_variables, step=5, label=0), title('Theil')
subplot(144), preds.ks_curves(uniform_variables=uniform_variables, step=5, label=0), title('KS')
show()
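The SDE, Cramer-von Mises, Theil and KS curves above are different ways of quantifying how non-flat the selection efficiency is along D_MM. As a rough illustration of the SDE idea only (a minimal numpy sketch under simplifying assumptions, not the hep_ml implementation), for every chosen global efficiency one cuts at the corresponding quantile of the predictions, computes the efficiency in each D_MM bin, and takes the spread of the bin efficiencies around the global value:
In [ ]:
# minimal SDE sketch (illustration only, not the hep_ml implementation)
def sde_sketch(probs, uniform_values, global_efficiencies=(0.5, 0.6, 0.7, 0.8, 0.9), n_bins=10):
    # equal-population bins along the uniform variable
    edges = numpy.percentile(uniform_values, numpy.linspace(0, 100, n_bins + 1)[1:-1])
    bin_ids = numpy.digitize(uniform_values, edges)
    deviations = []
    for eff in global_efficiencies:
        # threshold such that a fraction `eff` of all events passes
        threshold = numpy.percentile(probs, 100 * (1 - eff))
        passed = probs > threshold
        bin_effs = numpy.array([passed[bin_ids == b].mean() for b in range(n_bins)])
        deviations.append(numpy.mean((bin_effs - eff) ** 2))
    return numpy.sqrt(numpy.mean(deviations))

# hypothetical usage for the background predictions of the plain AdaBoost:
# sde_sketch(preds.predictions['ada'][testY == 0, 1], testX.D_MM[testY == 0].values)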
Pay attention: uniform_label=1 means we want the predictions to be flat for signal events.
In [28]:
classifiers2 = prepare_classifiers(train_variables, uniform_variables, uniform_label=1)
classifiers2.fit(trainX, trainY, ipc_profile=ipc_profile)
sig_preds = classifiers2.test_on(testX, testY)
In [29]:
figure(figsize=(18, 7))
subplot(131), title('Learning curves'), sig_preds.learning_curves()
subplot(132), title('SDE curves:sig'), sig_preds.sde_curves(uniform_variables=uniform_variables, step=3, label=1)
subplot(133), title('SDE curves:bg'), sig_preds.sde_curves(uniform_variables=uniform_variables, step=3, label=0)
show()
In [30]:
sig_preds.efficiency(uniform_variables=uniform_variables, label=1)
sig_preds.efficiency(uniform_variables=uniform_variables, label=0)
Out[30]:
In [31]:
# sig_preds.root_roc().SaveAs('datasets/D23P/plots/roc_flat_in_sig_Dmass.root')
sig_preds.roc()
Out[31]:
In [32]:
def plotDistribution2D(var_name1, var_name2, data_frame, bins=30):
    pylab.hist2d(numpy.array(data_frame[var_name1]), numpy.array(data_frame[var_name2]), bins=bins)
    pylab.xlabel(var_name1), pylab.ylabel(var_name2)
    pylab.colorbar()
pylab.figure(figsize=(16, 6))
subplot(1, 2, 1), pylab.title("signal"), plotDistribution2D("m12", "m13", data[labels == 1])
subplot(1, 2, 2), pylab.title("background"), plotDistribution2D("m12", "m13", data[labels == 0])
show()
In [33]:
dalitz_vars = ['m12', 'm13']
In [34]:
dalitz_classifiers = prepare_classifiers(train_variables, uniform_variables=dalitz_vars, uniform_label=1)
dalitz_classifiers.fit(trainX, trainY, ipc_profile=ipc_profile)
dalitz_predictions = dalitz_classifiers.test_on(testX, testY)
In [35]:
# dalitz_predictions.root_roc().SaveAs('datasets/D23P/plots/roc_flat_in_sig_Dalitz.root')
dalitz_predictions.roc()
Out[35]:
In [36]:
dalitz_predictions.efficiency(uniform_variables=dalitz_vars, n_bins=15, label=1)
Out[36]:
In [37]:
dalitz_predictions.prediction_pdf()
In [38]:
figure(figsize=[15, 7])
dalitz_predictions.sde_curves(dalitz_vars, step=10)
figure(figsize=[15, 7])
dalitz_predictions.learning_curves(step=10)
Out[38]:
In [39]:
dalitz_predictions.hist(['min_dist'])
Out[39]:
In [40]:
dalitz_predictions.efficiency(['min_dist'], label=1)
Out[40]:
In [41]:
figure(figsize=[13, 7])
dalitz_predictions.sde_curves(['min_dist'], label=1)
In [42]:
fl_clf = reports.ClassifiersDict()
fl_clf['AdaBoost'] = HidingClassifier(train_variables,
    AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=4, min_samples_leaf=30), n_estimators=200, learning_rate=0.2))
for alpha in [0.01, 0.02, 0.05, 0.1, 0.2]:
    flatnessloss = ugb.KnnFlatnessLossFunction(dalitz_vars, uniform_label=1, ada_coefficient=alpha)
    fl_clf['uGB+FL, alpha=' + str(alpha)] = uGB(loss=flatnessloss, train_variables=train_variables, subsample=0.5,
                                                learning_rate=0.1, n_estimators=200, max_depth=4, min_samples_leaf=30)
fl_clf.fit(trainX, trainY, ipc_profile=ipc_profile)
fl_pred = fl_clf.test_on(testX, testY)
In [43]:
mpl.rcParams['lines.linewidth'] = 2.
In [44]:
fl_pred.learning_curves(), ylim(0.85, 1.)
legend().set_visible(False)
show()
sde_data = pandas.DataFrame(fl_pred.sde_curves(dalitz_vars, label=1, return_data=True))
legend().set_visible(False)
# sde_data.to_csv('datasets/D23P/plots/sde_for_differend_alpha.csv')
In [45]:
# fl_pred.root_roc().SaveAs('datasets/D23P/plots/roc_for_different_alpha.root')
fl_pred.roc()
Out[45]:
In [46]:
# stop execution here: the remaining cells export predictions and build the paper plots
assert False, 'stop execution here'
In [ ]:
def save_predictions(X, y, classifiers, filename):
    X2 = X.copy()
    X2['is_signal'] = y
    for name, clf in classifiers.iteritems():
        X2['pred_' + name] = clf.predict_proba(X)[:, 1]
    root_numpy.array2root(X2.to_records(), mode='recreate', filename=filename)
In [ ]:
save_predictions(testX, testY, dalitz_classifiers, 'datasets/D23P/dalitz_test_prediction.root')
In [ ]:
save_predictions(testX, testY, classifiers, 'datasets/D23P/flat_in_DM_on_bck_prediction.root')
In [ ]:
save_predictions(testX, testY, fl_clf, 'datasets/D23P/different_alpha_prediction.root')
In [ ]:
# the same setup as in prepare_classifiers, unrolled at the top level
n_estimators = 200
max_depth = 4
min_samples_leaf = 30
uniform_label = 1
# parameters for gradient boosting
ugb_params = {'max_depth': max_depth,
              'min_samples_leaf': min_samples_leaf,
              'train_variables': train_variables,
              'subsample': 0.5,
              'n_estimators': n_estimators}
# parameters for the other classifiers
common_params = {'uniform_variables': uniform_variables,
                 'train_variables': train_variables,
                 'n_estimators': n_estimators}
base_tree = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
classifiers = reports.ClassifiersDict()
base_ada = AdaBoostClassifier(base_estimator=base_tree, n_estimators=n_estimators, learning_rate=0.2)
classifiers['ada'] = HidingClassifier(train_variables=train_variables, base_estimator=base_ada)
classifiers['knnAda'] = MeanAdaBoostClassifier(base_estimator=base_tree, learning_rate=0.1, uniform_label=[0, 1], **common_params)
classifiers['uGB+nn'] = uGB(loss=ugb.SimpleKnnLossFunction(uniform_variables, knn=10, uniform_label=[0, 1]),
                            learning_rate=0.2, **ugb_params)
binflatnessloss = ugb.BinFlatnessLossFunction(uniform_variables, uniform_label=uniform_label, ada_coefficient=0.03)
classifiers['uGB+binFL'] = uGB(loss=binflatnessloss, learning_rate=0.1, **ugb_params)
knnflatnessloss = ugb.KnnFlatnessLossFunction(uniform_variables, n_neighbours=300,
                                              uniform_label=uniform_label, ada_coefficient=0.03)
classifiers['uGB+knnFL'] = uGB(loss=knnflatnessloss, learning_rate=0.1, **ugb_params)
# classifiers['uBDT'] = uboost.uBoostBDT(base_estimator=base_tree, uniform_label=uniform_label, **common_params)
classifiers['uBoost'] = uboost.uBoostClassifier(base_estimator=base_tree, uniform_label=uniform_label,
                                                efficiency_steps=7, **common_params)
classifiers
In [ ]:
classifiers_plot = prepare_classifiers(train_variables, uniform_variables, uniform_label=1)
# classifiers_plot.pop('uBoost')
classifiers_plot.fit(trainX, trainY, ipc_profile=ipc_profile)
sig_preds = classifiers_plot.test_on(testX, testY)
In [ ]:
figure(figsize=(18, 7))
subplot(131), title('Learning curves'), sig_preds.learning_curves()
subplot(132), title('SDE curves:sig'), sig_preds.sde_curves(uniform_variables=uniform_variables, step=3, label=1)
subplot(133), title('SDE curves:bg'), sig_preds.sde_curves(uniform_variables=uniform_variables, step=3, label=0)
show()
In [ ]:
sig_preds.efficiency(uniform_variables=uniform_variables, label=1)
sig_preds.efficiency(uniform_variables=uniform_variables, label=0)
In [ ]:
hist(trainX.D_MM.values[trainY == 0], normed=True)
hist(trainX.D_MM.values[trainY == 1], normed=True)
pass
In [47]:
import os, sys
import ROOT
from ROOT import TGraph, TCanvas, TLegend
from array import array
ROOT.gROOT.LoadMacro('../assets/paperdraft/graphs/lhcbStyle.h')
ROOT.lhcbStyle()
In [48]:
from rep_tmva import rootnotes
marker = 20
size = 1
colour = 1
In [49]:
dalitz_vars = ['m12', 'm13']
sde_dalitz_data = dalitz_predictions.sde_curves(dalitz_vars, step=10, return_data=True)
In [ ]:
from collections import OrderedDict
sde_dalitz_renamed = OrderedDict()
sde_dalitz_renamed['AdaBoost'] = sde_dalitz_data['ada']
sde_dalitz_renamed['kNNAda'] = sde_dalitz_data['knnAda']
sde_dalitz_renamed['uGBkNN'] = sde_dalitz_data['uGB+nn']
sde_dalitz_renamed['uGBFL(bin)'] = sde_dalitz_data['uGB+binFL']
sde_dalitz_renamed['uGBFL(knn)'] = sde_dalitz_data['uGB+knnFL']
sde_dalitz_renamed['uBoost'] = sde_dalitz_data['uBoost']
In [ ]:
keeper = []
In [ ]:
def plot_sde_root(df):
    canvas = rootnotes.canvas("canv%i" % len(keeper), (1500, 800))
    canvas.Divide(1)
    canvas.cd(1)
    legend = TLegend(0.175, 0.17, 0.525, 0.45)
    legend.SetFillStyle(1001)
    legend.SetFillColor(ROOT.kWhite)
    legend.SetMargin(0.35)
    legend.SetTextSize(0.04)
    keeper.append(legend)
    for i, (color_id, (name, graph)) in enumerate(zip([0, 6, 2, 1, 1, 3], df.iteritems())):
        plot = TGraph(len(graph), array('f', graph.keys()), array('f', graph.values))
        plot.SetLineColor(colour + color_id)
        plot.SetMarkerColor(colour + color_id)
        plot.SetMarkerSize(2)
        plot.SetMarkerStyle(marker + i)
        plot.GetXaxis().SetTitle('n trees')
        plot.GetXaxis().SetNdivisions(510)
        plot.GetYaxis().SetTitle('SDE')
        plot.GetXaxis().SetTitleSize(0.1)
        plot.GetXaxis().SetTitleOffset(0.65)
        plot.GetXaxis().SetLabelSize(0.075)
        plot.GetXaxis().SetLabelOffset(0.015)
        plot.GetYaxis().SetTitleSize(0.1)
        plot.GetYaxis().SetTitleOffset(0.65)
        plot.GetYaxis().SetLabelSize(0.075)
        plot.GetYaxis().SetRangeUser(0.0, 0.11)
        plot.GetXaxis().SetRangeUser(0.0, 200.0)
        plot.SetTitle(name)
        legend.AddEntry(plot, name, 'pl')
        if i == 0:
            plot.Draw('APL')
        else:
            plot.Draw('PL')
        keeper.append(plot)
    legend.Draw()
    keeper.append(canvas)
    return canvas
In [ ]:
canvas = plot_sde_root(sde_dalitz_renamed)
In [ ]:
canvas
In [ ]:
# canvas.SaveAs('../assets/paperdraft/graphs/sde_for_dalitz_staged.png')
# canvas.SaveAs('../assets/paperdraft/graphs/sde_for_dalitz_staged.pdf')
# canvas.SaveAs('../assets/paperdraft/graphs/sde_for_dalitz_staged.root')
In [53]:
pandas.DataFrame(sde_dalitz_data).ix[199, :]
Out[53]:
In [ ]: