Training a BDT to determine whether a track comes from the same side (SS) or the opposite side (OS).
Labels:
From tests we conclude that SS and NAN tracks should have an inverted track sign for $K_s$ and $K^*$ decays. Thus we train OS vs SS, NAN.
In [1]:
%pylab inline
In [2]:
import sys
sys.path.insert(0, "../")
In [3]:
import pandas
import root_numpy
from folding_group import FoldingGroupClassifier
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
In [4]:
# Read the tracks ROOT file and wrap the resulting array in a DataFrame.
tracks_path = '../datasets/MC/csv/WG/Bu_JPsiK/2012/Tracks.root'
data = pandas.DataFrame(root_numpy.root2array(tracks_path))
In [5]:
from utils import data_tracks_preprocessing
# Replace the raw frame with the output of the project's preprocessing helper.
# NOTE(review): this rebinds `data` to a function of itself, so re-running only
# this cell would preprocess already-preprocessed data; re-run the loading cell first.
# Presumably this is where columns such as OS_SS and group_column are prepared —
# confirm in utils.data_tracks_preprocessing.
data = data_tracks_preprocessing(data)
In [6]:
# Fraction of rows in each OS_SS category (-1: OS, 0: NAN, 1: SS).
n_rows = len(data)
for group in (-1, 0, 1):
    fraction = 1. * numpy.sum(data.OS_SS.values == group) / n_rows
    print('%s %s' % (group, fraction))
In [7]:
# Number of rows in the preprocessed frame (all OS_SS categories together).
len(data)
Out[7]:
In [8]:
# Columns of `data` that are plotted per OS_SS category below and later passed
# to the folding classifier as `train_features`.
features = ['cos_diff_phi', 'diff_pt', 'partPt', 'partP', 'nnkrec', 'diff_eta', 'EOverP',
'ptB', 'sum_PID_mu_k', 'proj', 'PIDNNe', 'sum_PID_k_e', 'PIDNNk', 'sum_PID_mu_e', 'PIDNNm',
'phi', 'IP', 'IPerr', 'IPs', 'veloch', 'max_PID_k_e', 'ghostProb',
'IPPU', 'eta', 'max_PID_mu_e', 'max_PID_mu_k', 'partlcs']
In [9]:
# Overlay the distribution of every feature for the three OS_SS categories.
hist_options = dict(bins=100, alpha=0.4, normed=True)
categories = [(-1, 'OS'), (0, 'NAN'), (1, 'SS')]

figure(figsize=(20, 35))
for position, feature in enumerate(features, 1):
    subplot(10, 4, position)
    # The histogram range is taken from the OS sample so that all three
    # categories are drawn over the same interval.
    os_values = data.loc[data.OS_SS == -1, feature].values
    value_range = (numpy.min(os_values), numpy.max(os_values))
    for code, name in categories:
        hist(data.loc[data.OS_SS == code, feature].values, label=name, range=value_range, **hist_options)
    title(feature)
    legend()
In [10]:
# Keep only the rows labelled OS (-1) or SS (+1). The explicit .copy() makes
# data_os_ss an independent frame, so adding the 'weight' column below does not
# write into a slice of `data` (which triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous which frame receives the column).
data_os_ss = data[data.OS_SS != 0].copy()

os_ss_values = data_os_ss.OS_SS.values
is_ss = os_ss_values >= 0
n_os = numpy.sum(os_ss_values < 0)
n_ss = numpy.sum(is_ss)

# OS rows keep weight 1; SS rows are scaled by n_OS / n_SS so that both classes
# carry the same total weight.
weight = numpy.ones(len(data_os_ss))
weight[is_ss] *= 1. * n_os / n_ss
data_os_ss['weight'] = weight
In [20]:
# Number of rows left after dropping the OS_SS == 0 (NAN) category.
len(data_os_ss)
Out[20]:
In [24]:
from hep_ml.losses import LogLossFunction
In [30]:
loss = LogLossFunction(regularization=100)
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=10000, depth=6, loss=loss,
max_features=15, n_threads=12)
tt_folding = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=432,
train_features=features, group_feature='group_column')
%time tt_folding.fit(data_os_ss, data_os_ss.OS_SS >= 0)
pass
In [31]:
import cPickle
# Persist the fitted folding classifier. The file is opened in binary mode
# ('wb'): pickle data is a byte stream, and text mode can corrupt it on
# platforms that translate line endings.
# NOTE(review): this dump happens before the per-fold estimator lists are
# truncated to 7000 entries further down, so the saved model is the full
# 10000-estimator one — confirm that is the model meant to be paired with the
# calibrator saved later.
with open('../models/dt_ss_os_only.pkl', 'wb') as f:
    cPickle.dump(tt_folding, f)
In [32]:
# Per-row score: column 1 of predict_proba evaluated on the training frame.
# NOTE(review): presumably this is P(OS_SS >= 0), i.e. the class passed as True
# to fit() — confirm the column order of FoldingGroupClassifier.predict_proba.
prob = tt_folding.predict_proba(data_os_ss)[:, 1]
In [33]:
from sklearn.metrics import roc_auc_score
# Weighted ROC AUC of the scores against the OS_SS >= 0 label, using the
# class-balancing weights stored in data_os_ss.weight.
roc_auc_score(data_os_ss.OS_SS >= 0, prob, sample_weight=data_os_ss.weight)
Out[33]:
In [34]:
from rep.report.metrics import RocAuc
# Build a report on the OS/SS frame and draw its learning curve for the ROC AUC
# metric (presumably the metric versus the number of boosting iterations —
# confirm in the REP report documentation).
tt_folding.test_on(data_os_ss, data_os_ss.OS_SS >= 0).learning_curve(RocAuc())
Out[34]:
In [43]:
# Keep only the first 7000 entries of each fold model's `clf.estimators` list
# (the folding classifier was built with n_folds=2, so this touches both folds).
# NOTE(review): the cut-off was presumably read off the learning curve above —
# confirm. The model pickle written earlier is not updated by this truncation.
N_ESTIMATORS_KEPT = 7000
for fold_model in tt_folding.estimators:
    fold_model.clf.estimators = fold_model.clf.estimators[:N_ESTIMATORS_KEPT]
In [46]:
# Recompute the scores now that each fold model has been truncated; this
# overwrites the `prob` array obtained from the full models.
prob = tt_folding.predict_proba(data_os_ss)[:, 1]
In [53]:
# Report object for the folding classifier evaluated on the OS/SS frame with the
# OS_SS >= 0 label; reused by the following cell.
report = tt_folding.test_on(data_os_ss, data_os_ss.OS_SS >= 0)
In [55]:
# Show the feature importances computed by the report.
report.feature_importance()
Out[55]:
In [44]:
from utils import plot_calibration
In [47]:
# Calibration check of the uncalibrated scores: passes the scores, the
# OS_SS >= 0 labels and the balancing weights to the project helper.
plot_calibration(prob, data_os_ss.OS_SS.values >= 0, weight=data_os_ss.weight.values)
In [48]:
from utils import calibrate_probs
In [49]:
# Fit a calibrator (logistic=True) on the raw scores and repeat the calibration
# plot with the calibrated output.
is_ss_label = data_os_ss.OS_SS.values >= 0
balance_weight = data_os_ss.weight.values

prob_calib, calibrator = calibrate_probs(is_ss_label, balance_weight, prob, logistic=True)
plot_calibration(prob_calib, is_ss_label, weight=balance_weight)
In [50]:
# Persist the fitted calibrator. Binary mode ('wb') is used because pickle data
# is a byte stream; text mode can corrupt it on platforms that translate line
# endings. cPickle is imported in the cell that saves the classifier.
with open('../models/os_ss_calibrator_only.pkl', 'wb') as f:
    cPickle.dump(calibrator, f)
In [ ]:
# Score the OS_SS == 0 (NAN) rows, which were excluded from the training frame,
# and pass the scores through the calibrator.
probs_nan = tt_folding.predict_proba(data[data.OS_SS == 0])[:, 1]
# NOTE(review): the calibrator receives a 1-D array and its output is used as-is;
# if it is an sklearn-style estimator it may expect 2-D input and return one
# column per class — confirm against utils.calibrate_probs.
probs_nan_calib = calibrator.predict_proba(probs_nan)
In [ ]:
# Compare the calibrated score distributions of the OS and SS rows with the
# calibrated scores of the NAN rows.
os_ss_label = data_os_ss.OS_SS.values
samples = [
    (prob_calib[os_ss_label < 0], 'OS'),
    (prob_calib[os_ss_label > 0], 'SS'),
    (probs_nan_calib, 'NAN'),
]
for sample_values, sample_name in samples:
    hist(sample_values, normed=True, alpha=0.4, label=sample_name, bins=100)
legend();
In [20]:
# Calibrated score distributions for OS, SS and NAN rows.
# data_os_ss contains no OS_SS == 0 rows (they are filtered out when it is
# built), so masking prob_calib with OS_SS == 0 always selects nothing. The NAN
# histogram is therefore drawn from probs_nan_calib, the calibrated scores
# computed for the OS_SS == 0 rows of `data`.
hist(prob_calib[data_os_ss.OS_SS.values < 0], normed=True, alpha=0.4, label='OS', bins=100);
hist(prob_calib[data_os_ss.OS_SS.values > 0], normed=True, alpha=0.4, label='SS', bins=100);
hist(probs_nan_calib, normed=True, alpha=0.4, label='NAN', bins=100);
legend();