In [2]:
!ls
In [1]:
%pylab inline
figsize(8, 6)
In [2]:
import sys
sys.path.insert(0, "../")
In [3]:
import pandas
import numpy
from folding_group import FoldingGroupClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
from sklearn.metrics import roc_curve, roc_auc_score
In [4]:
from utils import get_events_number, get_events_statistics
In [5]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('../datasets/MC/csv/WG/Bu_JPsiK/2012/Tracks.root'))
In [6]:
data.head()
Out[6]:
In [9]:
from utils import data_tracks_preprocessing
data = data_tracks_preprocessing(data)
In [11]:
', '.join(data.columns)
Out[11]:
In [12]:
from utils import compute_N_B_events_MC
N_B_events = compute_N_B_events_MC('../datasets/MC/csv/WG/Bu_JPsiK/2012/Tracks.root',
'../datasets/MC/csv/WG/Bu_JPsiK/2012/Vertices.root')
N_B_events
Out[12]:
In [13]:
import json
with open('../models/JPsiKMC.json', 'w') as f:
    json.dump({'N_B_events': N_B_events}, f)
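In [ ]:
# Sketch, not part of the original run: downstream notebooks that need the number of B events
# can read it back from the same JSON file.
with open('../models/JPsiKMC.json') as f:
    print json.load(f)['N_B_events']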
In [14]:
# OS (opposite-side) region: tracks with large IP significance that are well separated from the B in (eta, phi)
os_region_mask = (data.IPs > 3) & ((abs(data.diff_eta) > 0.6) | (abs(data.diff_phi) > 0.825))
# SS (same-side) region: tracks close to the B direction with small IP significance
ss_region_mask = (abs(data.diff_eta) < 0.6) & (abs(data.diff_phi) < 0.825) & (data.IPs < 3)
In [15]:
e_mask = numpy.abs(data.MCID) == 11     # electrons (PDG ID 11)
mu_mask = numpy.abs(data.MCID) == 13    # muons (PDG ID 13)
k_mask = numpy.abs(data.MCID) == 321    # charged kaons (PDG ID 321)
pi_mask = numpy.abs(data.MCID) == 211   # charged pions (PDG ID 211)
p_mask = numpy.abs(data.MCID) == 2212   # protons (PDG ID 2212)
In [16]:
from utils import compute_sum_of_charges
means_sum_of_charges = [compute_sum_of_charges(data[mask], name, bins=21, show_with_signal=False)
for mask, name in zip([data.signB > -100, os_region_mask, ss_region_mask],
['full', 'OS', 'SS'])]
In [17]:
means_sum_of_charges_by_particle = [compute_sum_of_charges(data[mask], name, bins=21, show_with_signal=False)
for mask, name in zip([e_mask, mu_mask, k_mask, pi_mask, p_mask],
['e', 'mu', 'K', 'pi', 'p'])]
In [18]:
means_sum_of_charges_by_particle_os = [compute_sum_of_charges(data[mask & (data.OS_SS == -1)],
name, bins=21, show_with_signal=False)
for mask, name in zip([e_mask, mu_mask, k_mask, pi_mask, p_mask],
['e', 'mu', 'K', 'pi', 'p'])]
In [19]:
means_sum_of_charges_by_particle_ss = [compute_sum_of_charges(data[mask & (data.OS_SS != -1)],
name, bins=21, show_with_signal=False)
for mask, name in zip([e_mask, mu_mask, k_mask, pi_mask, p_mask],
['e', 'mu', 'K', 'pi', 'p'])]
In [20]:
pandas.concat(means_sum_of_charges)
Out[20]:
In [21]:
pandas.concat(means_sum_of_charges_by_particle)
Out[21]:
In [22]:
pandas.concat(means_sum_of_charges_by_particle_os)
Out[22]:
In [23]:
pandas.concat(means_sum_of_charges_by_particle_ss)
Out[23]:
In [24]:
# Tagging efficiency: fraction of B events that pass the track selection,
# with a Poisson (sqrt(N)) uncertainty on the number of selected events.
N_B_passed = float(get_events_number(data))
tagging_efficiency = N_B_passed / N_B_events
tagging_efficiency_delta = sqrt(N_B_passed) / N_B_events
tagging_efficiency, tagging_efficiency_delta
Out[24]:
In [25]:
features = ['cos_diff_phi', 'diff_pt', 'partPt', 'partP', 'nnkrec', 'diff_eta', 'EOverP',
'ptB', 'sum_PID_mu_k', 'proj', 'PIDNNe', 'sum_PID_k_e', 'PIDNNk', 'sum_PID_mu_e', 'PIDNNm',
'phi', 'IP', 'IPerr', 'IPs', 'veloch', 'max_PID_k_e', 'ghostProb',
'IPPU', 'eta', 'max_PID_mu_e', 'max_PID_mu_k', 'partlcs']
In [26]:
from itertools import combinations
figure(figsize=[15, 16])
bins = 60
step = 3
for i, (feature1, feature2) in enumerate(combinations(['PIDNNk', 'PIDNNm', 'PIDNNe', 'PIDNNp', 'PIDNNpi'], 2)):
    subplot(4, 3, i + 1)
    Z, (x, y) = numpy.histogramdd(data[[feature1, feature2]].values, bins=bins, range=([0, 1], [0, 1]))
    pcolor(numpy.log(Z).T, vmin=0)
    xlabel(feature1)
    ylabel(feature2)
    xticks(numpy.arange(0, bins, step), numpy.round(x[:-1:step], 2))
    yticks(numpy.arange(0, bins, step), numpy.round(y[:-1:step], 2))
In [27]:
figure(figsize=(10, 6))
_, n_tracks_all = numpy.unique(data['event_id'], return_counts=True)
hist(n_tracks_all, bins=100, range=(0, 101))
title('Number of tracks')
Out[27]:
In [28]:
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
from hep_ml.losses import LogLossFunction
In [29]:
data_sw_passed_lds = LabeledDataStorage(data, data.label)
In [ ]:
# Alternative baseline with XGBoost; this cell was left unfinished and was not executed in the original run.
from rep.estimators import XGBoostClassifier
# The subsample value was left blank in the original; 0.8 below is only a placeholder.
xgb = FoldingGroupClassifier(XGBoostClassifier(subsample=0.8), n_folds=2, random_state=321,
                             train_features=features, group_feature='group_column')
xgb.fit_lds(data_sw_passed_lds)
pass
In [30]:
loss_function = LogLossFunction(regularization=100)
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=4000, depth=6,
max_features=15, n_threads=12, loss=loss_function)
tt_folding = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=321,
train_features=features, group_feature='group_column')
%time tt_folding.fit_lds(data_sw_passed_lds)
pass
In [31]:
import cPickle
with open('../models/dt_MC.pkl', 'wb') as f:
    cPickle.dump(tt_folding, f)
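In [ ]:
# Sketch, not part of the original run: the persisted folding classifier can be restored later
# and applied to tracks again; predict_proba is the same call used further down in this notebook.
with open('../models/dt_MC.pkl', 'rb') as f:
    tt_folding_loaded = cPickle.load(f)
p_check = tt_folding_loaded.predict_proba(data)[:, 1]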
In [32]:
comparison_report = tt_folding.test_on_lds(data_sw_passed_lds)
In [33]:
comparison_report.compute_metric(RocAuc())
Out[33]:
In [34]:
comparison_report.roc()
Out[34]:
In [35]:
comparison_report.feature_importance()
Out[35]:
In [36]:
from rep.report.metrics import RocAuc
comparison_report.learning_curve(RocAuc(), steps=1)
Out[36]:
In [37]:
from utils import calibrate_probs, get_B_data_for_given_part
from utils import plot_calibration
In [38]:
p_track_calibrated, calibrator_tracks = calibrate_probs(data.label.values, data.N_sig_sw.values,
tt_folding.predict_proba(data)[:, 1],
logistic=True)
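In [ ]:
# Illustrative sketch only: utils.calibrate_probs is a project-specific helper, so the code below is
# an assumption about the general technique (isotonic calibration of classifier outputs with sklearn),
# not a reimplementation of that helper.
from sklearn.isotonic import IsotonicRegression
raw_p = tt_folding.predict_proba(data)[:, 1]
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(raw_p, data.label.values, sample_weight=data.N_sig_sw.values)
p_iso = iso.transform(raw_p)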
In [39]:
roc_auc_score(data.label, p_track_calibrated)
Out[39]:
In [40]:
plot_calibration(p_track_calibrated, data.label.values)
In [42]:
Bsign, Bweight, Bprob, Bevent, auc_full = get_B_data_for_given_part(p_track_calibrated, data,
N_B_events, logistic=True)
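In [ ]:
# Illustrative sketch only (an assumption, not necessarily what utils.get_B_data_for_given_part does):
# one common way to aggregate per-track tag probabilities into a per-B probability is to sum the
# per-track log-odds within each event and map the sum back through a sigmoid.
p_clip = numpy.clip(p_track_calibrated, 1e-6, 1 - 1e-6)
track_logodds = numpy.log(p_clip) - numpy.log(1 - p_clip)
event_logodds = pandas.Series(track_logodds).groupby(data.event_id.values).sum()
event_prob = 1. / (1. + numpy.exp(-event_logodds))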
In [43]:
Bprob_calibrated, calibrator_B = calibrate_probs(Bsign, Bweight, Bprob, symmetrize=True)
In [68]:
Bprob_calibrated_log, calibrator_B_log = calibrate_probs(Bsign, Bweight, Bprob, symmetrize=True, logistic=True)
In [45]:
from utils import compute_mistag
bins = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
percentile_bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
In [46]:
figsize(12, 10)
compute_mistag(Bprob, Bsign, Bweight, Bsign > -100, label="$B$", uniform=False,
bins=percentile_bins)
compute_mistag(Bprob, Bsign, Bweight, Bsign == 1, label="$B^+$", uniform=False,
bins=percentile_bins)
compute_mistag(Bprob, Bsign, Bweight, Bsign == -1, label="$B^-$", uniform=False,
bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob, percentile bins')
Out[46]:
In [47]:
figsize(12, 10)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign > -100, label="$B$", uniform=False,
bins=percentile_bins)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign == 1, label="$B^+$", uniform=False,
bins=percentile_bins)
compute_mistag(Bprob_calibrated, Bsign, Bweight, Bsign == -1, label="$B^-$", uniform=False,
bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob, percentile bins')
Out[47]:
In [48]:
figsize(12, 10)
compute_mistag(Bprob_calibrated_log, Bsign, Bweight, Bsign > -100, label="$B$", uniform=False,
bins=percentile_bins)
compute_mistag(Bprob_calibrated_log, Bsign, Bweight, Bsign == 1, label="$B^+$", uniform=False,
bins=percentile_bins)
compute_mistag(Bprob_calibrated_log, Bsign, Bweight, Bsign == -1, label="$B^-$", uniform=False,
bins=percentile_bins)
legend(loc='best'), xlabel('mistag probability'), ylabel('true mistag probability')
title('B prob, percentile bins')
Out[48]:
In [49]:
plot_calibration(Bprob_calibrated, Bsign > 0)
In [63]:
hist(Bprob_calibrated_log[Bsign == 1], normed=True, alpha=0.5, bins=100)
hist(Bprob_calibrated_log[Bsign == -1], normed=True, alpha=0.5, bins=100);
In [64]:
hist(Bprob_calibrated[Bsign == 1], normed=True, alpha=0.5, bins=100, range=(0, 1))
hist(Bprob_calibrated[Bsign == -1], normed=True, alpha=0.5, bins=100, range=(0, 1));
In [50]:
with open('../models/calibrator_tracks_MC.pkl', 'wb') as f:
    cPickle.dump(calibrator_tracks, f)
with open('../models/calibrator_B_MC.pkl', 'wb') as f:
    cPickle.dump(calibrator_B, f)
In [69]:
with open('../models/calibrator_B_MC_log.pkl', 'wb') as f:
    cPickle.dump(calibrator_B_log, f)
In [53]:
from utils import prepare_B_data_for_given_part
In [54]:
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data], logistic=True, N_B_events=N_B_events)
Bdata_prepared.to_csv('../models/Bdata_tracks_MC.csv', header=True, index=False)
In [55]:
from utils import predict_by_estimator
part_data, part_prob = predict_by_estimator(tt_folding, [data])
pandas.DataFrame({'label': part_data.label.values,
'weight': part_data.N_sig_sw,
'part_prob': part_prob,
'OS_SS': part_data.OS_SS,
'xFlag': part_data.xFlag}).to_csv('../models/part_tracks_MC.csv', header=True, index=False)
In [ ]:
from utils import get_result_with_bootstrap_for_given_part
result = get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
[data], 'tt-log', logistic=True, N_B_events=N_B_events,
n_calibrations=30)
result.to_csv('../img/tracks_MC.csv', index=False, header=True)
In [57]:
result
Out[57]: