In [1]:
%pylab inline
In [ ]:
import sys
sys.path.insert(0, "../")
In [2]:
import pandas
import numpy
from sklearn.metrics import roc_curve, roc_auc_score
from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
In [3]:
from utils import get_N_B_events, get_events_number, get_events_statistics
In [4]:
import root_numpy
# load the MC vertex ntuple, keeping only candidates passing the vertex-charge cut
data = pandas.DataFrame(root_numpy.root2array('../datasets/MC/csv/Bu_JPsiK/Vertices.root', selection='(vcharge > 0.2)'))
In [5]:
data.head()
Out[5]:
In [6]:
event_id_column = 'event_id'
event_id = data.run.apply(str) + '_' + data.event.apply(str)
# integer label shared by all vertices of the same (run, event) pair
data['group_column'] = numpy.unique(event_id, return_inverse=True)[1]
# all weights are 1 because this is MC
data['N_sig_sw'] = 1
data[event_id_column] = event_id
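Aside: the second output of numpy.unique(..., return_inverse=True) maps each row back to the index of its unique event id, so all vertices from one event share a single integer label. A minimal illustration with made-up ids (not from the dataset):
In [ ]:
ids = numpy.array(['1_7', '2_3', '1_7', '1_7'])
uniques, inverse = numpy.unique(ids, return_inverse=True)
print(uniques)   # ['1_7' '2_3']
print(inverse)   # [0 1 0 0] -- compact per-event group label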
In [7]:
get_events_statistics(data)
Out[7]:
In [8]:
import json
with open('../models/JPsiKMC.json', 'r') as f:
    N_B_events = json.load(f)['N_B_events']
In [9]:
N_B_events
Out[9]:
In [10]:
N_pass = get_events_number(data)
# fraction of all B events with at least one vertex candidate passing the selection
tagging_efficiency = 1. * N_pass / N_B_events
# Poisson uncertainty on the number of tagged events
tagging_efficiency_delta = sqrt(N_pass) / N_B_events
print tagging_efficiency, tagging_efficiency_delta
In [11]:
features = ['mult', 'nnkrec', 'ptB', 'ipsmean', 'ptmean', 'vcharge',
            'svm', 'svp', 'BDphiDir', 'svtau', 'docamax']
# the old (cut-based) tagging model additionally uses 'vflag'
features_old = ['mult', 'nnkrec', 'ptB', 'vflag', 'ipsmean', 'ptmean', 'vcharge',
                'svm', 'svp', 'BDphiDir', 'svtau', 'docamax']
In [13]:
features_all = ['mult', 'nnkrec', 'ptB', 'ptmean', 'ipsmean', 'vcharge', 'svm', 'svp', 'BDphiDir',
                'svtau', 'docamax', 'massSeed', 'pt1', 'pt2', 'ips1', 'ips2', 'phi1', 'phi2', 'ghost1',
                'ghost2', 'pointtheta', 'seedchi2', 'rdist', 'probf']
In [14]:
from utils import compute_sum_of_charges
In [15]:
# mask of vertices passing the previous (cut-based) vertex selection
old_mask = data.v_cut.values == 1
In [16]:
compute_sum_of_charges(data, 'Vertices', bins=100,
event_id_column=event_id_column, sign_part_column='signVtx')
Out[16]:
In [17]:
# label = 1 when the vertex charge sign agrees with the true B sign
data.loc[:, 'label'] = (data.signB.values * data.signVtx.values > 0) * 1
In [18]:
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
from folding_group import FoldingGroupClassifier
In [19]:
data_sw_passed_lds = LabeledDataStorage(data, data.label.values)
In [21]:
# base learner shared by all folding models below
tt_base = DecisionTrainClassifier(learning_rate=0.01, n_estimators=10000, depth=6,
                                  max_features=8, n_threads=12)
# baseline model: event-grouped 2-fold training on the baseline feature set
tt_folding = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                                    train_features=features, group_feature='group_column')
%time tt_folding.fit_lds(data_sw_passed_lds)
pass
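Aside: FoldingGroupClassifier keeps all vertices of one event inside the same fold, so out-of-fold predictions are never contaminated by sibling vertices of the same B candidate. The same constraint can be sketched with plain scikit-learn; GroupKFold here is an illustrative stand-in, not what the class uses internally:
In [ ]:
from sklearn.model_selection import GroupKFold
groups = data['group_column'].values
for train_idx, test_idx in GroupKFold(n_splits=2).split(data[features], data['label'], groups):
    # no event contributes vertices to both sides of the split
    assert not set(groups[train_idx]) & set(groups[test_idx])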
In [22]:
# same folding setup on the extended feature list
tt_folding_all = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                                        train_features=features_all, group_feature='group_column')
%time tt_folding_all.fit_lds(data_sw_passed_lds)
pass
In [23]:
# old feature set, trained only on vertices passing the old (cut-based) selection
data_sw_passed_v_cut_lds = LabeledDataStorage(data.loc[old_mask, :], data.loc[old_mask, 'label'].values)
tt_folding_v_cut = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                                          train_features=features_old, group_feature='group_column')
%time tt_folding_v_cut.fit_lds(data_sw_passed_v_cut_lds)
pass
In [24]:
# keep every vertex belonging to an event where the old selection fired
x = data.loc[old_mask, 'event_id'].values
data.index = data.event_id
data_sel = data.loc[x, :]
In [25]:
# baseline features, trained on all vertices of events selected by the old tagging
data_sw_passed_sp_lds = LabeledDataStorage(data_sel, data_sel['label'].values)
tt_folding_sp = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                                       train_features=features, group_feature='group_column')
%time tt_folding_sp.fit_lds(data_sw_passed_sp_lds)
pass
In [26]:
report = ClassificationReport({'dt': tt_folding, 'dt with all features': tt_folding_all}, data_sw_passed_lds)
In [27]:
report.learning_curve(RocAuc(), steps=1)
Out[27]:
In [28]:
report.compute_metric(RocAuc())
Out[28]:
In [29]:
report.roc()
Out[29]:
In [30]:
report.feature_importance()
Out[30]:
In [31]:
# truncate each fold's ensemble at the iteration suggested by its learning curve
tt_folding.estimators[0].clf.estimators = tt_folding.estimators[0].clf.estimators[:4000]
tt_folding.estimators[1].clf.estimators = tt_folding.estimators[1].clf.estimators[:4000]
In [32]:
tt_folding_all.estimators[0].clf.estimators = tt_folding_all.estimators[0].clf.estimators[:6000]
tt_folding_all.estimators[1].clf.estimators = tt_folding_all.estimators[1].clf.estimators[:6000]
In [33]:
report = ClassificationReport({'dt': tt_folding, 'dt with all features': tt_folding_all}, data_sw_passed_lds)
In [34]:
report.compute_metric(RocAuc())
Out[34]:
In [35]:
report_v_cut = ClassificationReport({'tt': tt_folding_v_cut}, data_sw_passed_v_cut_lds)
In [36]:
report_v_cut.learning_curve(RocAuc(), steps=1)
Out[36]:
In [38]:
report_v_cut.compute_metric(RocAuc())
Out[38]:
In [39]:
tt_folding_v_cut.estimators[0].clf.estimators = tt_folding_v_cut.estimators[0].clf.estimators[:7000]
tt_folding_v_cut.estimators[1].clf.estimators = tt_folding_v_cut.estimators[1].clf.estimators[:7000]
In [40]:
report_sp = ClassificationReport({'tt': tt_folding_sp}, data_sw_passed_sp_lds)
report_sp.learning_curve(RocAuc(), steps=1)
Out[40]:
In [41]:
report_sp.compute_metric(RocAuc())
Out[41]:
In [42]:
tt_folding_sp.estimators[0].clf.estimators = tt_folding_sp.estimators[0].clf.estimators[:3000]
tt_folding_sp.estimators[1].clf.estimators = tt_folding_sp.estimators[1].clf.estimators[:3000]
In [43]:
# tagging efficiency of the old (cut-based) selection, with the same Poisson error
N_pass_v_cut = get_events_number(data.loc[old_mask, :])
tagging_efficiency_v_cut = 1. * N_pass_v_cut / N_B_events
tagging_efficiency_delta_v_cut = sqrt(N_pass_v_cut) / N_B_events
print tagging_efficiency_v_cut, tagging_efficiency_delta_v_cut
In [44]:
models = []
In [45]:
from utils import get_result_with_bootstrap_for_given_part
In [46]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency_v_cut, tagging_efficiency_delta_v_cut,
tt_folding_v_cut,
[data.loc[old_mask, :]], N_B_events=N_B_events,
logistic=True, name="old tagging", n_calibrations=30,
sign_part_column='signVtx', part_name='vertex',
logistic_combined=True))
In [48]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency_v_cut, tagging_efficiency_delta_v_cut,
tt_folding_sp,
[data_sel], N_B_events=N_B_events,
logistic=True, name="inclusive vertex with selection",
n_calibrations=30,
sign_part_column='signVtx', part_name='vertex',
logistic_combined=False))
In [49]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
[data], N_B_events=N_B_events,
logistic=True, name="inclusive vertex", n_calibrations=30,
sign_part_column='signVtx', part_name='vertex',
logistic_combined=False))
In [50]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding_all,
[data], N_B_events=N_B_events,
logistic=True, name="inclusive vertex, all features",
n_calibrations=30,
sign_part_column='signVtx', part_name='vertex',
logistic_combined=False))
In [54]:
pandas.concat(models)
Out[54]:
In [55]:
from utils import group_max
from scipy.special import logit, expit
from utils import compute_B_prob_using_part_prob
from utils import calculate_auc_with_and_without_untag_events
from utils import compute_mistag
from utils import calibrate_probs, bootstrap_calibrate_prob, result_table
percentile_bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
In [56]:
def estimate_events(Bsign, Bweight, Bprob, Bevent, tagging_efficiency, tagging_efficiency_delta, name):
    auc, auc_full = calculate_auc_with_and_without_untag_events(Bsign, Bprob, Bweight, N_B_events=N_B_events)
    print auc, auc_full
    # D2 is a bootstrap estimate of the squared dilution <(1 - 2p)^2> after calibration
    D2, aucs = bootstrap_calibrate_prob(Bsign, Bweight, Bprob,
                                        n_calibrations=30, symmetrize=True, logistic=False)
    # Bprob_calibrated, (iso_reg1, iso_reg2) = calibrate_probs(Bsign, Bweight, Bprob,
    #                                                          symmetrize=True, return_calibrator=True, logistic=True)
    return result_table(tagging_efficiency, tagging_efficiency_delta, D2, auc_full, name)
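Aside: result_table receives the tagging efficiency together with D2; in flavour tagging these combine into the effective tagging power eff_tag * <D^2> with dilution D = 1 - 2p. A sketch of that figure of merit (the function name and signature are illustrative, not from utils):
In [ ]:
def effective_tagging_power(eff_tag, probs, weights):
    # dilution D = 1 - 2p for mistag probability p; figure of merit is eff_tag * <D^2>
    D2 = numpy.average((1 - 2 * probs) ** 2, weights=weights)
    return eff_tag * D2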
In [57]:
# tiny random jitter makes every per-event extremum unique, so the chosen
# vertex row can be matched back by exact comparison below
data['p_vertex'] = tt_folding_all.predict_proba(data)[:, 1] + 1e-8*numpy.random.random(len(data))
max_events = group_max(data.group_column.values, data['p_vertex'].values)
min_events = -group_max(data.group_column.values, -data['p_vertex'].values)
# per event, keep the prediction farthest from 0.5 (the most confident vertex)
p_events = numpy.where(abs(max_events - 0.5) > abs(min_events - 0.5), max_events, min_events)
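An equivalent pandas formulation of this "most confident vertex per event" selection (illustration only; the analysis itself uses group_max from utils):
In [ ]:
jittered = data[['group_column', 'p_vertex']].reset_index(drop=True)
idx = (jittered.p_vertex - 0.5).abs().groupby(jittered.group_column).idxmax()
most_confident = jittered.loc[idx]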
In [58]:
# keep the rows whose jittered prediction equals the per-event extreme
data_events = data.loc[abs(p_events - data.p_vertex.values) < 1e-14, :]
data_probs = p_events[abs(p_events - data.p_vertex.values) < 1e-14]
In [59]:
# sanity check: exactly one selected vertex per event
len(data_probs), len(numpy.unique(data.group_column))
Out[59]:
In [60]:
Bsign, Bweight, Bprob, Bevent = compute_B_prob_using_part_prob(data_events, data_probs, sign_part_column='signVtx')
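Aside: the exact combination rule lives in utils.compute_B_prob_using_part_prob; a plausible single-vertex version, stated here purely as an assumption, flips the odds by the vertex sign:
In [ ]:
def b_plus_probability(p, s):
    # hypothetical rule: s = +1 keeps p, s = -1 maps p to 1 - p via the odds
    odds = (p / (1 - p)) ** s
    return odds / (1 + odds)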
In [61]:
t = estimate_events(Bsign, Bweight, Bprob, Bevent, tagging_efficiency,
tagging_efficiency_delta, 'inclusive vertex, max val, all features')
In [63]:
t
Out[63]:
In [64]:
x_pd_max = pandas.DataFrame({'event_id': data_events.event_id,
                             'Bweight': numpy.ones(len(data_events)),
                             'Bsign': data_events.signB,
                             # store the odds ratio p / (1 - p) rather than p itself
                             'vertex_relation_prob': Bprob / (1 - Bprob)})
In [65]:
x_pd_max.to_csv('../models/Bdata_vertex_MC_max.csv', header=True, index=False)
In [66]:
# mean combination: average signed logits per event, then map back through expit;
# bincount(groups, weights=w) / bincount(groups) is a fast grouped mean
p_mean = expit(numpy.bincount(data.group_column, weights=logit(data.p_vertex.values) * data.signVtx) / numpy.bincount(data.group_column))
# vertex signs are already folded into the average, so each pseudo-vertex gets sign +1
p_svertex = numpy.ones(len(p_mean))
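The grouped mean above is a bincount trick; a tiny demonstration with made-up numbers:
In [ ]:
g = numpy.array([0, 0, 1])        # event index of each vertex
w = numpy.array([0.2, 0.4, 0.9])  # any per-vertex quantity
print(numpy.bincount(g, weights=w) / numpy.bincount(g))  # [0.3  0.9]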
In [67]:
# build one pseudo-vertex per event: sign +1, unit weight, and the event's true
# B sign (signB is constant within an event, so a grouped mean recovers it)
Bsign, Bweight, Bprob, Bevent = compute_B_prob_using_part_prob(pandas.DataFrame({
    'signVtx': p_svertex,
    'event_id': numpy.arange(len(p_svertex)),
    'N_sig_sw': numpy.ones(len(p_svertex)),
    'signB': numpy.bincount(data.group_column, weights=data.signB) / numpy.bincount(data.group_column)}),
    p_mean, sign_part_column='signVtx')
In [68]:
t_mean = estimate_events(Bsign, Bweight, Bprob, Bevent, tagging_efficiency,
tagging_efficiency_delta, 'inclusive vertex, mean, all features')
In [69]:
# recover one (run, event) pair per group; both are constant within an event,
# so an integer grouped mean reproduces them exactly
run_all = map(int, numpy.bincount(data.group_column, weights=data.run) // numpy.bincount(data.group_column))
event_all = map(int, numpy.bincount(data.group_column, weights=data.event) // numpy.bincount(data.group_column))
x = pandas.DataFrame({'run': run_all, 'event': event_all})
In [70]:
x_pd = pandas.DataFrame({'event_id': x.run.apply(str) + '_' + x.event.apply(str),
'Bweight': numpy.ones(len(p_svertex)),
'Bsign': numpy.bincount(data.group_column, weights=data.signB) / numpy.bincount(data.group_column),
'vertex_relation_prob': p_mean / (1 - p_mean)})
In [71]:
x_pd.to_csv('../models/Bdata_vertex_MC_sqrt.csv', header=True, index=False)
In [72]:
pandas.concat(models + [t, t_mean])
Out[72]: