In [1]:
%pylab inline
In [2]:
from collections import OrderedDict
import numpy
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from rep.metaml import FoldingClassifier
from rep_ef.estimators import MatrixNetSkyGridClassifier
In [3]:
from utils import get_N_B_events, predict_by_estimator, bootstrap_calibrate_prob
from utils import get_events_number, get_events_statistics, result_table, compute_mistag
In [4]:
import root_numpy
data_ele = pandas.DataFrame(root_numpy.root2array('datasets/old_tagging/nnet_ele.root'))
data_muon = pandas.DataFrame(root_numpy.root2array('datasets/old_tagging/nnet_muon.root'))
data_kaon = pandas.DataFrame(root_numpy.root2array('datasets/old_tagging/nnet_kaon.root'))
data_vtx = pandas.DataFrame(root_numpy.root2array('datasets/old_tagging/nnet_vtx.root'))
In [5]:
data_ele.columns
Out[5]:
In [6]:
datasets = {'$e$': data_ele, '$\mu$': data_muon, '$K$': data_kaon, 'vtx': data_vtx}
In [7]:
for name, data in datasets.items():
    data['label'] = data.iscorrect
    # reconstruct the sign of B from the tagger answer and iscorrect
    # (workaround: the sign itself is not stored in the simulation output)
    data['signB'] = data.tagAnswer * (2 * data.iscorrect - 1)
    data['event_id'] = data.runNum.apply(str) + '_' + data.evtNum.apply(str)
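The signB reconstruction works because tagAnswer coincides with the true B sign exactly when the tagger is correct: the factor 2 * iscorrect - 1 is +1 for correctly tagged events and -1 otherwise, flipping the wrong answers back. A quick sanity check on toy values (not from the dataset):

tagAnswer = numpy.array([1, 1, -1, -1])
iscorrect = numpy.array([1, 0, 1, 0])
print tagAnswer * (2 * iscorrect - 1)   # [ 1 -1 -1  1]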
In [8]:
features_vtx = ['mult', 'nnkrec', 'ptB', 'vflag', 'ipsmean', 'ptmean', 'vcharge',
                'svm', 'svp', 'BDphiDir', 'svtau', 'docamax']
features_ele = ['mult', 'partPt', 'partP', 'ptB', 'IPs', 'partlcs', 'eOverP', 'ghostProb', 'IPPU']
features_muon = ['mult', 'partPt', 'partP', 'ptB', 'IPs', 'partlcs', 'PIDNNm', 'ghostProb', 'IPPU']
features_kaon = ['mult', 'partP', 'partPt', 'nnkrec', 'ptB', 'IPs', 'partlcs',
                 'PIDNNk', 'PIDNNpi', 'PIDNNp', 'ghostProb', 'IPPU']
features_tr = ['mult', 'partPt', 'partP', 'ptB', 'IPs', 'partlcs', 'ghostProb', 'IPPU', 'eOverP',
               'PIDNNpi', 'PIDNNp', 'PIDNNk', 'PIDNNm', 'PIDNNe', 'nnkrec', 'partTheta', 'partPhi',
               'veloch']
features_tmva = {'$e$': features_ele, '$\mu$': features_muon,
                 '$K$': features_kaon, 'vtx': features_vtx}
# features_xgb = {'$e$': features_tr, '$\mu$': features_tr,
#                 '$K$': features_tr, 'vtx': features_vtx}
features_xgb = features_tmva
In [9]:
get_N_B_events()
Out[9]:
In [10]:
result = OrderedDict()
In [11]:
tagging_efficiencies = []
tagging_efficiencies_delta = []
for key, data in datasets.items():
    N_B_passed = get_events_number(data)
    tagging_efficiencies.append(1. * N_B_passed / get_N_B_events())
    tagging_efficiencies_delta.append(sqrt(N_B_passed) / get_N_B_events())
result['$\epsilon_{tag}$'] = tagging_efficiencies
result['$\Delta\epsilon_{tag}$'] = tagging_efficiencies_delta
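Here the tagging efficiency is the fraction of all $B$ events for which the tagger gives an answer, with a simple Poisson error estimate:

$$\epsilon_{tag} = \frac{N_{passed}}{N_B}, \qquad \Delta\epsilon_{tag} \approx \frac{\sqrt{N_{passed}}}{N_B}.$$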
In [12]:
tagging_efficiencies_table = pandas.DataFrame(result, index=datasets.keys())
tagging_efficiencies_table
Out[12]:
In [13]:
sweight_threshold = 0.
data_sw_passed = dict([(key, data[data.N_sig_sw > sweight_threshold]) for key, data in datasets.items()])
data_sw_not_passed = dict([(key, data[data.N_sig_sw <= sweight_threshold]) for key, data in datasets.items()])
In [14]:
for key, data in data_sw_passed.items():
    print key, get_events_statistics(data)
In [15]:
for key, data in data_sw_not_passed.items():
    print key, get_events_statistics(data)
In [16]:
estimators = OrderedDict()
In [18]:
from rep.estimators import TMVAClassifier
from rep.metaml import FoldingClassifier
tmva_base_muon = TMVAClassifier(method='kMLP', factory_options='Transformations=I', sigmoid_function='identity',
                                NeuronType='tanh', NCycles=280, HiddenLayers='N+5', TrainingMethod='BFGS', TestRate=5,
                                UseRegulator=True, EstimatorType='CE')
tmva_base_ele = TMVAClassifier(method='kMLP', factory_options='Transformations=I', sigmoid_function='identity',
                               NeuronType='sigmoid', NCycles=180, HiddenLayers='N+5', TrainingMethod='BFGS',
                               UseRegulator=True)
tmva_base_kaon_vtx = TMVAClassifier(method='kMLP', factory_options='Transformations=I',
                                    sigmoid_function='identity',
                                    NeuronType='tanh', NCycles=180, HiddenLayers='N+5', TrainingMethod='BFGS',
                                    UseRegulator=True, EstimatorType='CE')
for key, data in data_sw_passed.items():
    # pick the network configuration matching the tagger
    if 'e' in key:
        tmva_base = tmva_base_ele
    elif 'mu' in key:
        tmva_base = tmva_base_muon
    else:
        tmva_base = tmva_base_kaon_vtx
    estimators[key + '_tmva'] = FoldingClassifier(tmva_base, n_folds=2, random_state=11, ipc_profile='ssh-ipy',
                                                  features=features_tmva[key])
    estimators[key + '_tmva'].fit(data, data['label'], data['N_sig_sw'])
In [19]:
from rep.estimators import XGBoostClassifier
from rep.metaml import FoldingClassifier
xgb_base_ele = XGBoostClassifier(colsample=1.0, eta=0.01, nthreads=4,
                                 n_estimators=200, subsample=0.3, max_depth=5)
xgb_base_other = XGBoostClassifier(colsample=1.0, eta=0.01, nthreads=4,
                                   n_estimators=500, subsample=0.3, max_depth=3)
for key, data in data_sw_passed.items():
    if 'e' in key:
        xgb_base = xgb_base_ele
    else:
        xgb_base = xgb_base_other
    estimators[key + '_xgboost'] = FoldingClassifier(xgb_base, n_folds=2, random_state=11, ipc_profile='ssh-ipy',
                                                     features=features_xgb[key])
    estimators[key + '_xgboost'].fit(data, data['label'], data['N_sig_sw'])
In [20]:
import cPickle
with open('models/old-tagging.pkl', 'w') as f:
    cPickle.dump(estimators, f)
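The trained taggers can later be restored from this file without refitting; a minimal sketch:

with open('models/old-tagging.pkl') as f:
    estimators = cPickle.load(f)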
In [21]:
for key, _ in datasets.items():
    for suffix in ['_xgboost', '_tmva']:
        name = key + suffix
        # FoldingClassifier returns out-of-fold predictions on its own training sample,
        # so these AUCs are not inflated by overfitting
        probs = estimators[name].predict_proba(data_sw_passed[key])[:, 1]
        print name, 'AUC:', roc_auc_score(data_sw_passed[key]['label'].values,
                                          probs, sample_weight=data_sw_passed[key]['N_sig_sw'].values)
In [22]:
models = []
In [23]:
for key, _ in datasets.items():
    for suffix in ['_xgboost', '_tmva']:
        name = key + suffix
        data_predicted, probs = predict_by_estimator(estimators[name], [data_sw_passed[key], data_sw_not_passed[key]])
        D2, aucs = bootstrap_calibrate_prob(data_predicted.label.values, data_predicted.N_sig_sw.values,
                                            probs, n_calibrations=30)
        print name, 'AUC:', numpy.mean(aucs), 'AUC delta', numpy.std(aucs)
        tagging_efficiency, tagging_efficiency_delta = \
            numpy.array(tagging_efficiencies_table[tagging_efficiencies_table.index == key])[0]
        models.append(result_table(tagging_efficiency, tagging_efficiency_delta, D2, 0., name))
In [24]:
pandas.concat(models)
Out[24]:
In [25]:
pandas.concat(models).to_csv('img/old-tagging-parts.csv', header=True, index=False)
In [26]:
from utils import predict_by_estimator, bootstrap_calibrate_prob, calculate_auc_with_and_without_untag_events
from utils import calibrate_probs
In [27]:
def combine_taggers(tagger_outputs, tagger_keys):
    """
    Combine single-tagger decisions into one tag, following formulas (5.1), (5.2)
    from [TODO link]; the formulas are spelled out after this cell.

    :param tagger_outputs: DataFrame with per-event outputs of the four taggers;
        for each tagger key it contains the columns
        tag_<key>  - tagger decision (+1 / -1; +1 for untagged events),
        prob_<key> - probability that the decision is correct (0.5 for untagged events).
    :param tagger_keys: list of tagger names to combine.
    """
    pb = []   # per-tagger factors: probability of signB = -1
    pnb = []  # per-tagger factors: probability of signB = +1
    for key in tagger_keys:
        prob = tagger_outputs['prob_{}'.format(key)].values
        tag = tagger_outputs['tag_{}'.format(key)].values
        pb.append((1 + tag) / 2 - tag * prob)
        pnb.append((1 - tag) / 2 + tag * prob)
    pb = numpy.prod(pb, axis=0)
    pnb = numpy.prod(pnb, axis=0)
    probs_wrong = pb / (pb + pnb)
    # final tag: +1 if the combined probability favours it, otherwise -1
    tag_result = numpy.ones(len(probs_wrong))
    tag_result[probs_wrong > 0.5] = -1
    return tag_result, 1 - probs_wrong, tagger_outputs.weight.values, tagger_outputs.signB.values
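For reference, the per-event combination computed above is

$$p_b = \prod_i \left(\frac{1 + d_i}{2} - d_i\, p_i\right), \qquad p_{\bar b} = \prod_i \left(\frac{1 - d_i}{2} + d_i\, p_i\right),$$

where $d_i \in \{-1, +1\}$ is the decision of tagger $i$ and $p_i$ the calibrated probability that this decision is correct; untagged events enter with the neutral values $d_i = +1$, $p_i = 0.5$ and drop out of the products. The combined probability that the B has sign $+1$ is $p_{\bar b} / (p_b + p_{\bar b})$ (the returned 1 - probs_wrong), and the combined tag is $+1$ when this probability exceeds $0.5$.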
In [28]:
percentile_bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]

def run_combine(model_suffix="", model_name="", logistic=False):
    """
    :param model_suffix: suffix of the single taggers to combine ('_xgboost' or '_tmva')
    :param model_name: name for the model obtained by combining the classifiers
    :param logistic: if True, use logistic calibration for the single-tagger probabilities
        (otherwise isotonic)
    """
    data_with_predictions = {}
    # computing calibrated predictions of each tagger
    for key in datasets.keys():
        name = key + model_suffix
        data, probs = predict_by_estimator(estimators[name], [data_sw_passed[key], data_sw_not_passed[key]])
        probs_calibrated = calibrate_probs(data.label.values, data.N_sig_sw.values, probs, logistic=logistic)
        ids = numpy.array(data['event_id'])
        data_with_predictions[key] = pandas.DataFrame({'prob_{}'.format(key): probs_calibrated,
                                                       'tag_{}'.format(key): data.tagAnswer.values,
                                                       'weight': data.N_sig_sw.values,
                                                       'signB': data.signB.values}, index=ids)
    # collecting everything together; events untagged by a tagger keep
    # the neutral defaults tag = 1, prob = 0.5 (they drop out of the combination)
    data_combined = pandas.DataFrame({'event_id': numpy.unique(numpy.concatenate([d.index.values for d in
                                                                                  data_with_predictions.values()]))})
    data_combined.index = data_combined.event_id
    tagger_keys = datasets.keys()
    for key in tagger_keys:
        data_combined['prob_{}'.format(key)] = 0.5
        data_combined['tag_{}'.format(key)] = 1
    for key, d in data_with_predictions.items():
        data_combined.ix[d.index, 'prob_{}'.format(key)] = d['prob_{}'.format(key)]
        data_combined.ix[d.index, 'tag_{}'.format(key)] = d['tag_{}'.format(key)]
        data_combined.ix[d.index, 'weight'] = d['weight']
        data_combined.ix[d.index, 'signB'] = d['signB']
    # getting combined predictions
    tags, Bprobs, Bweights, Bsign = combine_taggers(data_combined, tagger_keys)
    Bprob_calibrated = calibrate_probs(Bsign, Bweights, Bprobs, random_state=13, symmetrize=True)
    # tiny jitter to avoid exactly repeated values in the calibrated probabilities
    Bprob_calibrated += numpy.random.normal(size=len(Bprob_calibrated)) * 0.001
    auc, auc_full = calculate_auc_with_and_without_untag_events(Bsign, Bprobs, Bweights)
    print 'AUC for tagged:', auc, 'AUC with untag:', auc_full
    subplot(1, 2, 1)
    hist(Bprobs[Bsign == 1], alpha=0.4, bins=70, weights=Bweights[Bsign == 1], label='$B^+$')
    hist(Bprobs[Bsign == -1], alpha=0.4, bins=70, weights=Bweights[Bsign == -1], label='$B^-$')
    legend(), title('B probs')
    subplot(1, 2, 2)
    hist(Bprob_calibrated[Bsign == 1], alpha=0.4, bins=70, weights=Bweights[Bsign == 1], label='$B^+$')
    hist(Bprob_calibrated[Bsign == -1], alpha=0.4, bins=70, weights=Bweights[Bsign == -1], label='$B^-$')
    legend(), title('B probs calibrated'), show()
    fpr, tpr, _ = roc_curve(Bsign, Bprobs, sample_weight=Bweights)
    plot(fpr, tpr), plot([0, 1], [0, 1]), show()
    tagging_efficiency_combined = sum(Bweights) / get_N_B_events()
    tagging_efficiency_combined_delta = numpy.sqrt(sum(Bweights)) / get_N_B_events()
    subplot(1, 2, 1)
    compute_mistag(Bprobs, Bsign, Bweights, Bsign > -100, label="$B$", uniform=False, bins=percentile_bins)
    compute_mistag(Bprobs, Bsign, Bweights, Bsign == 1, label="$B^+$", uniform=False, bins=percentile_bins)
    compute_mistag(Bprobs, Bsign, Bweights, Bsign == -1, label="$B^-$", uniform=False, bins=percentile_bins)
    legend(loc='best')
    title('B prob, percentile bins'), xlabel('mistag probability'), ylabel('true mistag probability')
    subplot(1, 2, 2)
    compute_mistag(Bprob_calibrated, Bsign, Bweights, Bsign > -100, label="$B$", uniform=False, bins=percentile_bins)
    compute_mistag(Bprob_calibrated, Bsign, Bweights, Bsign == 1, label="$B^+$", uniform=False, bins=percentile_bins)
    compute_mistag(Bprob_calibrated, Bsign, Bweights, Bsign == -1, label="$B^-$", uniform=False, bins=percentile_bins)
    legend(loc='best')
    title('B prob calibrated, percentile bins'), xlabel('mistag probability'), ylabel('true mistag probability')
    show()
    D2_bootstrap, aucs = bootstrap_calibrate_prob(Bsign, Bweights, Bprobs)
    D2 = numpy.average((2 * (Bprobs - 0.5)) ** 2, weights=Bweights)
    print 'Efficiency, not calibrated', D2 * tagging_efficiency_combined * 100
    print 'Average AUC', numpy.mean(aucs), numpy.std(aucs)
    return result_table(tagging_efficiency_combined, tagging_efficiency_combined_delta, D2_bootstrap,
                        auc_full, model_name)
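The figure printed as 'Efficiency, not calibrated' is the effective tagging efficiency (tagging power) in percent:

$$\epsilon_{eff} = \epsilon_{tag} \cdot \langle D^2 \rangle, \qquad D = 2p - 1,$$

where $p$ is the combined per-event probability and the average is taken with the sWeights; result_table is given the bootstrap estimate D2_bootstrap of the same $\langle D^2 \rangle$.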
In [29]:
figsize(18, 7)
results = [run_combine("_xgboost", "iso-xgb_combined"),
           run_combine("_tmva", "iso-tmva_combined"),
           run_combine("_xgboost", "log-xgb_combined", logistic=True),
           run_combine("_tmva", "log-tmva_combined", logistic=True)]
In [30]:
result = pandas.concat(results)
result
Out[30]:
In [31]:
result.to_csv('img/old-tagging.csv', header=True, index=False)