In [1]:
%pylab inline
In [2]:
import sys
sys.path.insert(0, "../")
In [3]:
from collections import OrderedDict
import numpy
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from rep.metaml import FoldingClassifier
In [4]:
from utils import get_N_B_events, predict_by_estimator, bootstrap_calibrate_prob
from utils import get_events_number, get_events_statistics, result_table, compute_mistag, shrink_floats_xgb
In [8]:
import root_numpy
def read_samples(filename, filename_vtx, filename_splot):
splot = pandas.read_csv(filename_splot, sep=' ', header=None).drop([8], axis=1)
splot.columns = ['run', 'evTop', 'evBottom', 'Bmass', 'Nbkg_tot_sw', 'L_Nbkg_tot', 'Nsig_tot_sw', 'L_Nsig_tot']
event_id_column = 'event_id'
splot[event_id_column] = splot.run.apply(str) + '_' + (splot.evTop * 1e6 + splot.evBottom).apply(int).apply(str)
index = numpy.unique(splot[event_id_column], return_index=True)[1]
splot.index = numpy.arange(len(splot))
splot = splot.ix[index, :]
splot.index = splot[event_id_column]
data = pandas.read_csv(filename, sep='\t')
data = data.query('(Bmass < 5.4) & (Bmass > 5.16) & ((K_cut == 1) | (mu_cut == 1) | (e_cut == 1))')
data_vertex = pandas.read_csv(filename_vtx, sep='\t')
data_vertex = data_vertex.query('(Bmass < 5.4) & (Bmass > 5.16) & (v_cut == 1) & (vcharge > 0.2)')
datasets = dict()
for name, selection, data_temp in zip(['K', 'e', 'mu', 'vtx'],
['K_cut == 1', 'e_cut == 1', 'mu_cut == 1', ""],
[data] * 3 + [data_vertex]):
if selection != "":
data_temp = data_temp.query(selection)
shrink_floats_xgb(data_temp)
if name == 'vtx':
data_temp['label'] = (data_temp.signB.values * data_temp.signVtx.values < 0) * 1
else:
data_temp['label'] = (data_temp.signB.values * data_temp.signTrack.values < 0) * 1
data_temp['event_id'] = data_temp.run.apply(str) + '_' + data_temp.event.apply(int).apply(str)
data_temp['tagAnswer'] = data_temp.signB * (2 * data_temp.label - 1)
data_temp.index = data_temp["event_id"]
data_temp['N_sig_sw'] = splot.loc[data_temp.index, 'Nsig_tot_sw']
datasets[name] = data_temp
return datasets, sum(splot.Nsig_tot_sw.values)
In [9]:
def compute_efficiencies(datasets, N_B_events):
result = dict()
for key, data in datasets.items():
N_B_passed = get_events_number(data)
result[key] = (1. * N_B_passed / N_B_events, sqrt(N_B_passed) / N_B_events)
return result
In [10]:
datasets, N_B_events = read_samples('../datasets/data/csv/WG/Bu_JPsiK/2012/Tracks.csv',
'../datasets/data/csv/WG/Bu_JPsiK/2012/Vertices.csv',
'../datasets/data/csv/WG/Bu_JPsiK/2012/sPlot_Bu_JPsiK.csv')
In [14]:
datasets_kstar, N_B_events_kstar = read_samples('../datasets/data/csv/WG/Bd_JPsiKstar/2012/Tracks.csv',
'../datasets/data/csv/WG/Bd_JPsiKstar/2012/Vertices.csv',
'../datasets/data/csv/WG/Bd_JPsiKstar/2012/sPlot_Bd_JPsiKstar.csv')
In [15]:
eff = compute_efficiencies(datasets, N_B_events)
eff_kstar = compute_efficiencies(datasets_kstar, N_B_events_kstar)
In [16]:
print 'K+-'
print pandas.DataFrame(eff)
print 'K*'
print pandas.DataFrame(eff_kstar)
In [17]:
features_vtx = ['mult', 'nnkrec', 'log_ptB: log(ptB)', 'vflag', 'log_ipsmean: log(ipsmean)',
'log_ptmean: log(ptmean)', 'vcharge',
'log_svm: log(svm)', 'log_svp: log(svp)', 'BDphiDir', 'log_svtau: log(svtau)', 'docamax']
features_ele = ['mult', 'log_partPt: log(partPt)', 'log_partP: log(partP)',
'log_ptB: log(ptB)', 'log_IPs: log(IPs)', 'partlcs', 'log_eOverP: log(EOverP)',
'ghostProb', 'log_IPPU: log(IPPU)']
features_muon = ['mult', 'log_partPt: log(partPt)', 'log_partP: log(partP)',
'log_ptB: log(ptB)', 'log_IPs: log(IPs)', 'partlcs', 'PIDNNm', 'ghostProb', 'log_IPPU: log(IPPU)']
features_kaon = ['mult', 'log_partPt: log(partPt)', 'log_partP: log(partP)',
'nnkrec','log_ptB: log(ptB)', 'log_IPs: log(IPs)', 'partlcs',
'PIDNNk', 'PIDNNpi', 'PIDNNp', 'ghostProb', 'log_IPPU: log(IPPU)']
features = {'e': features_ele, 'mu': features_muon,
'K': features_kaon, 'vtx': features_vtx}
In [18]:
estimators = OrderedDict()
In [19]:
from rep.estimators import XGBoostClassifier
from rep.metaml import FoldingClassifier
xgb_base_ele = XGBoostClassifier(colsample=1.0, eta=0.01, nthreads=12,
n_estimators=200, subsample=0.3, max_depth=5)
xgb_base_other = XGBoostClassifier(colsample=1.0, eta=0.01, nthreads=12,
n_estimators=500, subsample=0.3, max_depth=3)
for key, data in datasets.items():
if 'e' in key:
xgb_base = xgb_base_ele
else:
xgb_base = xgb_base_other
estimators[key + '_xgboost'] = FoldingClassifier(xgb_base, n_folds=2, random_state=11,
features=features[key])
estimators[key + '_xgboost'].fit(data, data['label'], data['N_sig_sw'])
In [20]:
from rep.estimators import TMVAClassifier
from rep.metaml import FoldingClassifier
tmva_base_muon = TMVAClassifier(method='kMLP', factory_options='Transformations=I,D,N:Silent', sigmoid_function='identity',
NeuronType='tanh', NCycles=280, HiddenLayers='N+5', TrainingMethod='BFGS', TestRate=5,
UseRegulator=True, EstimatorType='CE')
tmva_base_ele = TMVAClassifier(method='kMLP', factory_options='Transformations=I,D,N:Silent', sigmoid_function='identity',
NeuronType='sigmoid', NCycles=180, HiddenLayers='N+5', TrainingMethod='BFGS',
UseRegulator=True)
tmva_base_kaon_vtx = TMVAClassifier(method='kMLP', factory_options='Transformations=I,D,N:Silent',
sigmoid_function='identity',
NeuronType='tanh', NCycles=180, HiddenLayers='N+5', TrainingMethod='BFGS',
UseRegulator=True, EstimatorType='CE')
for key, data in datasets.items():
if 'e' in key:
tmva_base = tmva_base_ele
elif 'mu' in key:
tmva_base = tmva_base_muon
else:
tmva_base = tmva_base_kaon_vtx
estimators[key + '_tmva'] = FoldingClassifier(tmva_base, n_folds=2, random_state=11,
features=features[key])
estimators[key + '_tmva'].fit(data, data['label'], data['N_sig_sw'])
In [ ]:
import cPickle
with open('../models/old-tagging-WG.pkl', 'r') as f:
estimators = cPickle.load(f)
In [21]:
import cPickle
with open('../models/old-tagging-WG.pkl', 'w') as f:
cPickle.dump(estimators, f)
In [22]:
estimators.keys()
Out[22]:
In [24]:
for key in datasets.keys():
for suffix in ['_xgboost', '_tmva']:
name = key + suffix
probs = estimators[name].predict_proba(datasets[key])[:, 1]
print name, 'AUC:', roc_auc_score(datasets[key]['label'].values,
probs, sample_weight=datasets[key]['N_sig_sw'].values)
In [25]:
from utils import predict_by_estimator, calibrate_probs, run_taggers_combination
def combine(estimators, datasets, keys, N_B_events, logistic=False,
return_calibrations=False, logistic_combined=True, model_name=None, with_roc=False):
"""
:param suffix: suffix used for taggers
:param model_name: name for model after combining classifiers
"""
data_with_predictions = {}
calibrators_tracks = dict()
# computing calibrated predictions of each tagger
for key in keys:
data, probs = predict_by_estimator(estimators[key], [datasets[key]])
probs_calibrated, calibrators_tracks[key] = \
calibrate_probs(data.label.values, data.N_sig_sw.values, probs, logistic=logistic)
ids = numpy.array(data['event_id'])
data_with_predictions[key] = pandas.DataFrame({'prob_{}'.format(key): probs_calibrated,
'tag_{}'.format(key): data.tagAnswer.values,
'weight': data.N_sig_sw.values,
'signB': data.signB.values}, index=ids)
calibrator_B, table, roc = run_taggers_combination(data_with_predictions, keys, N_B_events,
model_name=','.join(keys) if model_name is None else model_name,
logistic_combined=logistic_combined)
if return_calibrations and with_roc:
return table, calibrators_tracks, calibrator_B, roc
elif return_calibrations and not with_roc:
return table, calibrators_tracks, calibrator_B
elif with_roc:
return table, roc
else:
return table
In [26]:
test_estimators_tmva = dict((key, estimators[key + '_tmva']) for key in ['K', 'mu', 'e', 'vtx'])
In [27]:
figsize(18, 7)
results_separate_tmva = []
for key in datasets.keys():
x = combine(test_estimators_tmva, datasets, [key], N_B_events, logistic=True)
results_separate_tmva.append(x)
In [28]:
pandas.concat(results_separate_tmva)
Out[28]:
In [29]:
test_estimators = dict((key, estimators[key + '_xgboost']) for key in ['K', 'mu', 'e', 'vtx'])
In [30]:
figsize(18, 7)
calibrators_B = dict()
calibrator_tracks = dict()
results_separate = []
for key in datasets.keys():
x, calibrator_tracks[key], calibrators_B[key] = combine(test_estimators, datasets, [key], N_B_events,
logistic=True, return_calibrations=True)
results_separate.append(x)
In [31]:
pandas.concat(results_separate)
Out[31]:
In [32]:
figsize(18, 7)
results_separate_tmva.append(combine(test_estimators_tmva, datasets, ['K', 'e', 'mu', 'vtx'],
N_B_events, logistic=True,
model_name='tmva combination'))
In [33]:
figsize(18, 7)
x, calibrator_tracks_comb, calibrator_B_comb, roc_curve_old = combine(test_estimators, datasets,
['K', 'e', 'mu', 'vtx'], N_B_events,
logistic=True, return_calibrations=True,
model_name='xgboost combination', with_roc=True)
results_separate.append(x)
In [34]:
pandas.concat(results_separate)
Out[34]:
In [45]:
from utils import prepare_B_data
def prepare_for_epm(estimators, datasets, keys, calibrator_tracks, calibrator_B, N_B_events):
data_with_predictions = dict()
for key in keys:
probs = estimators[key].predict_proba(datasets[key])[:, 1]
probs_calibrated = calibrator_tracks[key].predict_proba(probs)
data_with_predictions[key] = pandas.DataFrame({'prob_{}'.format(key): probs_calibrated,
'tag_{}'.format(key): datasets[key].tagAnswer.values,
'weight': datasets[key].N_sig_sw.values,
'signB': datasets[key].signB.values,
'mass': datasets[key].Bmass.values,
'time': datasets[key]['time'].values},
index=numpy.array(datasets[key]['event_id']))
tags, Bprobs, Bweights, Bsign, Bmass, Btime = prepare_B_data(data_with_predictions, keys)
Bprob_calibrated = calibrator_B.predict_proba(Bprobs)
print 'eff tag: ', 1. * sum(Bweights) / N_B_events
print "D2:", numpy.average((1 - 2*Bprob_calibrated)**2, weights=Bweights)
print 'eff:', 1. * sum(Bweights) / N_B_events * numpy.average((1 - 2*Bprob_calibrated)**2, weights=Bweights)
mistag = numpy.minimum(Bprob_calibrated, 1 - Bprob_calibrated)
tag = numpy.ones(len(Bprob_calibrated))
tag[Bprob_calibrated < 0.5] = -1
prepared_data = pandas.DataFrame({'tag': numpy.array(tag, dtype='int32'),
'mistag': mistag,
'flavour': numpy.array(Bsign, dtype='int32'),
'decay_time': Btime,
'Bmass': Bmass,
'resolution': [50*1e-3] * len(Bmass),
'probs': Bprob_calibrated,
'weight': Bweights
})
return prepared_data
In [46]:
prepared_kstar = prepare_for_epm(test_estimators, datasets_kstar, ['K', 'e', 'mu', 'vtx'],
calibrator_tracks_comb, calibrator_B_comb, N_B_events_kstar)
In [47]:
root_numpy.array2root(prepared_kstar.to_records(index=False), "../for_epm/kstar_old.root",
mode='recreate')
In [48]:
pandas.concat(results_separate_tmva)
Out[48]:
In [49]:
pandas.concat(results_separate)
Out[49]:
In [50]:
pandas.concat(results_separate_tmva + results_separate).to_csv(
'../img/old-tagging-WG.csv', header=True, index=False)
In [51]:
import cPickle
with open('../models/old-rocs-WG', 'w') as f:
cPickle.dump(roc_curve_old, f)