In [2]:
%pylab inline
In [3]:
import sys
sys.path.insert(0, "../")
In [4]:
import numpy
import pandas
import root_numpy
from sklearn.cross_validation import train_test_split
from utils import calibrate_probs, predict_by_estimator, compute_B_prob_using_part_prob
from utils import calculate_auc_with_and_without_untag_events, compute_mistag, result_table
from utils import compute_sum_of_charges, get_events_statistics, compute_N_B_events_MC, add_diff_pt
from scipy.special import logit
In [5]:
dict_SS_OS = {'SS': 1, 'OS': -1, 'NAN': 0}
In [6]:
tag_kstar = pandas.read_csv('../datasets/MC/csv/WG/Bd_JPsiKstar/2012/Tracks.csv', sep='\t',
usecols=['xFlag', 'OS_SS', 'BOosc'])
tag_ks = pandas.read_csv('../datasets/MC/csv/WG/Bd_JPsiKstar/2012/Tracks.csv', sep='\t',
usecols=['xFlag', 'OS_SS', 'BOosc'])
tag_k = pandas.read_csv('../datasets/MC/csv/WG/Bd_JPsiKstar/2012/Tracks.csv', sep='\t',
usecols=['xFlag', 'OS_SS', 'BOosc'])
In [7]:
for br in ['xFlag', 'OS_SS']:
print br
for name, t in dict_SS_OS.items():
print name,
print '\tK\t', 100. * sum(tag_k[br] == t) / len(tag_k[br]),
print '\tK*\t', 100. * sum(tag_kstar[br] == t) / len(tag_kstar[br]),
print '\tKs\t', 100. * sum(tag_ks[br] == t) / len(tag_ks[br])
print '\n'
In [8]:
mask_ks = tag_ks['BOosc'] == 0
mask_kstar = tag_kstar['BOosc'] == 0
for name, t in dict_SS_OS.items():
print name,
print '\tK*\t', 100. * sum(tag_kstar['xFlag'][mask_kstar] == t) / len(tag_kstar['xFlag'][mask_kstar]),
print '\tKs\t', 100. * sum(tag_ks['xFlag'][mask_ks] == t) / len(tag_ks['xFlag'][mask_ks])
In [9]:
k_data = pandas.read_csv('../models/part_tracks_MC.csv')
In [10]:
from utils import estimate_channel
In [11]:
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
from folding_group import FoldingGroupClassifier
features = ['cos_diff_phi', 'diff_pt', 'partPt', 'partP', 'nnkrec', 'diff_eta', 'EOverP',
'ptB', 'sum_PID_mu_k', 'proj', 'PIDNNe', 'sum_PID_k_e', 'PIDNNk', 'sum_PID_mu_e', 'PIDNNm',
'phi', 'IP', 'IPerr', 'IPs', 'veloch', 'max_PID_k_e', 'ghostProb',
'IPPU', 'eta', 'max_PID_mu_e', 'max_PID_mu_k', 'partlcs']
In [12]:
import cPickle
with open('../models/dt_MC.pkl', 'r') as f:
tt_folding = cPickle.load(f)
In [33]:
import cPickle
with open('../models/calibrator_tracks_MC2.pkl', 'r') as f:
calibrator_tracks = cPickle.load(f)
In [32]:
import cPickle
with open('../models/calibrator_tracks_MC2.pkl', 'w') as f:
cPickle.dump(calibrator_tracks, f)
In [14]:
import cPickle
with open('../models/calibrator_B_MC.pkl', 'r') as f:
calibrator_B = cPickle.load(f)
In [15]:
import cPickle
with open('../models/dt_ss_os_only.pkl', 'r') as f:
tt_os_ss = cPickle.load(f)
In [16]:
with open('../models/os_ss_calibrator_only.pkl', 'r') as f:
calibrator_os_ss = cPickle.load(f)
In [17]:
from utils import data_tracks_preprocessing
In [18]:
data_kstar = pandas.read_csv('../datasets/MC/csv/WG/Bd_JPsiKstar/2012/Tracks.csv', sep="\t")
data_kstar = data_tracks_preprocessing(data_kstar)
In [19]:
# N_B_kstar = compute_N_B_events_MC('../datasets/MC/csv/WG/Bd_JPsiKstar/2012/Tracks.root',
# '../datasets/MC/csv/WG/Bd_JPsiKstar/2012/Vertices.root')
N_B_kstar = 431667.0
In [20]:
N_B_kstar
Out[20]:
In [21]:
mean_kstar = compute_sum_of_charges(data_kstar, "$K^*$", bins=21,
event_id_column='event_id', show_with_signal=False)
In [22]:
mean_kstar
Out[22]:
In [23]:
data_kstar_new = data_kstar.copy()
data_kstar_new.ix[data_kstar.OS_SS >= 0, 'signTrack'] *= -1
mean_kstar_invert = compute_sum_of_charges(data_kstar_new, "$K^*$", bins=21,
event_id_column='event_id', show_with_signal=False)
In [24]:
mean_kstar_invert
Out[24]:
In [25]:
p_kstar = tt_folding.predict_proba(data_kstar)[:, 1]
p_kstar_calib = calibrator_tracks.predict_proba(p_kstar)
p_kstar_ss = tt_os_ss.predict_proba(data_kstar)[:, 1]
p_kstar_ss_calib = calibrator_os_ss.predict_proba(p_kstar_ss)
In [26]:
data_kstar_new = data_kstar.copy()
data_kstar_new['signTrack'] = data_kstar_new['signTrack'].values * (1 - p_kstar_ss_calib) + p_kstar_ss_calib * (-data_kstar_new['signTrack'].values)
mean_kstar_invert = compute_sum_of_charges(data_kstar_new, "$K^*$", bins=21,
event_id_column='event_id', show_with_signal=False)
In [27]:
mean_kstar_invert
Out[27]:
In [28]:
from sklearn.metrics import roc_auc_score
In [29]:
for br in ['xFlag', 'OS_SS']:
print br
for name, group in dict_SS_OS.items():
mask = data_kstar[br].values == group
print name, 'K*', roc_auc_score((data_kstar.signTrack.values * data_kstar.signB.values > 0)[mask],
p_kstar[mask]),
mask = k_data[br].values == group
print 'K+/-', roc_auc_score(k_data.label.values[mask], k_data.part_prob.values[mask])
print '\n'
In [30]:
os_region_mask = (data_kstar.IPs > 3) & ((abs(data_kstar.diff_eta) > 0.6) | (abs(data_kstar.diff_phi) > 0.825))
ss_region_mask = (abs(data_kstar.diff_eta) < 0.6) & (abs(data_kstar.diff_phi) < 0.825) & (data_kstar.IPs < 3)
In [ ]:
table_kstar, _ = estimate_channel(p_kstar, data_kstar,
N_B_kstar, name="K*, OS/SS tag inverting", calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
mask_to_invert=data_kstar.OS_SS.values >= 0)
table_kstar
In [111]:
table_kstar, _ = estimate_channel(p_kstar, data_kstar,
N_B_kstar, name="K*, OS/SS tag inverting", calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
mask_to_invert=data_kstar.OS_SS.values >= 0)
table_kstar
Out[111]:
In [40]:
p_kstar_ss_calib_new = p_kstar_ss_calib.copy()
# p_kstar_ss_calib_new[data_kstar.OS_SS.values < 0] = 0.
# p_kstar_ss_calib_new[(data_kstar.OS_SS.values > 0) & (p_kstar_ss_calib_new <= 0.5)] = 0.
p_kstar_ss_calib_new[(p_kstar_ss_calib_new <= 0.5)] = 0.
p_kstar_ss_calib_new[data_kstar.OS_SS.values < 0] = 0.
table_kstar_BDT, prepared_data = estimate_channel(p_kstar, data_kstar,
N_B_kstar, name="K* BDT inverting",
calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
prior=p_kstar_ss_calib_new, for_epm=True)
table_kstar_BDT
Out[40]:
In [32]:
p_kstar_ss_calib_new = p_kstar_ss_calib.copy()
p_kstar_ss_calib_new[data_kstar.OS_SS.values < 0] = 0.
p_kstar_ss_calib_new[data_kstar.OS_SS.values == 1] = 1.
table_kstar_BDT, prepared_data = estimate_channel(p_kstar, data_kstar,
N_B_kstar, name="K* BDT inverting",
calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
prior=p_kstar_ss_calib_new, for_epm=True)
table_kstar_BDT
Out[32]:
In [33]:
root_numpy.array2root(prepared_data.to_records(index=False), "../EPM/kstar_MC_inclusive.root",
mode='recreate')
In [30]:
table_kstar_BDT, prepared_data = estimate_channel(p_kstar, data_kstar,
N_B_kstar, name="K* BDT inverting",
calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
prior=p_kstar_ss_calib, for_epm=True)
table_kstar_BDT
Out[30]:
In [31]:
root_numpy.array2root(prepared_data.to_records(index=False), "../EPM/kstar_MC_inclusive.root",
mode='recreate')
In [112]:
table_kstar_BDT, prepared_data = estimate_channel(p_kstar, data_kstar,
N_B_kstar, name="K* BDT inverting",
calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
prior=p_kstar_ss_calib, for_epm=True)
table_kstar_BDT
Out[112]:
In [121]:
scatter(p_kstar_ss_calib[data_kstar.OS_SS.values == -1], p_kstar_calib[data_kstar.OS_SS.values == -1], alpha=0.01)
Out[121]:
In [122]:
scatter(p_kstar_ss_calib[data_kstar.OS_SS.values == 1], p_kstar_calib[data_kstar.OS_SS.values == 1], alpha=0.01)
Out[122]:
In [123]:
scatter(p_kstar_ss_calib[data_kstar.OS_SS.values == 0], p_kstar_calib[data_kstar.OS_SS.values == 0], alpha=0.01)
Out[123]:
In [151]:
p_kstar_ss_calib_new = p_kstar_ss_calib.copy()
p_kstar_ss_calib_new[data_kstar.OS_SS.values < 0] = 0.
p_kstar_ss_calib_new[data_kstar.OS_SS.values == 1] = 1.
table_kstar_BDT, prepared_data = estimate_channel(p_kstar, data_kstar,
N_B_kstar, name="K* BDT inverting",
calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
prior=p_kstar_ss_calib_new, for_epm=True)
table_kstar_BDT
Out[151]:
In [152]:
root_numpy.array2root(prepared_data.to_records(index=False), "../EPM/kstar_MC_inclusive.root",
mode='recreate')
In [116]:
data_kstar['label'] = (data_kstar.signB.values * data_kstar.signTrack.values > 0) * 1
In [117]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6,
max_features=15, n_threads=12)
tt_folding_kstar = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
train_features=features, group_feature='group_column')
%time tt_folding_kstar.fit(data_kstar, data_kstar.label)
Out[117]:
In [118]:
p_kstar_own = tt_folding_kstar.predict_proba(data_kstar)[:, 1]
roc_auc_score(data_kstar.label.values, p_kstar_own)
Out[118]:
In [127]:
calibrator_tracks_kstar, calibrator_B_kstar, table_kstar_self, _ = estimate_channel(p_kstar_own, data_kstar,
N_B_kstar, name="K* self training")
table_kstar_self
Out[127]:
In [128]:
p_kstar_own_calib = calibrator_tracks_kstar.predict_proba(p_kstar_own)
In [137]:
from scipy.special import logit, expit
p_corr = logit(p_kstar_own_calib) * logit(p_kstar_calib)
weight = numpy.ones(len(p_corr))
weight[abs(p_corr) < 0.05] = 0
In [140]:
p_corr_new = expit(p_corr)
In [139]:
sum(data_kstar.OS_SS.values < 0), sum(data_kstar.OS_SS.values > 0), sum(data_kstar.OS_SS.values == 0)
Out[139]:
In [138]:
hist(p_corr, normed=True, alpha=0.4, label='OS', bins=100, weights=weight * (data_kstar.OS_SS.values < 0));
plt.show()
hist(p_corr, normed=True, alpha=0.4, label='SS', bins=100, weights=weight* (data_kstar.OS_SS.values > 0));
plt.show()
hist(p_corr, normed=True, alpha=0.4, label='NAN', bins=100, weights=weight * (data_kstar.OS_SS.values == 0));
legend();
In [53]:
import cPickle
with open('../models/dt_kstar_MC.pkl', 'w') as f:
cPickle.dump(tt_folding_kstar, f)
with open('../models/calibrator_tracks_kstar_MC.pkl', 'w') as f:
cPickle.dump(calibrator_tracks_kstar, f)
with open('../models/calibrator_B_kstar_MC.pkl', 'w') as f:
cPickle.dump(calibrator_B_kstar, f)
In [28]:
data_ks = pandas.read_csv('../datasets/MC/csv/WG/Bd_JPsiKs/2012/Tracks.csv', sep="\t")
data_ks = data_ks.ix[numpy.isfinite(data_ks.IPs), :]
add_features(data_ks)
get_events_statistics(data_ks)
Out[28]:
In [29]:
# N_B_ks = compute_N_B_events_MC('datasets/MC/csv/Bd_JPsiKs/Tracks.root',
# 'datasets/MC/csv/Bd_JPsiKs/Vertices.root')
N_B_ks = 130875.0
In [30]:
data_ks = data_ks.query(selection)
get_events_statistics(data_ks)
Out[30]:
In [31]:
mean_ks = compute_sum_of_charges(data_ks, "$K_s$", bins=21, event_id_column='event_id', show_with_signal=False)
In [32]:
mean_ks
Out[32]:
In [33]:
data_ks_new = data_ks.copy()
data_ks_new.ix[data_ks.OS_SS >= 0, 'signTrack'] *= -1
mean_ks_invert = compute_sum_of_charges(data_ks_new, "$K_s$",
bins=21, event_id_column='event_id', show_with_signal=False)
In [34]:
mean_ks_invert
Out[34]:
In [35]:
p_ks = tt_folding.predict_proba(data_ks)[:, 1]
p_ks_ss = tt_os_ss.predict_proba(data_ks)[:, 1]
p_ks_ss_calib = calibrator_os_ss.predict_proba(p_ks_ss)
In [36]:
for name, group in dict_SS_OS.items():
mask = data_ks.OS_SS.values == group
print name, 'Ks', roc_auc_score((data_ks.signTrack.values * data_ks.signB.values > 0)[mask], p_ks[mask]),
mask = k_data.OS_SS.values == group
print 'K+-', roc_auc_score(k_data.label.values[mask], k_data.part_prob.values[mask])
In [37]:
table_ks = estimate_channel(p_ks, data_ks,
N_B_ks, name="Ks, OS/SS tag inverting", calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
mask_to_invert=data_ks.OS_SS.values >= 0)
table_ks
Out[37]:
In [38]:
table_ks_BDT = estimate_channel(p_ks, data_ks,
N_B_ks, name="Ks BDT inverting", calibrator_tracks=calibrator_tracks,
calibrator_B=calibrator_B,
prior=p_ks_ss_calib)
table_ks_BDT
Out[38]:
In [44]:
data_ks.index = range(len(data_ks))
In [45]:
_, ids = numpy.unique(data_ks.group_column, return_index=True)
ids_minus = ids[data_ks.ix[ids, 'signB'].values == -1]
ids_plus = ids[data_ks.ix[ids, 'signB'].values == 1]
print sum(data_ks.ix[ids, 'signB'] == 1), sum(data_ks.ix[ids, 'signB'] == -1)
In [46]:
data_ks.N_sig_sw = 1.
data_ks.ix[data_ks.signB == 1, 'N_sig_sw'] = 1. * sum(data_ks.ix[ids, 'signB'] == -1) / sum(data_ks.ix[ids, 'signB'] == 1)
In [47]:
ids_minus_take = numpy.random.choice(ids_minus, replace=False, size=len(ids_plus))
In [48]:
mask = numpy.in1d(data_ks.group_column.values, data_ks.group_column.values[ids_plus]) | \
numpy.in1d(data_ks.group_column.values, data_ks.group_column.values[ids_minus_take])
data_ks_new = data_ks.loc[mask, :].copy()
In [49]:
data_ks_new.index = range(len(data_ks_new))
_, ids = numpy.unique(data_ks_new.group_column, return_index=True)
print sum(data_ks_new.ix[ids, 'signB'] == 1), sum(data_ks_new.ix[ids, 'signB'] == -1)
In [50]:
# N_B_ks_new = sum(data_ks.ix[ids, 'N_sig_sw'])
N_B_ks_new = len(numpy.unique(data_ks_new.group_column))
N_B_ks_new
Out[50]:
In [51]:
data_ks_new.N_sig_sw = 1
In [52]:
sum(data_ks_new.signB == 1), sum(data_ks_new.signB == -1)
sum(data_ks_new.signTrack == 1), sum(data_ks_new.signTrack == -1)
Out[52]:
In [53]:
sum(data_ks_new.label == 0), sum(data_ks_new.label == 1)
Out[53]:
In [54]:
data_ks_new['label'] = (data_ks_new.signB.values * data_ks_new.signTrack.values > 0) * 1
tt_base = DecisionTrainClassifier(learning_rate=0.01, n_estimators=1000, depth=6,
n_threads=12)
tt_folding_ks = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=15,
train_features=features, group_feature='group_column')
%time tt_folding_ks.fit(data_ks_new, data_ks_new.label)
Out[54]:
In [55]:
p_ks_own = tt_folding_ks.predict_proba(data_ks_new)[:, 1]
roc_auc_score(data_ks_new.label, p_ks_own)
Out[55]:
In [56]:
p, _ = calibrate_probs(data_ks_new.label.values, numpy.ones(len(data_ks_new)), p_ks_own, logistic=True,
random_state=42)
In [57]:
roc_auc_score(data_ks_new.label, p)
Out[57]:
In [58]:
from rep.report.metrics import RocAuc
tt_folding_ks.test_on(data_ks_new, data_ks_new.label).learning_curve(RocAuc(), steps=1)
Out[58]:
In [59]:
table_ks_self = estimate_channel(p_ks_own, data_ks_new, N_B_ks_new, name="Ks self training")
table_ks_self
Out[59]:
In [60]:
pandas.concat([table_kstar, table_ks])
Out[60]:
In [61]:
pandas.concat([table_kstar_BDT, table_ks_BDT])
Out[61]:
In [62]:
pandas.concat([table_kstar_self, table_ks_self])
Out[62]: