In [2]:
# Drop the Linux page cache (requires root) so subsequent I/O starts cold.
!sudo sh -c 'echo 3 >/proc/sys/vm/drop_caches'
In [3]:
# Show free/used memory in MB after the cache drop.
!free -m
In [4]:
# Pulls numpy and matplotlib names (numpy, figure, hist, xlabel, legend, plt, ...)
# into the global namespace; later cells rely on these pylab globals.
%pylab inline
In [5]:
# Dead code kept for provenance: an earlier loading variant that applied the
# (e_cut | mu_cut | K_cut) selection directly inside root2array instead of the
# ghostProb/PIDNN query used below.
# import root_numpy
# import pandas
# data_k = pandas.DataFrame(root_numpy.root2array('datasets/MC/csv/Bu_JPsiK/Tracks.root',
# selection='(e_cut == 1) || (mu_cut == 1) || (K_cut == 1)'))
# data_ks = pandas.DataFrame(root_numpy.root2array('datasets/MC/csv/Bd_JPsiKs//Tracks.root',
# selection='(e_cut == 1) || (mu_cut == 1) || (K_cut == 1)'))
In [6]:
import root_numpy
import pandas
# NOTE(review): this `selection` string is declared here but NOT passed to
# root2array below — the identical string is re-declared and actually applied
# later via DataFrame.query in In [12]. One of the two declarations is redundant.
selection = '(ghostProb < 0.4) & ((PIDNNk > 0) | (PIDNNe > 0) | (PIDNNm > 0) | (PIDNNp > 0) | (PIDNNpi > 0))'
# Full (unselected) MC track tables: B+ -> J/psi K+ and B0 -> J/psi KS samples.
data_k = pandas.DataFrame(root_numpy.root2array('datasets/MC/csv/Bu_JPsiK/Tracks.root'))
data_ks = pandas.DataFrame(root_numpy.root2array('datasets/MC/csv/Bd_JPsiKs/Tracks.root'))
In [7]:
# Sanity check: number of tracks loaded in each sample.
len(data_k), len(data_ks)
Out[7]:
In [8]:
from utils import add_diff_pt
In [9]:
from itertools import combinations

def add_features(data, event_id_column='event_id'):
    """Add event-grouping, kinematic and combined-PID features to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Track table with ``run``/``event`` columns, the inputs needed by
        ``add_diff_pt`` and the ``PIDNNk``/``PIDNNe``/``PIDNNm`` responses.
    event_id_column : str
        Name of the column that will hold the "run_event" string identifier.
    """
    # Unique integer per (run, event) pair — used as the folding group key.
    event_id = data.run.apply(str) + '_' + data.event.apply(str)
    data['group_column'] = numpy.unique(event_id, return_inverse=True)[1]
    # all weights are 1, because this is MC
    data[event_id_column] = event_id
    add_diff_pt(data)
    # add cos(diff_phi)
    data.loc[:, 'cos_diff_phi'] = numpy.cos(data.diff_phi.values)
    # Fixed, ordered list instead of a dict: combinations() emits pairs in input
    # order, so this guarantees the generated column names (max_PID_mu_k,
    # max_PID_mu_e, max_PID_k_e and the sum_PID_* counterparts) are deterministic
    # and match the names referenced by the downstream `features` list,
    # independent of the Python version's dict iteration order.
    PIDs = [('mu', data.PIDNNm.values),
            ('k', data.PIDNNk.values),
            ('e', data.PIDNNe.values)]
    for (pid_name1, pid_values1), (pid_name2, pid_values2) in combinations(PIDs, 2):
        data.loc[:, 'max_PID_{}_{}'.format(pid_name1, pid_name2)] = numpy.maximum(pid_values1, pid_values2)
        data.loc[:, 'sum_PID_{}_{}'.format(pid_name1, pid_name2)] = pid_values1 + pid_values2
In [10]:
add_features(data_k)
add_features(data_ks)
In [12]:
# Duplicate of the string declared in In [6]; applied here to keep only tracks
# with acceptable ghost probability and at least one positive PID response.
selection = '(ghostProb < 0.4) & ((PIDNNk > 0) | (PIDNNe > 0) | (PIDNNm > 0) | (PIDNNp > 0) | (PIDNNpi > 0))'
data_k = data_k.query(selection)
data_ks = data_ks.query(selection)
In [13]:
# Stack both samples into one frame; label 0 = B+ -> J/psi K tracks,
# label 1 = B0 -> J/psi KS tracks. Relies on concat preserving the
# [data_k, data_ks] row order so the label list lines up.
data = pandas.concat([data_k, data_ks])
data['label'] = [0] * len(data_k) + [1] * len(data_ks)
In [14]:
from folding_group import FoldingGroupClassifier
from rep.report.metrics import RocAuc
from rep.estimators import SklearnClassifier
from decisiontrain import DecisionTrainClassifier
In [15]:
# Species-specific legacy tagging feature sets. Entries of the form
# 'alias: expression' use REP notation: the expression is evaluated on the
# fly by the estimator and exposed under the alias.
features_ele = ['mult', 'log_partPt: log(partPt)', 'log_partP: log(partP)',
'log_ptB: log(ptB)', 'log_IPs: log(IPs)', 'partlcs', 'EOverP',
'ghostProb', 'log_IPPU: log(IPPU)']
features_muon = ['mult', 'log_partPt: log(partPt)', 'log_partP: log(partP)',
'log_ptB: log(ptB)', 'log_IPs: log(IPs)', 'partlcs', 'PIDNNm', 'ghostProb', 'log_IPPU: log(IPPU)']
features_kaon = ['mult', 'log_partPt: log(partPt)', 'log_partP: log(partP)',
'nnkrec','log_ptB: log(ptB)', 'log_IPs: log(IPs)', 'partlcs',
'PIDNNk', 'PIDNNpi', 'PIDNNp', 'ghostProb', 'log_IPPU: log(IPPU)']
In [16]:
# Inclusive-tagging feature set shared by all tracks (plain column names only;
# the max_PID_*/sum_PID_* columns are produced by add_features above).
features = ['cos_diff_phi', 'diff_pt', 'partPt', 'partP', 'nnkrec', 'diff_eta', 'EOverP',
'ptB', 'sum_PID_mu_k', 'proj', 'PIDNNe', 'sum_PID_k_e', 'PIDNNk', 'sum_PID_mu_e', 'PIDNNm',
'phi', 'IP', 'IPerr', 'IPs', 'veloch', 'max_PID_k_e', 'ghostProb',
'IPPU', 'eta', 'max_PID_mu_e', 'max_PID_mu_k', 'partlcs']
In [17]:
selected_tt = dict()
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=500, depth=6,
                                  max_features=7, n_threads=12)
# For each species mask (electron / muon / kaon, plus their union) train two
# 2-fold, event-grouped classifiers: one on the species-specific legacy
# feature set and one on the shared inclusive feature set.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); for
# boolean-mask rows with ':' columns the two are equivalent.
for mask, name, feat in zip([data.e_cut == 1,
                             data.mu_cut == 1,
                             data.K_cut == 1,
                             (data.mu_cut == 1) | (data.e_cut == 1) | (data.K_cut == 1)],
                            ['electron', 'muon', 'kaon', 'all particles'],
                            [features_ele, features_muon, features_kaon,
                             set.union(set(features_ele),
                                       set(features_muon),
                                       set(features_kaon))]):
    selected_tt[name] = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                                               train_features=feat,
                                               group_feature='group_column')
    selected_tt[name].fit(data.loc[mask, :], data.loc[mask, 'label'])
    selected_tt[name + '/all features'] = FoldingGroupClassifier(SklearnClassifier(tt_base),
                                                                 n_folds=2, random_state=11,
                                                                 train_features=features,
                                                                 group_feature='group_column')
    selected_tt[name + '/all features'].fit(data.loc[mask, :], data.loc[mask, 'label'])
In [18]:
data = data.ix[numpy.isfinite(data.IPs), :]
In [19]:
# Baseline: 2-fold, event-grouped classifier on ALL tracks (after the IPs
# finiteness cut) using the full inclusive tagging feature set.
selected_tt['all tracks, inclusive tagging features'] = FoldingGroupClassifier(SklearnClassifier(tt_base),
n_folds=2, random_state=11,
train_features=features,
group_feature='group_column')
selected_tt['all tracks, inclusive tagging features'].fit(data, data.label)
Out[19]:
In [20]:
# Ablation: same setup with diff_eta removed from the feature set.
selected_tt['all tracks wo diff_eta'] = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
train_features=list(set(features) - {'diff_eta'}),
group_feature='group_column')
selected_tt['all tracks wo diff_eta'].fit(data, data.label)
Out[20]:
In [30]:
# Ablation: drop ptB, eta, proj and diff_eta — presumably the features most
# tied to the B-meson kinematics, to test how much they drive the separation.
# TODO(review): confirm this interpretation with the author.
selected_tt['all tracks wo main'] = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
train_features=list(set(features) - {'ptB', 'eta',
'proj',
'diff_eta'}),
group_feature='group_column')
selected_tt['all tracks wo main'].fit(data, data.label)
Out[30]:
In [22]:
from collections import OrderedDict
In [23]:
report = OrderedDict()
# Per-species evaluation: for each mask compare the legacy tagging feature set
# against the full inclusive feature set on the same selected tracks, storing
# the out-of-fold ROC AUC under a readable key.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); for
# boolean-mask rows with ':' columns the two are equivalent.
for mask, name in zip([data.e_cut == 1,
                       data.mu_cut == 1,
                       data.K_cut == 1,
                       (data.mu_cut == 1) | (data.e_cut == 1) | (data.K_cut == 1)],
                      ['electron', 'muon', 'kaon', 'all particles']):
    r_temp = selected_tt[name].test_on(data.loc[mask, :], data.loc[mask, 'label'])
    report[name + ', old tagging features'] = r_temp.compute_metric(RocAuc())['clf']
    r_temp = selected_tt[name + '/all features'].test_on(data.loc[mask, :], data.loc[mask, 'label'])
    report[name + ', all current features'] = r_temp.compute_metric(RocAuc())['clf']
In [24]:
# ROC AUC of the all-tracks baseline and the two ablations, on the full frame.
report['all tracks, inclusive tagging features'] = selected_tt['all tracks, inclusive tagging features'].test_on(data, data.label).compute_metric(RocAuc())['clf']
report['all tracks wo diff_eta'] = selected_tt['all tracks wo diff_eta'].test_on(data, data.label).compute_metric(RocAuc())['clf']
report['all tracks wo main'] = selected_tt['all tracks wo main'].test_on(data, data.label).compute_metric(RocAuc())['clf']
In [28]:
# Build a report object for the baseline model to inspect feature importances.
r = selected_tt['all tracks, inclusive tagging features'].test_on(data, data.label)
In [29]:
# Feature importance table of the inclusive all-tracks model.
r.feature_importance()
Out[29]:
In [32]:
# Summary table of all ROC AUC results. Materialize the dict views as lists:
# the DataFrame constructor does not reliably accept dict_keys/dict_values
# view objects across pandas versions, and lists keep the OrderedDict
# insertion order, so names and scores stay paired row by row.
pandas.DataFrame({'names': list(report.keys()), 'roc: K vs Ks': list(report.values())})
Out[32]:
In [26]:
figure(figsize=(22, 60))
# One normalized K vs Ks histogram per feature, 3 per row.
for n, f in enumerate(features):
    # Grid sized from the feature count (generalizes the hard-coded 10x3 grid,
    # which would overflow beyond 30 features).
    plt.subplot((len(features) + 2) // 3, 3, n + 1)
    # .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
    x1 = data.loc[data.label == 0, f].values
    x2 = data.loc[data.label == 1, f].values
    # Drop large-negative sentinel values before computing the plot range.
    mask1 = x1 > -400
    mask2 = x2 > -400
    # Shared 1-99 percentile range so both histograms use identical binning.
    r1 = numpy.percentile(x1[mask1], [1, 99])
    r2 = numpy.percentile(x2[mask2], [1, 99])
    r = (min(r1[0], r2[0]), max(r1[1], r2[1]))
    # density=True replaces normed=True (deprecated in matplotlib 2.1,
    # removed in 3.1).
    hist(x1[mask1], density=True, alpha=0.5, bins=100, range=r, label='K^{+/-}')
    hist(x2[mask2], density=True, alpha=0.5, bins=100, range=r, label='Ks')
    xlabel(f)
    legend()
In [ ]: