In [1]:
# Pull the pylab namespace into the notebook and render figures inline; later cells
# use bare names such as hist, figure, subplot, sqrt and plt without importing them.
%pylab inline
# Default figure size for the plots below.
figsize(8, 6)
In [2]:
import pandas
import numpy
from folding_group import FoldingGroupClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
In [3]:
from utils import get_N_B_events, get_events_number, get_events_statistics
In [4]:
import root_numpy
# Read the 'tracks' tree of the ROOT file into a DataFrame. The `_nan` suffix marks the
# raw table: rows with missing values are only dropped later, when `data` is built.
data_nan = pandas.DataFrame(root_numpy.root2array('datasets/tracks.root', 'tracks'))
In [5]:
# Preview the first rows of the raw table.
data_nan.head()
Out[5]:
In [6]:
event_id_column = 'event_id'
# An event is identified by the (run, event) pair, joined into a single string key.
event_id = data_nan.run.apply(str) + '_' + data_nan.event.apply(str)
# Integer code of each event id (index into the sorted unique ids); passed later to the
# folding classifier as its group feature.
data_nan['group_column'] = numpy.unique(event_id, return_inverse=True)[1]
data_nan[event_id_column] = event_id
In [7]:
# Event statistics of the raw table (helper from utils; its exact output is not visible here).
get_events_statistics(data_nan)
Out[7]:
In [8]:
# Reference number of B events (helper from utils); used below as the denominator
# of the tagging efficiency.
get_N_B_events()
Out[8]:
In [9]:
# Drop tracks with any missing value. The explicit copy makes `data` an independent
# frame, so the column assignments in later cells (`data.loc[:, ...] = ...`) are not
# flagged as writes to a slice of `data_nan` (SettingWithCopyWarning).
data = data_nan.dropna().copy()
# Row counts before/after the drop, plus the event statistics of the cleaned table.
len(data_nan), len(data), get_events_statistics(data)
Out[9]:
In [10]:
# add the difference between the max pt in the event and the pt of each track
def add_diff_pt(data):
    """
    Add a 'diff_pt' column to `data` in place.

    For every row the value is the largest partPt among the rows sharing the same
    event id minus the row's own partPt. Nothing is returned.
    """
    event_keys = data[event_id_column].values.astype(str)
    track_pt = data['partPt'].values
    event_max_pt = group_max(event_keys, track_pt)
    data.loc[:, 'diff_pt'] = event_max_pt - track_pt
# group_max computes the max of some data over the tracks in the same event
def group_max(groups, data):
    """
    Broadcast the per-group maximum of `data` back onto every element.

    :param groups: 1-d array-like of group labels, one per element
    :param data: 1-d array-like of numeric values, same length as `groups`
    :return: float array of len(data); element i is the maximum of `data`
        over all elements whose label equals groups[i]
    """
    assert len(groups) == len(data)
    # map every label to a dense integer id in [0, n_groups)
    unique_groups, group_ids = numpy.unique(groups, return_inverse=True)
    # start from -inf so that any real value replaces it; sizing the buffer by the
    # number of unique labels (instead of max(ids) + 1) also works for empty input
    max_over_group = numpy.full(len(unique_groups), -numpy.inf)
    # unbuffered in-place maximum, so repeated ids are all taken into account
    numpy.maximum.at(max_over_group, group_ids, data)
    return max_over_group[group_ids]
In [11]:
# add diff pt (in place: writes the 'diff_pt' column into `data`)
add_diff_pt(data)
# add cos(diff_phi)
data.loc[:, 'cos_diff_phi'] = numpy.cos(data.diff_phi.values)
In [12]:
from itertools import combinations

# PID responses keyed by the short particle name that appears in the new column names
PIDs = {
    'k': data.PIDNNk.values,
    'e': data.PIDNNe.values,
    'mu': data.PIDNNm.values,
}

# for every unordered pair of PID responses add their element-wise maximum and their sum
for (name_a, values_a), (name_b, values_b) in combinations(PIDs.items(), 2):
    max_column = 'max_PID_{}_{}'.format(name_a, name_b)
    sum_column = 'sum_PID_{}_{}'.format(name_a, name_b)
    data.loc[:, max_column] = numpy.maximum(values_a, values_b)
    data.loc[:, sum_column] = values_a + values_b
In [13]:
# Binary target: 1 when signB * signTrack is positive (both have the same sign), else 0.
data.loc[:, 'label'] = (data.signB.values * data.signTrack.values > 0) * 1
In [14]:
# Show all column names as one comma-separated string.
','.join(data.columns)
Out[14]:
In [15]:
# Pre-selection on the ghost probability.
initial_cut = '(ghostProb < 0.4)'
data = data.query(initial_cut)
# Keep tracks with IPs > 3 and (|diff_eta| > 0.6 or |diff_phi| > 0.825);
# `*` between the two boolean arrays acts as a logical AND.
os_selection = (data.IPs.values > 3) * ((abs(data.diff_eta.values) > 0.6) | (abs(data.diff_phi.values) > 0.825))
data = data[os_selection]
In [16]:
# Event statistics after the ghost-probability and IPs / diff_eta / diff_phi selection.
get_events_statistics(data)
Out[16]:
In [17]:
# Thresholds on the PID responses used in the selection string below.
threshold_kaon = 0.
threshold_muon = 0.
threshold_electron = 0.
threshold_pion = 0.5
threshold_proton = 0.5
# Keep a track when at least one of PIDNNk / PIDNNm / PIDNNe exceeds its threshold
# and both PIDNNpi and PIDNNp are below theirs.
cut_pid = " ((PIDNNk > {trk}) | (PIDNNm > {trm}) | (PIDNNe > {tre})) & (PIDNNpi < {trpi}) & (PIDNNp < {trp}) "
cut_pid = cut_pid.format(trk=threshold_kaon, trm=threshold_muon, tre=threshold_electron, trpi=threshold_pion,
trp=threshold_proton)
data = data.query(cut_pid)
In [18]:
# Event statistics after the PID selection.
get_events_statistics(data)
Out[18]:
where $sw_i$ is the sPlot weight (the sWeight for signal)

$$\epsilon_{tag} = \frac{N (\text{passed selection})} {N (\text{all events})}$$

$$\Delta\epsilon_{tag} = \frac{\sqrt{N (\text{passed selection})}} {N (\text{all events})}$$
In [19]:
# Tagging efficiency N(passed selection) / N(all events) and its uncertainty
# sqrt(N(passed selection)) / N(all events).
# float() keeps the divisions below from being integer divisions under Python 2.
N_B_passed = float(get_events_number(data))
tagging_efficiency = N_B_passed / get_N_B_events()
tagging_efficiency_delta = sqrt(N_B_passed) / get_N_B_events()
tagging_efficiency, tagging_efficiency_delta
Out[19]:
In [20]:
# Distribution of diff_pt over all selected tracks.
hist(data.diff_pt.values, bins=100)
# trailing `pass` keeps the cell from echoing the value returned by hist
pass
In [21]:
# return_index gives the position of the first track of every event, so the B mass and
# the sWeight enter the histograms once per event instead of once per track
# (presumably both are identical for all tracks of an event; verify against the data).
_, take_indices = numpy.unique(data[event_id_column], return_index=True)
figure(figsize=[15, 5])
# left panel: B mass
subplot(1, 2, 1)
hist(data.Bmass.values[take_indices], bins=100)
title('B mass hist')
xlabel('mass')
# right panel: normalised distribution of the signal sWeights
subplot(1, 2, 2)
hist(data.N_sig_sw.values[take_indices], bins=100, normed=True)
title('sWeights hist')
xlabel('signal sWeights')
plt.savefig('img/Bmass_OS.png' , format='png')
In [22]:
# Split the tracks by signal sWeight: rows above the threshold are used for training
# below, the remaining rows are kept and passed to the evaluation helpers together
# with the first part.
sweight_threshold = 1.
data_sw_passed = data[data.N_sig_sw > sweight_threshold]
data_sw_not_passed = data[data.N_sig_sw <= sweight_threshold]
get_events_statistics(data_sw_passed)
Out[22]:
In [23]:
# Same pair of plots as for the full sample, restricted to the sWeight > 1 selection;
# again only the first track of every event is taken.
_, take_indices = numpy.unique(data_sw_passed[event_id_column], return_index=True)
figure(figsize=[15, 5])
# left panel: B mass
subplot(1, 2, 1)
hist(data_sw_passed.Bmass.values[take_indices], bins=100)
title('B mass hist for sWeight > 1 selection')
xlabel('mass')
# right panel: normalised distribution of the signal sWeights
subplot(1, 2, 2)
hist(data_sw_passed.N_sig_sw.values[take_indices], bins=100, normed=True)
title('sWeights hist for sWeight > 1 selection')
xlabel('signal sWeights')
plt.savefig('img/Bmass_selected_OS.png' , format='png')
In [24]:
# Distribution of diff_pt for the tracks that passed the sWeight selection.
hist(data_sw_passed.diff_pt.values, bins=100)
# trailing `pass` keeps the cell from echoing the value returned by hist
pass
In [25]:
# Training features: every column of `data` except the names listed here (identifiers,
# target, weights and other excluded variables).
# NOTE(review): the list comes from a set, so the feature order is whatever the set
# iteration yields; sort it if a stable order is needed.
features = list(set(data.columns) - {'index', 'run', 'event', 'i', 'signB', 'signTrack', 'N_sig_sw', 'Bmass', 'mult',
'PIDNNp', 'PIDNNpi', 'label', 'thetaMin', 'Dist_phi', event_id_column,
'mu_cut', 'e_cut', 'K_cut', 'ID', 'diff_phi', 'group_column'})
features
Out[25]:
In [26]:
# 2-d histograms (log of counts) for every pair of PID responses of the selected tracks.
figure(figsize=[15, 16])
bins = 60
step = 3
# pcolor draws the bins x bins matrix on coordinates 0..bins, where coordinate k is the
# k-th bin edge; put a tick on every `step`-th edge. (The previous
# numpy.arange(bins, step) was empty, so no ticks were placed at all.)
tick_positions = numpy.arange(0, bins + 1, step)
for i, (feature1, feature2) in enumerate(combinations(['PIDNNk', 'PIDNNm', 'PIDNNe', 'PIDNNp', 'PIDNNpi'], 2)):
    subplot(4, 3, i + 1)
    Z, (x, y) = numpy.histogramdd(data_sw_passed[[feature1, feature2]].values, bins=bins, range=([0, 1], [0, 1]))
    # empty bins give log(0) = -inf; vmin=0 puts them at the bottom of the colour scale
    pcolor(numpy.log(Z).T, vmin=0)
    xlabel(feature1)
    ylabel(feature2)
    # x and y hold bins + 1 edges, so every `step`-th edge matches tick_positions one-to-one
    xticks(tick_positions, numpy.round(x[::step], 2))
    yticks(tick_positions, numpy.round(y[::step], 2))
plt.savefig('img/PID_selected_OS.png' , format='png')
In [27]:
# Normalised distribution of diff_pt for the tracks that passed the sWeight selection.
hist(data_sw_passed.diff_pt.values, bins=60, normed=True)
# trailing `pass` keeps the cell from echoing the value returned by hist
pass
In [71]:
# Number of selected tracks per event: return_counts gives how many rows carry each event id.
_, n_tracks = numpy.unique(data_sw_passed[event_id_column], return_counts=True)
# max(n_tracks) bins spanning [1, max(n_tracks)]
hist(n_tracks, bins=max(n_tracks), range=(1, max(n_tracks)))
title('Number of tracks')
plt.savefig('img/tracks_number_OS.png' , format='png')
In [29]:
# One panel per PID response (muon, electron, kaon) for the tracks passing the sWeight selection.
figure(figsize=[15, 4])
pid_columns = ['PIDNNm', 'PIDNNe', 'PIDNNk']
for position, column in enumerate(pid_columns, 1):
    subplot(1, 3, position)
    hist(data_sw_passed[column].values, bins=60, range=(0, 1), label=column)
    # the legend shows the column name given as the label of this panel's histogram
    legend()
In [30]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction
from rep.estimators import SklearnClassifier
In [31]:
# Training storage built from the selected tracks, their label and their signal sWeight
# (presumably data, target and sample weight, in that order; confirm against the REP API).
data_sw_passed_lds = LabeledDataStorage(data_sw_passed, data_sw_passed.label, data_sw_passed.N_sig_sw.values)
In [49]:
# Base model: decision train with a log-loss objective.
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6,
max_features=15, loss=LogLossFunction(regularization=100))
# 2-fold wrapper trained on `features`, with 'group_column' (the integer event code) as
# the group feature, presumably so that tracks of one event stay in the same fold
# (TODO confirm in folding_group).
tt_folding = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
train_features=features, group_feature='group_column',
parallel_profile = 'threads-2')
# time the training on the labeled data storage
%time tt_folding.fit_lds(data_sw_passed_lds)
pass
In [50]:
import cPickle
# Persist the trained folding classifier. A pickle is a byte stream, so the file is opened
# in binary mode: text mode can apply newline translation and corrupt the dump.
with open('models/dt_OS.pkl', 'wb') as f:
    cPickle.dump(tt_folding, f)
In [51]:
# Classification report for the folding model, evaluated on the training storage.
comparison_report = ClassificationReport({'tt': tt_folding}, data_sw_passed_lds)
In [52]:
# ROC AUC of the model.
comparison_report.compute_metric(RocAuc())
Out[52]:
In [53]:
# ROC curve of the model.
comparison_report.roc()
Out[53]:
In [54]:
# Learning curve of the ROC AUC metric (presumably metric value versus the number of
# boosting stages; see the REP report API for the meaning of `steps`).
lc = comparison_report.learning_curve(RocAuc(), steps=1)
In [55]:
# Display the learning curve.
lc
Out[55]:
In [56]:
# Keep only the first 1000 entries of the `estimators` sequence of every fold model.
# This changes tt_folding in place, so every later cell works with the truncated model,
# while the pickle written earlier still holds the full one.
n_kept_stages = 1000
for fold_estimator in tt_folding.estimators:
    fold_estimator.estimators = fold_estimator.estimators[:n_kept_stages]
In [57]:
# Feature importances reported for the model.
comparison_report.feature_importance()
Out[57]:
In [58]:
from utils import get_result_with_bootstrap_for_given_part
In [59]:
# Collects one result table per evaluated configuration; concatenated further below.
models = []
In [60]:
# Evaluate the model on both sWeight parts with logistic=False; the result is named 'tt-iso'
# (presumably isotonic calibration; confirm in utils).
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
[data_sw_passed, data_sw_not_passed], 'tt-iso', logistic=False))
In [61]:
# Same evaluation with logistic=True; the result is named 'tt-log'.
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
[data_sw_passed, data_sw_not_passed], 'tt-log', logistic=True))
In [62]:
# Show the collected results with 8-digit precision, indexed by model name.
pandas.set_option('display.precision', 8)
result = pandas.concat(models)
result.index = result.name
# drop() returns a new frame for display only; `result` itself keeps the 'name' column
result.drop('name', axis=1)
Out[62]:
In [63]:
from utils import prepare_B_data_for_given_part
In [64]:
# Build the B-level table from the model and both sWeight parts (helper from utils,
# called with logistic=True; its output schema is not visible here).
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data_sw_passed, data_sw_not_passed], logistic=True)
In [65]:
# Save the prepared table as CSV with a header row and without the index column.
Bdata_prepared.to_csv('models/Bdata_tracks_OS.csv', header=True, index=False)
In [ ]: