In [1]:
%pylab inline
figsize(8, 6)
In [3]:
import pandas
import numpy
from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
In [4]:
from utils import get_N_B_events, get_events_number, get_events_statistics
In [7]:
# Load the per-track ROOT ntuple into pandas (rows may contain NaNs, hence the name).
import root_numpy
data_nan = pandas.DataFrame(root_numpy.root2array('datasets/data/csv/JPsiK/Tracks.root'))
In [8]:
data_nan.head()
Out[8]:
In [ ]:
# Build a unique event key "run_event": tracks from the same collision share it.
event_id_column = 'event_id'
data_nan[event_id_column] = data_nan.run.apply(str) + '_' + data_nan.event.apply(str)
In [ ]:
get_events_statistics(data_nan)
In [9]:
# Total number of B events in the sample (denominator of the tagging efficiency below).
get_N_B_events()
Out[9]:
In [10]:
# Drop tracks with any missing feature and check how much data survives.
data = data_nan.dropna()
len(data_nan), len(data), get_events_statistics(data)
Out[10]:
In [11]:
# Attach 'diff_pt': gap between the event's maximal track pt and each track's pt.
def add_diff_pt(data):
    """Add a 'diff_pt' column in place: (max partPt within the event) - (track partPt)."""
    track_pt = data['partPt'].values
    event_keys = data[event_id_column].values.astype(str)
    leading_pt = group_max(event_keys, track_pt)
    data['diff_pt'] = leading_pt - track_pt
# Per-element group maximum: for every track, the max of `data` over all tracks
# in the same event (same group key).
def group_max(groups, data):
    """Return, for every element, the maximum of ``data`` within its group.

    :param groups: array of group keys, one per element (any comparable dtype)
    :param data: numeric array of the same length as ``groups``
    :return: float numpy array where position i holds
        max(data[j] for all j with groups[j] == groups[i])
    """
    assert len(groups) == len(data)
    groups = numpy.asarray(groups)
    data = numpy.asarray(data, dtype=float)
    # robustness fix: max(event_id) below raises on an empty sequence
    if len(groups) == 0:
        return numpy.array([])
    # map each group key to a dense integer id 0..n_groups-1
    _, event_id = numpy.unique(groups, return_inverse=True)
    # start from -inf so any real value wins the running maximum
    max_over_event = numpy.full(event_id.max() + 1, -numpy.inf)
    # unbuffered in-place maximum: needed because group ids repeat
    numpy.maximum.at(max_over_event, event_id, data)
    # broadcast the per-group maxima back to per-element positions
    return max_over_event[event_id]
In [12]:
# add diff pt (gap to the event's maximal track pt)
add_diff_pt(data)
# add cos(diff_phi)
data['cos_diff_phi'] = numpy.cos(data.diff_phi.values)
In [13]:
# Pairwise combinations of PID network outputs (kaon / electron / muon):
# for every pair, add the element-wise maximum and the sum as new features.
from itertools import combinations
PIDs = {'k': data.PIDNNk.values,
        'e': data.PIDNNe.values,
        'mu': data.PIDNNm.values,
        }
for (pid_name1, pid_values1), (pid_name2, pid_values2) in combinations(PIDs.items(), 2):
    data['max_PID_{}_{}'.format(pid_name1, pid_name2)] = numpy.maximum(pid_values1, pid_values2)
    data['sum_PID_{}_{}'.format(pid_name1, pid_name2)] = pid_values1 + pid_values2
In [14]:
# Binary target: 1 when the track charge sign agrees with the B sign, else 0.
data['label'] = (data.signB.values * data.signTrack.values > 0) * 1
In [15]:
','.join(data.columns)
Out[15]:
In [16]:
# Initial selection: reject likely protons/pions and ghost tracks.
threshold_mistag = 0.6
initial_cut = '(PIDNNp < {tr}) & (PIDNNpi < {tr}) & (ghostProb < 0.4)'.format(tr=threshold_mistag)
data = data.query(initial_cut)
In [17]:
get_events_statistics(data)
Out[17]:
In [18]:
# Keep only tracks identified as kaon, muon or electron above per-species thresholds.
threshold_kaon = 0.7
threshold_muon = 0.4
threshold_electron = 0.6
cut_pid = " ( (PIDNNk > {trk}) | (PIDNNm > {trm}) | (PIDNNe > {tre}) ) "
cut_pid = cut_pid.format(trk=threshold_kaon, trm=threshold_muon, tre=threshold_electron)
data = data.query(cut_pid)
In [19]:
get_events_statistics(data)
Out[19]:
where $sw_i$ is the sPlot weight (the sWeight for signal).
$$\epsilon_{tag} = \frac{N(\text{passed selection})}{N(\text{all events})}$$
$$\Delta\epsilon_{tag} = \frac{\sqrt{N(\text{passed selection})}}{N(\text{all events})}$$
In [20]:
# Tagging efficiency = events passing the selection / all B events,
# with a sqrt(N) numerator for the uncertainty.
N_B_passed = float(get_events_number(data))
tagging_efficiency = N_B_passed / get_N_B_events()
tagging_efficiency_delta = sqrt(N_B_passed) / get_N_B_events()
tagging_efficiency, tagging_efficiency_delta
Out[20]:
In [21]:
hist(data.diff_pt.values, bins=100)
pass
In [22]:
# One entry per event (first occurrence of each event id) for event-level plots.
# NOTE(review): `normed=True` is deprecated in modern matplotlib (use `density=True`).
_, take_indices = numpy.unique(data[event_id_column], return_index=True)
figure(figsize=[15, 5])
subplot(1, 2, 1)
hist(data.Bmass.values[take_indices], bins=100)
title('B mass hist')
xlabel('mass')
subplot(1, 2, 2)
hist(data.N_sig_sw.values[take_indices], bins=100, normed=True)
title('sWeights hist')
xlabel('signal sWeights')
plt.savefig('img/Bmass.png' , format='png')
In [23]:
# Split tracks by sWeight; downstream training uses only sWeight > 1.
sweight_threshold = 1.
data_sw_passed = data[data.N_sig_sw > sweight_threshold]
data_sw_not_passed = data[data.N_sig_sw <= sweight_threshold]
get_events_statistics(data_sw_passed)
Out[23]:
In [24]:
# Same event-level plots, now for the sWeight > 1 selection.
_, take_indices = numpy.unique(data_sw_passed[event_id_column], return_index=True)
figure(figsize=[15, 5])
subplot(1, 2, 1)
hist(data_sw_passed.Bmass.values[take_indices], bins=100)
title('B mass hist for sWeight > 1 selection')
xlabel('mass')
subplot(1, 2, 2)
hist(data_sw_passed.N_sig_sw.values[take_indices], bins=100, normed=True)
title('sWeights hist for sWeight > 1 selection')
xlabel('signal sWeights')
plt.savefig('img/Bmass_selected.png' , format='png')
In [25]:
hist(data_sw_passed.diff_pt.values, bins=100)
pass
In [26]:
# Training features = all columns except identifiers, targets, sWeights,
# and columns already consumed by the selection cuts.
features = list(set(data.columns) - {'run', 'event', 'i', 'signB', 'signTrack', 'N_sig_sw', 'Bmass', 'mult',
                                     'PIDNNp', 'PIDNNpi', 'label', 'thetaMin', 'Dist_phi', event_id_column,
                                     'mu_cut', 'e_cut', 'K_cut', 'ID', 'diff_phi', 'index'})
features
Out[26]:
In [27]:
# Pairwise scatter of PID outputs for the selected tracks.
figure(figsize=[15, 4])
for i, (feature1, feature2) in enumerate(combinations(['PIDNNk', 'PIDNNm', 'PIDNNe'], 2)):
    subplot(1, 3, i + 1)
    scatter(data_sw_passed[feature1].values, data_sw_passed[feature2].values, alpha=0.01)
    xlabel(feature1)
    ylabel(feature2)
    ylim(0, 1), xlim(0, 1)
plt.savefig('img/PID_selected.png' , format='png')
In [33]:
figure(figsize=[18, 4])
bins = 60
step = 3
for i, (feature1, feature2) in enumerate(combinations(['PIDNNk', 'PIDNNm', 'PIDNNe'], 2)):
subplot(1, 3, i + 1)
Z, (x, y) = numpy.histogramdd(data_sw_passed[[feature1, feature2]].values, bins=bins, range=([0, 1], [0, 1]))
pcolor(log(Z), vmin=0)
xlabel(feature1)
ylabel(feature2)
xticks(arange(bins, step), x[::step]), yticks(arange(bins, step), y[::step])
cb = colorbar()
cb.set_label('log N')
plt.savefig('img/PID_selected_hist.png' , format='png')
In [34]:
hist(data_sw_passed.diff_pt.values, bins=60, normed=True)
pass
In [35]:
# Distribution of the number of tracks per event.
_, n_tracks = numpy.unique(data_sw_passed[event_id_column], return_counts=True)
hist(n_tracks, bins=60)
title('Number of tracks')
plt.savefig('img/tracks_number.png' , format='png')
In [36]:
# PID output distributions for the selected tracks.
figure(figsize=[15, 4])
for i, column in enumerate(['PIDNNm', 'PIDNNe', 'PIDNNk']):
    subplot(1, 3, i + 1)
    hist(data_sw_passed[column].values, bins=60, range=(0, 1), label=column)
    legend()
In [37]:
# Baseline: random forest trained through FoldingClassifier (out-of-fold
# predictions), with signal sWeights as sample weights.
base = RandomForestClassifier(n_estimators=300, max_depth=14, min_samples_leaf=100, n_jobs=8)
est_choose_lds = LabeledDataStorage(data_sw_passed, data_sw_passed.label, data_sw_passed.N_sig_sw)
est_choose_RT = FoldingClassifier(base, features=features, random_state=11)
%time est_choose_RT.fit_lds(est_choose_lds)
pass
In [38]:
est_choose_report = ClassificationReport({'RF': est_choose_RT}, est_choose_lds)
In [39]:
est_choose_report.compute_metric(RocAuc())
Out[39]:
In [40]:
# ROC curve with the diagonal drawn for reference.
plot([0, 1], [1, 0], 'k--')
est_choose_report.roc()
Out[40]:
In [41]:
# Sum feature importances over the folds and normalize to the maximum.
# NOTE(review): DataFrame.sort was removed in pandas >= 0.20 (use sort_values).
imp = numpy.sum([est.feature_importances_ for est in est_choose_RT.estimators], axis=0)
imp = pandas.DataFrame({'importance': imp / numpy.max(imp), 'feature': est_choose_RT.features})
imp.sort('importance', ascending=False)
Out[41]:
In [42]:
# Flattening transforms of the RF response, built separately per label.
from utils import plot_flattened_probs
probs = est_choose_RT.predict_proba(data_sw_passed)
flat_ss = plot_flattened_probs(probs, data_sw_passed.label.values, data_sw_passed.N_sig_sw.values, label=1)
flat_os = plot_flattened_probs(probs, data_sw_passed.label.values, data_sw_passed.N_sig_sw.values, label=0)
In [43]:
# RF response for label 1 vs label 0 tracks.
hist(probs[:, 1][data_sw_passed.label.values == 1], bins=60, alpha=0.3, normed=True)
hist(probs[:, 1][data_sw_passed.label.values == 0], bins=60, alpha=0.3, normed=True)
pass
In [44]:
def get_max_ids(groups, values):
    """For each group, return the index (into the original arrays) of the
    object with the highest value.

    :param groups: array of group keys, one per object
    :param values: numeric array of the same length as ``groups``
    :return: integer array with one index per distinct group,
        ordered by sorted unique group key
    """
    assert len(groups) == len(values)
    # dense integer id per group, 0..n_groups-1 (order of sorted unique keys)
    _, groups = numpy.unique(groups, return_inverse=True)
    sorter = numpy.argsort(values)
    # rank of every object in the global ordering of `values`
    values_order = numpy.argsort(sorter)
    # per-group maximum rank; ranks are >= 0, so zero-initialization is safe
    # (fix: was np.zeros — `np` only exists via the %pylab star-import,
    #  while this file explicitly imports `numpy`)
    top_order = numpy.zeros(groups.max() + 1, dtype=int)
    numpy.maximum.at(top_order, groups, values_order)
    # translate the winning rank back to an index in the original arrays
    return sorter[top_order]
In [45]:
## select the best tracks by estimator in event
## we are trying to leave only those tracks which have the greatest rf prediction
def get_best_tracks(data, probs):
data = data.copy()
probabilities = numpy.where(data['label'] > 0, probs, 1 - probs)
best_ids = get_max_ids(data[event_id_column], probabilities)
good_tracks = data.iloc[best_ids, :]
n_counts = numpy.bincount(best_ids, minlength=len(data))
other_ids = numpy.where(n_counts == 0)[0]
other_tracks = data.iloc[other_ids, :]
print len(good_tracks), len(other_tracks)
return good_tracks, other_tracks
In [46]:
def get_pair_best_tracks(data, probs):
    """Select, per event, the best track of each label (0 and 1).

    :param data: tracks DataFrame; must contain a binary 'label' column
    :param probs: per-track classifier probabilities, aligned with ``data`` rows
    :return: (good_tracks, other_tracks) — the selected best tracks and the rest
    """
    assert (numpy.unique(data['label']) == [0, 1]).all(), 'labels should be 0, 1'
    data = data.copy()
    # probs is taken as P(label == 1); flip it for label-0 tracks
    probabilities = numpy.where(data['label'] > 0, probs, 1 - probs)
    good_tracks = []
    other_tracks = []
    for label in [0, 1]:
        # restrict to tracks of this label, keeping probabilities aligned
        train_data = data[data['label'] == label]
        pred = probabilities[numpy.array(data['label'] == label)]
        # positional index of the highest-confidence track in each event
        best_ids = get_max_ids(train_data[event_id_column], pred)
        good_tracks.append(train_data.iloc[best_ids, :])
        # positions never selected as best form the complement
        n_counts = numpy.bincount(best_ids, minlength=len(train_data))
        other_ids = numpy.where(n_counts == 0)[0]
        other_tracks.append(train_data.iloc[other_ids, :])
    good_tracks = pandas.concat(good_tracks)
    other_tracks = pandas.concat(other_tracks)
    print len(good_tracks), len(other_tracks)
    return good_tracks, other_tracks
In [47]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction
In [48]:
data_sw_passed_lds = LabeledDataStorage(data_sw_passed, data_sw_passed.label, data_sw_passed.N_sig_sw.values)
In [95]:
# XGBoost baseline: 2-fold out-of-fold training on an IPython cluster ('ssh-ipy').
from rep.estimators import XGBoostClassifier
xgb_base = XGBoostClassifier(max_depth=5, colsample=15, eta=0.1, subsample=0.2, n_estimators=150)
xgb_folding = FoldingClassifier(xgb_base, n_folds=2, random_state=11, ipc_profile='ssh-ipy', features=features)
%time xgb_folding.fit_lds(data_sw_passed_lds)
pass
In [52]:
# DecisionTrain classifier with regularized log-loss, same folding scheme.
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=1500, depth=6, pretransform_needed=True,
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding = FoldingClassifier(tt_base, n_folds=2, random_state=11, ipc_profile='ssh-ipy', features=features)
%time tt_folding.fit_lds(data_sw_passed_lds)
pass
In [96]:
# Compare both models: ROC AUC, ROC curves and learning curves.
comparison_report = ClassificationReport({'tt': tt_folding, 'xgb': xgb_folding}, data_sw_passed_lds)
In [97]:
comparison_report.compute_metric(RocAuc())
Out[97]:
In [98]:
comparison_report.roc()
Out[98]:
In [99]:
lc = comparison_report.learning_curve(RocAuc(), steps=1)
In [102]:
lc.plot()#xlim=(0, 1))
In [67]:
# NOTE(review): DataFrame.sort was removed in pandas >= 0.20 (use sort_values).
xgb_folding.estimators[0].get_feature_importances().sort('effect', ascending=False)
Out[67]:
In [ ]:
# "Forest idea": build a mask from the flattened RF response.
# NOTE(review): the mask keeps label-0 tracks with low flattened response and
# label-1 tracks with high response — presumably "well-classified" tracks;
# confirm the 0.2 / 0.4 thresholds against the study's intent.
temp_probs = est_choose_RT.predict_proba(data_sw_passed)[:, 1]
mask = ((flat_ss(temp_probs) < 0.2) & (data_sw_passed.label == 0)) | \
       ((flat_os(temp_probs) > 0.4) & (data_sw_passed.label == 1))
In [ ]:
# Train an RF to predict that mask: a learned track preselection.
base = RandomForestClassifier(n_estimators=300, max_depth=14, min_samples_leaf=100, n_jobs=8)
forest_preselection_lds = LabeledDataStorage(data_sw_passed, mask * 1, data_sw_passed.N_sig_sw)
forest_preselection = FoldingClassifier(base, features=features, random_state=11)
%time forest_preselection.fit_lds(forest_preselection_lds)
pass
In [ ]:
report = ClassificationReport({'rf': forest_preselection}, forest_preselection_lds)
In [45]:
report.compute_metric(RocAuc())
Out[45]:
In [46]:
report.roc()
Out[46]:
In [47]:
report.prediction_pdf()
Out[47]:
In [48]:
# Apply the preselection (threshold 0.5) to both sWeight partitions.
prob = forest_preselection.predict_proba(data_sw_passed)[:, 1]
data_sw_passed_preselected = data_sw_passed[prob > 0.5]
data_sw_passed_not_preselected = data_sw_passed[prob <= 0.5]
prob = forest_preselection.predict_proba(data_sw_not_passed)[:, 1]
data_sw_not_passed_preselected = data_sw_not_passed[prob > 0.5]
In [49]:
get_events_statistics(data_sw_passed_preselected)
Out[49]:
In [50]:
data_sw_passed_preselected_lds = LabeledDataStorage(data_sw_passed_preselected, data_sw_passed_preselected.label,
                                                    sample_weight=data_sw_passed_preselected.N_sig_sw.values)
In [ ]:
# Train DecisionTrain on the preselected tracks only.
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, pretransform_needed=True,
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding_preselected = FoldingClassifier(tt_base, n_folds=2, random_state=11,
                                           ipc_profile='ssh-ipy', features=features)
%time tt_folding_preselected.fit_lds(data_sw_passed_preselected_lds)
pass
In [55]:
report = ClassificationReport({'tt': tt_folding_preselected}, data_sw_passed_preselected_lds)
report.learning_curve(RocAuc())
Out[55]:
In [59]:
# Alternative: select tracks directly by the flattened-RF mask
# (looser thresholds than the preselection-training mask above).
temp_probs = est_choose_RT.predict_proba(data_sw_passed)[:, 1]
mask = ((flat_ss(temp_probs) < 0.6) & (data_sw_passed.label == 0)) | \
       ((flat_os(temp_probs) > 0.2) & (data_sw_passed.label == 1))
data_sw_passed_rf_selected = data_sw_passed[mask]
data_sw_passed_rf_not_selected = data_sw_passed[~mask]
In [60]:
get_events_statistics(data_sw_passed_rf_selected)
Out[60]:
In [61]:
# DecisionTrain trained on the RF-selected tracks.
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, pretransform_needed=True,
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding_forest = FoldingClassifier(tt_base, n_folds=2, random_state=11,
                                      features=features, ipc_profile='ssh-ipy')
data_sw_passed_rf_lds = LabeledDataStorage(data_sw_passed_rf_selected, data_sw_passed_rf_selected.label,
                                           sample_weight=data_sw_passed_rf_selected.N_sig_sw.values)
%time tt_folding_forest.fit_lds(data_sw_passed_rf_lds)
pass
In [62]:
report = ClassificationReport({'tt + forest': tt_folding_forest}, data_sw_passed_rf_lds)
report.learning_curve(RocAuc())
Out[62]:
In [63]:
# Keep only the single best track per event according to the RF.
temp_probs = est_choose_RT.predict_proba(data_sw_passed)[:, 1]
data_sw_passed_top_selected, data_sw_passed_top_not_selected = get_best_tracks(data_sw_passed, temp_probs)
In [64]:
print get_events_statistics(data_sw_passed_top_selected)
print get_events_statistics(data_sw_passed_top_not_selected)
In [65]:
tt_folding_top_forest_lds = LabeledDataStorage(data_sw_passed_top_selected, data_sw_passed_top_selected.label,
                                               sample_weight=data_sw_passed_top_selected.N_sig_sw.values)
In [66]:
# DecisionTrain trained on the per-event best tracks.
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, pretransform_needed=True,
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding_forest_top = FoldingClassifier(tt_base, n_folds=2, random_state=11, features=features,
                                          ipc_profile='ssh-ipy')
%time tt_folding_forest_top.fit_lds(tt_folding_top_forest_lds)
pass
In [67]:
report = ClassificationReport({'tt': tt_folding_forest_top}, tt_folding_top_forest_lds)
report.learning_curve(RocAuc())
Out[67]:
In [68]:
# Keep the best track of each label (0 and 1) per event.
temp_probs = est_choose_RT.predict_proba(data_sw_passed)[:, 1]
data_sw_passed_pair_selected, data_sw_passed_pair_not_selected = get_pair_best_tracks(data_sw_passed, temp_probs)
In [69]:
print get_events_statistics(data_sw_passed_pair_selected)
print get_events_statistics(data_sw_passed_pair_not_selected)
In [70]:
tt_folding_pair_forest_lds = LabeledDataStorage(data_sw_passed_pair_selected, data_sw_passed_pair_selected.label,
                                                sample_weight=data_sw_passed_pair_selected.N_sig_sw.values)
In [71]:
# DecisionTrain trained on the per-event best pair of tracks.
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, pretransform_needed=True,
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding_forest_pair = FoldingClassifier(tt_base, n_folds=2, random_state=11, features=features,
                                           ipc_profile='ssh-ipy')
%time tt_folding_forest_pair.fit_lds(tt_folding_pair_forest_lds)
pass
In [72]:
report = ClassificationReport({'tt': tt_folding_forest_pair}, tt_folding_pair_forest_lds)
report.learning_curve(RocAuc())
Out[72]:
In [56]:
from utils import get_result_with_bootstrap_for_given_part
In [57]:
# Collect bootstrap-evaluated tagging results for every model / selection combo.
# The '-iso' / '-log' suffixes correspond to logistic=False / logistic=True —
# presumably isotonic vs logistic calibration; confirm against utils.
models = []
In [75]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta,
                                                       tt_folding_forest_top,
                                                       [data_sw_passed_top_selected,
                                                        data_sw_passed_top_not_selected,
                                                        data_sw_not_passed],
                                                       'rf-top-tt-iso', logistic=False))
In [76]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta,
                                                       tt_folding_forest_top,
                                                       [data_sw_passed_top_selected,
                                                        data_sw_passed_top_not_selected,
                                                        data_sw_not_passed],
                                                       'rf-top-tt-log', logistic=True))
In [77]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta,
                                                       tt_folding_forest_pair,
                                                       [data_sw_passed_pair_selected,
                                                        data_sw_passed_pair_not_selected,
                                                        data_sw_not_passed],
                                                       'rf-pair-top-tt-iso', logistic=False))
In [78]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta,
                                                       tt_folding_forest_pair,
                                                       [data_sw_passed_pair_selected,
                                                        data_sw_passed_pair_not_selected,
                                                        data_sw_not_passed],
                                                       'rf-pair-top-tt-log', logistic=True))
In [79]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta,
                                                       tt_folding_forest,
                                                       [data_sw_passed_rf_selected,
                                                        data_sw_passed_rf_not_selected,
                                                        data_sw_not_passed],
                                                       'rf-tt-iso', logistic=False))
In [80]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta,
                                                       tt_folding_forest,
                                                       [data_sw_passed_rf_selected,
                                                        data_sw_passed_rf_not_selected,
                                                        data_sw_not_passed],
                                                       'rf-tt-log', logistic=True))
In [81]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
                                                       [data_sw_passed, data_sw_not_passed], 'tt-iso', logistic=False))
In [82]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
                                                       [data_sw_passed, data_sw_not_passed], 'tt-log', logistic=True))
In [83]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, xgb_folding,
                                                       [data_sw_passed, data_sw_not_passed], 'xgb-iso', logistic=False))
In [84]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, xgb_folding,
                                                       [data_sw_passed, data_sw_not_passed], 'xgb-log', logistic=True))
In [ ]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, est_choose_RT,
                                                       [data_sw_passed, data_sw_not_passed], 'rf-iso', logistic=False))
In [ ]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, est_choose_RT,
                                                       [data_sw_passed, data_sw_not_passed], 'rf-log', logistic=True))
In [58]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency,
                                                       tagging_efficiency_delta,
                                                       tt_folding_preselected,
                                                       [data_sw_passed_preselected, data_sw_passed_not_preselected,
                                                        data_sw_not_passed],
                                                       'forest-idea-tt-log', logistic=True))
In [59]:
# Recompute the tagging efficiency when only preselected tracks are kept
# (the preselection removes some events entirely).
N_B_passed = float(get_events_number(data_sw_passed_preselected)) + float(get_events_number(data_sw_not_passed_preselected))
tagging_efficiency_preselected = N_B_passed / get_N_B_events()
tagging_efficiency_preselected_delta = sqrt(N_B_passed) / get_N_B_events()
tagging_efficiency_preselected, tagging_efficiency_preselected_delta
Out[59]:
In [60]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency_preselected,
                                                       tagging_efficiency_preselected_delta,
                                                       tt_folding_preselected,
                                                       [data_sw_passed_preselected, data_sw_not_passed_preselected],
                                                       'preselected-tt-log', logistic=True))
In [61]:
# Summary table of all evaluated models, indexed by model name.
pandas.set_option('display.precision', 8)
result = pandas.concat(models)
result.index = result.name
result.drop('name', axis=1)
Out[61]:
In [91]:
# NOTE(review): this cell duplicates the previous one (same summary table);
# keep only one of them.
pandas.set_option('display.precision', 8)
result = pandas.concat(models)
result.index = result.name
result.drop('name', axis=1)
Out[91]:
In [63]:
from utils import prepare_B_data_for_given_part
In [93]:
# Prepare per-B data with the tt model (logistic calibration) and export to CSV.
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data_sw_passed, data_sw_not_passed], logistic=True)
In [94]:
Bdata_prepared.to_csv('models/Bdata_tracks.csv', header=True, index=False)
In [64]:
# Same export for the "forest idea" preselected model.
Bdata_prepared_idea = prepare_B_data_for_given_part(tt_folding_preselected,
                                                    [data_sw_passed_preselected, data_sw_passed_not_preselected,
                                                     data_sw_not_passed],
                                                    logistic=True)
In [67]:
Bdata_prepared_idea.to_csv('models/Bdata_tracks_idea.csv', header=True, index=False)