In [1]:
%pylab inline
figsize(8, 6)


Populating the interactive namespace from numpy and matplotlib

Import


In [3]:
import pandas
import numpy
from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score

In [4]:
from utils import get_N_B_events, get_events_number, get_events_statistics

Reading initial data


In [7]:
import root_numpy
data_nan = pandas.DataFrame(root_numpy.root2array('datasets/data/csv/JPsiK/Tracks.root'))

In [8]:
data_nan.head()


Out[8]:
run event Bmass i mult partP partPt ptB IPs IP ... proj ID veloch signB signTrack Dist_phi N_sig_sw mu_cut e_cut K_cut
0 115839 204997902 5.309576 0 13 3.67156 0.300418 4.004197 0.816143 0.056365 ... 1.058442 -211 0.911645 1 -1 0.114615 0.59521 0 0 0
1 115839 204997902 5.309576 1 13 8.33952 1.103876 4.004197 1.375382 0.033095 ... 3.121358 -211 0.796731 1 -1 0.051334 0.59521 0 0 0
2 115839 204997902 5.309576 2 13 8.37654 1.182519 4.004197 4.338812 0.077636 ... 10.585135 -211 0.946629 1 -1 1.856516 0.59521 0 0 0
3 115839 204997902 5.309576 3 13 25.72961 0.905010 4.004197 2.287509 0.086603 ... 7.485243 211 1.058989 1 1 0.577419 0.59521 0 0 0
4 115839 204997902 5.309576 4 13 3.70597 0.516123 4.004197 0.562424 0.015694 ... 5.617354 211 1.042135 1 1 1.314513 0.59521 0 0 0

5 rows × 35 columns


In [ ]:
event_id_column = 'event_id'
data_nan[event_id_column] = data_nan.run.apply(str) + '_' + data_nan.event.apply(str)

In [ ]:
get_events_statistics(data_nan)

In [9]:
get_N_B_events()


Out[9]:
742867.7142562866

Remove rows with NAN from data


In [10]:
data = data_nan.dropna()
len(data_nan), len(data), get_events_statistics(data)


Out[10]:
(27156193, 27156190, {'Events': 1005757, 'tracks': 27156190})

Add diff_pt and cos(diff_phi)


In [11]:
# add different between max pt in event and pt for each track
def add_diff_pt(data):
    max_pt = group_max(data[event_id_column].values.astype(str), data.partPt.values)
    data['diff_pt'] = max_pt - data['partPt'].values

# max is computing max over tracks in the same event for saome data
def group_max(groups, data):
    # computing unique integer id for each group
    assert len(groups) == len(data)
    _, event_id = numpy.unique(groups, return_inverse=True)
    max_over_event = numpy.zeros(max(event_id) + 1) - numpy.inf
    numpy.maximum.at(max_over_event, event_id, data)
    return max_over_event[event_id]

In [12]:
# add diff pt
add_diff_pt(data)
# add cos(diff_phi)
data['cos_diff_phi'] = numpy.cos(data.diff_phi.values)

Add max, sum among PIDs


In [13]:
from itertools import combinations
PIDs = {'k': data.PIDNNk.values,
        'e': data.PIDNNe.values,
        'mu': data.PIDNNm.values,
       }
for (pid_name1, pid_values1), (pid_name2, pid_values2) in combinations(PIDs.items(), 2):
    data['max_PID_{}_{}'.format(pid_name1, pid_name2)] = numpy.maximum(pid_values1, pid_values2)
    data['sum_PID_{}_{}'.format(pid_name1, pid_name2)] = pid_values1 + pid_values2

define label = signB * signTrack

  • if > 0 (same sign) - label 1
  • if < 0 (different sign) - label 0

In [14]:
data['label'] = (data.signB.values * data.signTrack.values > 0) * 1

In [15]:
','.join(data.columns)


Out[15]:
'index,run,event,Bmass,i,mult,partP,partPt,ptB,IPs,IP,IPerr,partlcs,EOverP,ghostProb,IPPU,nnkrec,PIDNNk,PIDNNpi,PIDNNp,PIDNNm,PIDNNe,diff_eta,diff_phi,phi,eta,proj,ID,veloch,signB,signTrack,Dist_phi,N_sig_sw,mu_cut,e_cut,K_cut,event_id,diff_pt,cos_diff_phi,max_PID_mu_k,sum_PID_mu_k,max_PID_mu_e,sum_PID_mu_e,max_PID_k_e,sum_PID_k_e,label'

Filter out protons and pions, apply ghost prob cut


In [16]:
threshold_mistag = 0.6
initial_cut = '(PIDNNp < {tr}) & (PIDNNpi < {tr}) & (ghostProb < 0.4)'.format(tr=threshold_mistag)
data = data.query(initial_cut)

In [17]:
get_events_statistics(data)


Out[17]:
{'Events': 997107, 'tracks': 9498623}

Leave muons, kaons, electrons


In [18]:
threshold_kaon = 0.7 
threshold_muon = 0.4
threshold_electron = 0.6
cut_pid = " ( (PIDNNk > {trk}) | (PIDNNm > {trm}) | (PIDNNe > {tre}) ) "
cut_pid = cut_pid.format(trk=threshold_kaon, trm=threshold_muon, tre=threshold_electron)
    
data = data.query(cut_pid)

In [19]:
get_events_statistics(data)


Out[19]:
{'Events': 657874, 'tracks': 1191986}

Calculating tagging efficiency ($\epsilon_{tag}$)

$$N (\text{B passed selection}) = \sum_{\text{B passed selection}} sw_i$$$$N (\text{all B events}) = \sum_{\text{all B events}} sw_i,$$

where $sw_i$ - sPLot weight (sWeight for signal)

$$\epsilon_{tag} = \frac{N (\text{passed selection})} {N (\text{all events})}$$$$\Delta\epsilon_{tag} = \frac{\sqrt{N (\text{passed selection})}} {N (\text{all events})}$$

In [20]:
N_B_passed = float(get_events_number(data))
tagging_efficiency = N_B_passed / get_N_B_events()
tagging_efficiency_delta = sqrt(N_B_passed) / get_N_B_events()
tagging_efficiency, tagging_efficiency_delta


Out[20]:
(0.6694822894448079, 0.00094932256580402023)

In [21]:
hist(data.diff_pt.values, bins=100)
pass


Choose most probable B-events


In [22]:
_, take_indices = numpy.unique(data[event_id_column], return_index=True)

figure(figsize=[15, 5])

subplot(1, 2, 1)
hist(data.Bmass.values[take_indices], bins=100)
title('B mass hist')
xlabel('mass')

subplot(1, 2, 2)
hist(data.N_sig_sw.values[take_indices], bins=100, normed=True)
title('sWeights hist')
xlabel('signal sWeights')
plt.savefig('img/Bmass.png' , format='png')


Define B-like events for training

Events with low sWeight still will be used only to test quality.


In [23]:
sweight_threshold = 1.
data_sw_passed = data[data.N_sig_sw > sweight_threshold]
data_sw_not_passed = data[data.N_sig_sw <= sweight_threshold]
get_events_statistics(data_sw_passed)


Out[23]:
{'Events': 409381, 'tracks': 743076}

In [24]:
_, take_indices = numpy.unique(data_sw_passed[event_id_column], return_index=True)

figure(figsize=[15, 5])
subplot(1, 2, 1)
hist(data_sw_passed.Bmass.values[take_indices], bins=100)
title('B mass hist for sWeight > 1 selection')
xlabel('mass')

subplot(1, 2, 2)
hist(data_sw_passed.N_sig_sw.values[take_indices], bins=100, normed=True)
title('sWeights hist for sWeight > 1 selection')
xlabel('signal sWeights')
plt.savefig('img/Bmass_selected.png' , format='png')



In [25]:
hist(data_sw_passed.diff_pt.values, bins=100)
pass


Main idea:

find tracks, which can help reconstruct the sign of B if you know track sign.

label = signB * signTrack

  • the highest output means that this is same sign B as track
  • the lowest output means that this is opposite sign B than track

Define features


In [26]:
features = list(set(data.columns) - {'run', 'event', 'i', 'signB', 'signTrack', 'N_sig_sw', 'Bmass', 'mult', 
                                     'PIDNNp', 'PIDNNpi', 'label', 'thetaMin', 'Dist_phi', event_id_column, 
                                     'mu_cut', 'e_cut', 'K_cut', 'ID', 'diff_phi', 'index'})
features


Out[26]:
['cos_diff_phi',
 'diff_pt',
 'partPt',
 'partP',
 'nnkrec',
 'diff_eta',
 'EOverP',
 'ptB',
 'sum_PID_mu_k',
 'proj',
 'PIDNNe',
 'sum_PID_k_e',
 'PIDNNk',
 'sum_PID_mu_e',
 'PIDNNm',
 'phi',
 'IP',
 'IPerr',
 'IPs',
 'veloch',
 'max_PID_k_e',
 'ghostProb',
 'IPPU',
 'eta',
 'max_PID_mu_e',
 'max_PID_mu_k',
 'partlcs']

PID pairs scatters


In [27]:
figure(figsize=[15, 4])
for i, (feature1, feature2) in enumerate(combinations(['PIDNNk', 'PIDNNm', 'PIDNNe'], 2)):
    subplot(1, 3, i + 1)
    scatter(data_sw_passed[feature1].values, data_sw_passed[feature2].values, alpha=0.01)
    xlabel(feature1)
    ylabel(feature2)
    ylim(0, 1), xlim(0, 1)
plt.savefig('img/PID_selected.png' , format='png')


/moosefs/ipython_env/local/lib/python2.7/site-packages/matplotlib/collections.py:548: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == 'face':

In [33]:
figure(figsize=[18, 4])
bins = 60
step = 3
for i, (feature1, feature2) in enumerate(combinations(['PIDNNk', 'PIDNNm', 'PIDNNe'], 2)):
    subplot(1, 3, i + 1)
    Z, (x, y) = numpy.histogramdd(data_sw_passed[[feature1, feature2]].values, bins=bins, range=([0, 1], [0, 1]))
    pcolor(log(Z), vmin=0)
    xlabel(feature1)
    ylabel(feature2)
    xticks(arange(bins, step), x[::step]), yticks(arange(bins, step), y[::step])
    cb = colorbar()
    cb.set_label('log N')
plt.savefig('img/PID_selected_hist.png' , format='png')


pt


In [34]:
hist(data_sw_passed.diff_pt.values, bins=60, normed=True)
pass


count of tracks


In [35]:
_, n_tracks = numpy.unique(data_sw_passed[event_id_column], return_counts=True)
hist(n_tracks, bins=60)    
title('Number of tracks')
plt.savefig('img/tracks_number.png' , format='png')


PIDs histograms


In [36]:
figure(figsize=[15, 4])
for i, column in enumerate(['PIDNNm', 'PIDNNe', 'PIDNNk']):
    subplot(1, 3, i + 1)
    hist(data_sw_passed[column].values, bins=60, range=(0, 1), label=column)
    legend()


Preselection of tracks using 2-folding training


In [37]:
base = RandomForestClassifier(n_estimators=300, max_depth=14, min_samples_leaf=100, n_jobs=8)
est_choose_lds = LabeledDataStorage(data_sw_passed, data_sw_passed.label, data_sw_passed.N_sig_sw)
est_choose_RT = FoldingClassifier(base, features=features, random_state=11)
%time est_choose_RT.fit_lds(est_choose_lds)
pass


CPU times: user 32min 32s, sys: 6.62 s, total: 32min 38s
Wall time: 4min 9s

In [38]:
est_choose_report = ClassificationReport({'RF': est_choose_RT}, est_choose_lds)


KFold prediction using folds column

ROC AUC


In [39]:
est_choose_report.compute_metric(RocAuc())


Out[39]:
OrderedDict([('RF', 0.54724144959465404)])

ROC CURVE


In [40]:
plot([0, 1], [1, 0], 'k--')
est_choose_report.roc()


Out[40]:

Feature importances


In [41]:
imp = numpy.sum([est.feature_importances_ for est in est_choose_RT.estimators], axis=0)
imp = pandas.DataFrame({'importance': imp / numpy.max(imp), 'feature': est_choose_RT.features})
imp.sort('importance', ascending=False)


Out[41]:
feature importance
18 IPs 1.000000
2 partPt 0.825593
16 IP 0.759939
17 IPerr 0.593016
9 proj 0.579082
1 diff_pt 0.579023
22 IPPU 0.417439
3 partP 0.397054
0 cos_diff_phi 0.378237
11 sum_PID_k_e 0.376548
20 max_PID_k_e 0.367478
19 veloch 0.364029
5 diff_eta 0.355193
8 sum_PID_mu_k 0.351742
23 eta 0.333296
14 PIDNNm 0.332240
7 ptB 0.321987
12 PIDNNk 0.311121
21 ghostProb 0.300048
25 max_PID_mu_k 0.298935
26 partlcs 0.288946
15 phi 0.288610
24 max_PID_mu_e 0.283888
13 sum_PID_mu_e 0.266287
10 PIDNNe 0.234471
6 EOverP 0.193428
4 nnkrec 0.047616

Distributions for output

normalizing distribution with flattener


In [42]:
from utils import plot_flattened_probs
probs = est_choose_RT.predict_proba(data_sw_passed)
flat_ss = plot_flattened_probs(probs, data_sw_passed.label.values, data_sw_passed.N_sig_sw.values, label=1)
flat_os = plot_flattened_probs(probs, data_sw_passed.label.values, data_sw_passed.N_sig_sw.values, label=0)


KFold prediction using folds column

In [43]:
hist(probs[:, 1][data_sw_passed.label.values == 1], bins=60, alpha=0.3, normed=True)
hist(probs[:, 1][data_sw_passed.label.values == 0], bins=60, alpha=0.3, normed=True)
pass


Select top-good tracks in event


In [44]:
def get_max_ids(groups, values):
    """in each group return index of object with highest value"""
    _, groups = numpy.unique(groups, return_inverse=True)
    sorter = numpy.argsort(values)
    # languages ranked by average salaries:
    values_order = numpy.argsort(sorter)

    top_order = np.zeros(groups.max() + 1, dtype=int)
    numpy.maximum.at(top_order, groups, values_order)

    return sorter[top_order]

In [45]:
## select the best tracks by estimator in event 
## we are trying to leave only those tracks which have the greatest rf prediction

def get_best_tracks(data, probs):
    data = data.copy()
    probabilities = numpy.where(data['label'] > 0, probs, 1 - probs)
    
    best_ids = get_max_ids(data[event_id_column], probabilities)
    good_tracks = data.iloc[best_ids, :]
    n_counts = numpy.bincount(best_ids, minlength=len(data))
    other_ids = numpy.where(n_counts == 0)[0]
    other_tracks = data.iloc[other_ids, :]
    
    print len(good_tracks), len(other_tracks)
    return good_tracks, other_tracks

In [46]:
def get_pair_best_tracks(data, probs):
    """Select best tracks of same and of opposite sign """
    assert (numpy.unique(data['label']) == [0, 1]).all(), 'labels should be 0, 1'
    data = data.copy()
    
    probabilities = numpy.where(data['label'] > 0, probs, 1 - probs)

    good_tracks = []
    other_tracks = []
    
    for label in [0, 1]:
        train_data = data[data['label'] == label]
        pred = probabilities[numpy.array(data['label'] == label)]
        best_ids = get_max_ids(train_data[event_id_column], pred)
        good_tracks.append(train_data.iloc[best_ids, :])
        n_counts = numpy.bincount(best_ids, minlength=len(train_data))
        other_ids = numpy.where(n_counts == 0)[0]
        other_tracks.append(train_data.iloc[other_ids, :])

    good_tracks = pandas.concat(good_tracks)
    other_tracks = pandas.concat(other_tracks)
    print len(good_tracks), len(other_tracks)
    return good_tracks, other_tracks

Train to distinguish same sign vs opposite sign


In [47]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction

In [48]:
data_sw_passed_lds = LabeledDataStorage(data_sw_passed, data_sw_passed.label, data_sw_passed.N_sig_sw.values)

XGB


In [95]:
from rep.estimators import XGBoostClassifier
xgb_base = XGBoostClassifier(max_depth=5, colsample=15, eta=0.1, subsample=0.2, n_estimators=150)
xgb_folding = FoldingClassifier(xgb_base, n_folds=2, random_state=11, ipc_profile='ssh-ipy', features=features)
%time xgb_folding.fit_lds(data_sw_passed_lds)
pass


CPU times: user 7min 20s, sys: 39 s, total: 7min 59s
Wall time: 1h 8min 6s

DT


In [52]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=1500, depth=6, pretransform_needed=True, 
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding = FoldingClassifier(tt_base, n_folds=2, random_state=11, ipc_profile='ssh-ipy', features=features)
%time tt_folding.fit_lds(data_sw_passed_lds)
pass


CPU times: user 25.5 s, sys: 2.3 s, total: 27.8 s
Wall time: 3min 40s

In [96]:
comparison_report = ClassificationReport({'tt': tt_folding, 'xgb': xgb_folding}, data_sw_passed_lds)


KFold prediction using folds column
KFold prediction using folds column

In [97]:
comparison_report.compute_metric(RocAuc())


Out[97]:
OrderedDict([('tt', 0.54728558252967374), ('xgb', 0.5439541142956712)])

In [98]:
comparison_report.roc()


Out[98]:

In [99]:
lc = comparison_report.learning_curve(RocAuc(), steps=1)


Default prediction
Default prediction

In [102]:
lc.plot()#xlim=(0, 1))



In [67]:
xgb_folding.estimators[0].get_feature_importances().sort('effect', ascending=False)


Out[67]:
effect
partPt 796
diff_pt 674
veloch 614
IP 609
IPs 603
proj 555
ptB 518
phi 513
partlcs 498
cos_diff_phi 497
diff_eta 484
partP 474
ghostProb 444
IPerr 427
IPPU 424
eta 386
sum_PID_mu_k 371
sum_PID_k_e 326
EOverP 321
sum_PID_mu_e 256
PIDNNk 200
PIDNNm 162
PIDNNe 156
max_PID_k_e 106
max_PID_mu_k 96
max_PID_mu_e 81
nnkrec 57

Forest idea


In [ ]:
temp_probs = est_choose_RT.predict_proba(data_sw_passed)[:, 1]
mask = ((flat_ss(temp_probs) < 0.2) & (data_sw_passed.label == 0)) | \
       ((flat_os(temp_probs) > 0.4) & (data_sw_passed.label == 1))


KFold prediction using folds column

In [ ]:
base = RandomForestClassifier(n_estimators=300, max_depth=14, min_samples_leaf=100, n_jobs=8)
forest_preselection_lds = LabeledDataStorage(data_sw_passed, mask * 1, data_sw_passed.N_sig_sw)
forest_preselection = FoldingClassifier(base, features=features, random_state=11)
%time forest_preselection.fit_lds(forest_preselection_lds)
pass


CPU times: user 35min 18s, sys: 468 ms, total: 35min 19s
Wall time: 1h 20min 17s

In [ ]:
report = ClassificationReport({'rf': forest_preselection}, forest_preselection_lds)


KFold prediction using folds column

In [45]:
report.compute_metric(RocAuc())


Out[45]:
OrderedDict([('rf', 0.61822769201547434)])

In [46]:
report.roc()


Out[46]:

In [47]:
report.prediction_pdf()


Out[47]:

In [48]:
prob = forest_preselection.predict_proba(data_sw_passed)[:, 1]
data_sw_passed_preselected = data_sw_passed[prob > 0.5]
data_sw_passed_not_preselected = data_sw_passed[prob <= 0.5]
prob = forest_preselection.predict_proba(data_sw_not_passed)[:, 1]
data_sw_not_passed_preselected = data_sw_not_passed[prob > 0.5]


KFold prediction using folds column
KFold prediction using folds column

In [49]:
get_events_statistics(data_sw_passed_preselected)


Out[49]:
{'Events': 143694, 'tracks': 168933}

In [50]:
data_sw_passed_preselected_lds = LabeledDataStorage(data_sw_passed_preselected, data_sw_passed_preselected.label, 
                                                    sample_weight=data_sw_passed_preselected.N_sig_sw.values)

In [ ]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, pretransform_needed=True, 
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding_preselected = FoldingClassifier(tt_base, n_folds=2, random_state=11, 
                                           ipc_profile='ssh-ipy', features=features)
%time tt_folding_preselected.fit_lds(data_sw_passed_preselected_lds)
pass

In [55]:
report = ClassificationReport({'tt': tt_folding_preselected}, data_sw_passed_preselected_lds)
report.learning_curve(RocAuc())


KFold prediction using folds column
Default prediction
Out[55]:

Forest selection


In [59]:
temp_probs = est_choose_RT.predict_proba(data_sw_passed)[:, 1]
mask = ((flat_ss(temp_probs) < 0.6) & (data_sw_passed.label == 0)) | \
       ((flat_os(temp_probs) > 0.2) & (data_sw_passed.label == 1))
data_sw_passed_rf_selected = data_sw_passed[mask]
data_sw_passed_rf_not_selected = data_sw_passed[~mask]


KFold prediction using folds column

In [60]:
get_events_statistics(data_sw_passed_rf_selected)


Out[60]:
{'Events': 351507, 'tracks': 559067}

In [61]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, pretransform_needed=True, 
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding_forest = FoldingClassifier(tt_base, n_folds=2, random_state=11, 
                                      features=features,  ipc_profile='ssh-ipy')
data_sw_passed_rf_lds = LabeledDataStorage(data_sw_passed_rf_selected, data_sw_passed_rf_selected.label, 
                                           sample_weight=data_sw_passed_rf_selected.N_sig_sw.values)
%time tt_folding_forest.fit_lds(data_sw_passed_rf_lds)
pass


CPU times: user 16.7 s, sys: 524 ms, total: 17.2 s
Wall time: 46 s

In [62]:
report = ClassificationReport({'tt + forest': tt_folding_forest}, data_sw_passed_rf_lds)
report.learning_curve(RocAuc())


KFold prediction using folds column
Default prediction
Out[62]:

Top best selection


In [63]:
temp_probs = est_choose_RT.predict_proba(data_sw_passed)[:, 1]
data_sw_passed_top_selected, data_sw_passed_top_not_selected = get_best_tracks(data_sw_passed, temp_probs)


KFold prediction using folds column
409381 333695

In [64]:
print get_events_statistics(data_sw_passed_top_selected)
print get_events_statistics(data_sw_passed_top_not_selected)


{'tracks': 409381, 'Events': 409381}
{'tracks': 333695, 'Events': 205238}

In [65]:
tt_folding_top_forest_lds = LabeledDataStorage(data_sw_passed_top_selected, data_sw_passed_top_selected.label, 
                                              sample_weight=data_sw_passed_top_selected.N_sig_sw.values)

In [66]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, pretransform_needed=True, 
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding_forest_top = FoldingClassifier(tt_base, n_folds=2, random_state=11, features=features,
                                          ipc_profile='ssh-ipy')

%time tt_folding_forest_top.fit_lds(tt_folding_top_forest_lds)
pass


CPU times: user 12.1 s, sys: 396 ms, total: 12.5 s
Wall time: 32.3 s

In [67]:
report = ClassificationReport({'tt': tt_folding_forest_top}, tt_folding_top_forest_lds)
report.learning_curve(RocAuc())


KFold prediction using folds column
Default prediction
Out[67]:

Top pair selection


In [68]:
temp_probs = est_choose_RT.predict_proba(data_sw_passed)[:, 1]
data_sw_passed_pair_selected, data_sw_passed_pair_not_selected = get_pair_best_tracks(data_sw_passed, temp_probs)


KFold prediction using folds column
542837 200239

In [69]:
print get_events_statistics(data_sw_passed_pair_selected)
print get_events_statistics(data_sw_passed_pair_not_selected)


{'tracks': 542837, 'Events': 409381}
{'tracks': 200239, 'Events': 142507}

In [70]:
tt_folding_pair_forest_lds = LabeledDataStorage(data_sw_passed_pair_selected, data_sw_passed_pair_selected.label, 
                                                sample_weight=data_sw_passed_pair_selected.N_sig_sw.values)

In [71]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, pretransform_needed=True, 
                                  max_features=15, loss=LogLossFunction(regularization=100))
tt_folding_forest_pair = FoldingClassifier(tt_base, n_folds=2, random_state=11, features=features, 
                                           ipc_profile='ssh-ipy')

%time tt_folding_forest_pair.fit_lds(tt_folding_pair_forest_lds)
pass


CPU times: user 15.4 s, sys: 536 ms, total: 15.9 s
Wall time: 40.8 s

In [72]:
report = ClassificationReport({'tt': tt_folding_forest_pair}, tt_folding_pair_forest_lds)
report.learning_curve(RocAuc())


KFold prediction using folds column
Default prediction
Out[72]:

Calibration


In [56]:
from utils import get_result_with_bootstrap_for_given_part

In [57]:
models = []

In [75]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, 
                                                       tt_folding_forest_top, 
                                                       [data_sw_passed_top_selected, 
                                                        data_sw_passed_top_not_selected, 
                                                        data_sw_not_passed], 
                                                       'rf-top-tt-iso', logistic=False))


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.587715616461 AUC with untag: 0.563118461862
mean AUC after calibration: 0.587884282279 8.20976451985e-07

In [76]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, 
                                                       tt_folding_forest_top, 
                                                      [data_sw_passed_top_selected,
                                                       data_sw_passed_top_not_selected,
                                                       data_sw_not_passed], 
                                                      'rf-top-tt-log', logistic=True))


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.584460801308 AUC with untag: 0.561112244944
mean AUC after calibration: 0.584678940681 8.0595071689e-07

In [77]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, 
                                                       tt_folding_forest_pair, 
                                                       [data_sw_passed_pair_selected, 
                                                        data_sw_passed_pair_not_selected, 
                                                        data_sw_not_passed], 
                                                       'rf-pair-top-tt-iso', logistic=False))


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.589261973862 AUC with untag: 0.563897113476
mean AUC after calibration: 0.589346221488 9.54920879733e-07

In [78]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, 
                                                       tt_folding_forest_pair, 
                                                       [data_sw_passed_pair_selected, 
                                                        data_sw_passed_pair_not_selected, 
                                                        data_sw_not_passed], 
                                                       'rf-pair-top-tt-log', logistic=True))


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.587266343777 AUC with untag: 0.562685814851
mean AUC after calibration: 0.587326330435 4.82661824719e-07

In [79]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta,
                                                       tt_folding_forest, 
                                                      [data_sw_passed_rf_selected,
                                                       data_sw_passed_rf_not_selected, 
                                                       data_sw_not_passed], 
                                                      'rf-tt-iso', logistic=False))


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.589259152789 AUC with untag: 0.563728673933
mean AUC after calibration: 0.589210476064 5.85351071109e-07

In [80]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta,
                                                       tt_folding_forest, 
                                                      [data_sw_passed_rf_selected,
                                                       data_sw_passed_rf_not_selected,
                                                       data_sw_not_passed], 
                                                      'rf-tt-log', logistic=True))


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.583949713277 AUC with untag: 0.56081572404
mean AUC after calibration: 0.584001935716 7.91650891009e-07

In [81]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding, 
                                                      [data_sw_passed, data_sw_not_passed], 'tt-iso', logistic=False))


KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.587965767715 AUC with untag: 0.562318041982
mean AUC after calibration: 0.587654036136 6.19746886299e-07

In [82]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding, 
                                                      [data_sw_passed, data_sw_not_passed], 'tt-log', logistic=True))


KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.588166631229 AUC with untag: 0.562524016679
mean AUC after calibration: 0.58813827882 8.64307980322e-07

In [83]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, xgb_folding, 
                                                      [data_sw_passed, data_sw_not_passed], 'xgb-iso', logistic=False))


KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.588136829227 AUC with untag: 0.562485354707
mean AUC after calibration: 0.587981920775 9.72006081266e-07

In [84]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, xgb_folding, 
                                                      [data_sw_passed, data_sw_not_passed], 'xgb-log', logistic=True))


KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.588144586301 AUC with untag: 0.562605527033
mean AUC after calibration: 0.587989501854 7.15698035539e-07

In [ ]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, est_choose_RT, 
                                                      [data_sw_passed, data_sw_not_passed], 'rf-iso', logistic=False))


KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.587708021135 AUC with untag: 0.562532162499
mean AUC after calibration: 0.587552498538 4.15038828649e-07

In [ ]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, est_choose_RT, 
                                                      [data_sw_passed, data_sw_not_passed], 'rf-log', logistic=True))


KFold prediction using folds column
KFold prediction using folds column

In [58]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency,
                                                       tagging_efficiency_delta, 
                                                       tt_folding_preselected, 
                                                      [data_sw_passed_preselected, data_sw_passed_not_preselected,
                                                       data_sw_not_passed], 
                                                       'forest-idea-tt-log', logistic=True))


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.581941648328 AUC with untag: 0.558921142619
mean AUC after calibration: 0.581666683019 7.09832944819e-07

In [59]:
N_B_passed = float(get_events_number(data_sw_passed_preselected)) + float(get_events_number(data_sw_not_passed_preselected))
tagging_efficiency_preselected = N_B_passed / get_N_B_events()
tagging_efficiency_preselected_delta = sqrt(N_B_passed) / get_N_B_events()
tagging_efficiency_preselected, tagging_efficiency_preselected_delta


Out[59]:
(0.23614535442880896, 0.00056381154067846944)

In [60]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency_preselected,
                                                       tagging_efficiency_preselected_delta, 
                                                       tt_folding_preselected, 
                                                      [data_sw_passed_preselected, data_sw_not_passed_preselected], 
                                                       'preselected-tt-log', logistic=True))


KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.673231470006 AUC with untag: 0.551742390309
mean AUC after calibration: 0.673206066419 1.36174151028e-06

Comparison table of different models


In [61]:
pandas.set_option('display.precision', 8)
result = pandas.concat(models)
result.index = result.name
result.drop('name', axis=1)


Out[61]:
$\epsilon_{tag}, \%$ $\Delta \epsilon_{tag}, \%$ $D^2$ $\Delta D^2$ $\epsilon, \%$ $\Delta \epsilon, \%$ AUC, with untag $\Delta$ AUC, with untag
name
forest-idea-tt-log 66.9482289 0.0949323 0.0274507 0.0004126 1.8377762 0.0277433 55.8921143 0
preselected-tt-log 23.6145354 0.0563812 0.0874277 0.0010933 2.0645653 0.0262831 55.1742390 0

In [91]:
pandas.set_option('display.precision', 8)
result = pandas.concat(models)
result.index = result.name
result.drop('name', axis=1)


Out[91]:
$\epsilon_{tag}, \%$ $\Delta \epsilon_{tag}, \%$ $D^2$ $\Delta D^2$ $\epsilon, \%$ $\Delta \epsilon, \%$ AUC, with untag $\Delta$ AUC, with untag
name
rf-top-mn-iso 66.9482289 0.0949323 0.0257898 0.0005404 1.7265782 0.0362612 56.3118462 0
rf-top-mn-log 66.9482289 0.0949323 0.0259220 0.0003973 1.7354313 0.0267115 56.1112245 0
rf-pair-top-mn-iso 66.9482289 0.0949323 0.0289846 0.0005979 1.9404681 0.0401220 56.3897113 0
rf-pair-top-mn-log 66.9482289 0.0949323 0.0294079 0.0003707 1.9688047 0.0249756 56.2685815 0
rf-mn-iso 66.9482289 0.0949323 0.0292616 0.0005990 1.9590100 0.0401996 56.3728674 0
rf-mn-log 66.9482289 0.0949323 0.0242828 0.0003985 1.6256921 0.0267816 56.0815724 0
tt-iso 66.9482289 0.0949323 0.0296822 0.0006424 1.9871713 0.0431011 56.2318042 0
tt-log 66.9482289 0.0949323 0.0326286 0.0004945 2.1844240 0.0332512 56.2524017 0
mn-iso 66.9482289 0.0949323 0.0298982 0.0005886 2.0016335 0.0395100 56.2485355 0
mn-log 66.9482289 0.0949323 0.0327318 0.0004888 2.1913367 0.0328745 56.2605527 0
rf-iso 66.9482289 0.0949323 0.0288403 0.0005863 1.9308101 0.0393492 56.2532162 0
rf-log 66.9482289 0.0949323 0.0313808 0.0004688 2.1008859 0.0315271 56.2400785 0
forest-idea-tt-log 66.9482289 0.0949323 0.0313231 0.0004076 2.0970261 0.0274528 56.2169638 0
preselected-tt-log 45.7172359 0.0784484 0.0455594 0.0005787 2.0828480 0.0266991 55.4594955 0

Implementing best tracking


In [63]:
from utils import prepare_B_data_for_given_part

In [93]:
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data_sw_passed, data_sw_not_passed], logistic=True)


KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.588166631229 AUC with untag: 0.562524016679

In [94]:
Bdata_prepared.to_csv('models/Bdata_tracks.csv', header=True, index=False)

In [64]:
Bdata_prepared_idea = prepare_B_data_for_given_part(tt_folding_preselected, 
                                                    [data_sw_passed_preselected, data_sw_passed_not_preselected,
                                                    data_sw_not_passed], 
                                                    logistic=True)


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
AUC for tagged: 0.581941648328 AUC with untag: 0.558921142619

In [67]:
Bdata_prepared_idea.to_csv('models/Bdata_tracks_idea.csv', header=True, index=False)