In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [ ]:
import sys
sys.path.insert(0, "../")

Import


In [2]:
import pandas
import numpy

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score

from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc

In [3]:
from utils import get_events_number, get_events_statistics

Reading initial data


In [4]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('datasets/MC/csv/Bu_JPsiK/Vertices_Mike.root', selection='vcharge > 0.2'))

Define label

label = signB * signVtx > 0

  • same sign of B and vtx -> label = 1
  • opposite sign of B and vtx -> label = 0

In [5]:
event_id_column = 'event_id'
event_id = data.run.apply(str) + '_' + data.event.apply(str)
data['group_column'] = numpy.unique(event_id, return_inverse=True)[1]

# all weights are 1, because this is MC
data['N_sig_sw'] = 1
data[event_id_column] = event_id

In [6]:
data.head()


Out[6]:
run event No mult nnkrec Bmass ptB vflag ptmean ipsmean ... W_M_svtau M_pointtheta W_M_pointtheta docamax signB signVtx N_sig_sw BOosc group_column event_id
0 2517778 1110822 0 1 4 5.27306 5.05964 2 0.851762 55.706501 ... 7.349960 0.046197 0.044500 0.000000 1 -1 1 1 320473 2517778_1110822
1 2517778 1110824 0 1 4 5.27489 11.70600 2 1.564780 8.629360 ... 166.444000 0.067733 0.012434 0.000000 1 1 1 0 320474 2517778_1110824
2 2517778 1110869 0 1 1 5.27815 2.78838 2 1.764510 5.588500 ... 128503.000000 0.190129 0.017458 0.000000 1 -1 1 0 320475 2517778_1110869
3 2517778 1110880 0 1 2 5.28571 8.92569 6 0.881253 6.912300 ... 227.988007 0.029884 0.003415 0.085911 1 1 1 0 320476 2517778_1110880
4 2517778 1110894 0 1 3 5.27156 10.75980 4 0.926523 12.084800 ... 57.450001 0.035372 0.010057 0.014691 -1 1 1 0 320477 2517778_1110894

5 rows × 26 columns


In [7]:
get_events_statistics(data)


Out[7]:
{'Events': 550720, 'parts': 552781}

In [8]:
import json
with open('models/JPsiKMC.json', 'r') as f:
    N_B_events = json.load(f)['N_B_events']
N_B_events


Out[8]:
1488891.0

In [9]:
N_pass = get_events_number(data)
tagging_efficiency = 1. * N_pass / N_B_events
tagging_efficiency_delta = sqrt(N_pass) / N_B_events
print tagging_efficiency, tagging_efficiency_delta


0.369886042699 0.000498428101082

In [10]:
Bdata_tracks = pandas.read_csv('models/Bdata_tracks_MC.csv')

In [11]:
Bdata_tracks.index = Bdata_tracks.event_id

In [12]:
data['initial_pred'] = Bdata_tracks.ix[data.event_id, 'track_relation_prob'].values

In [13]:
data.ix[numpy.isnan(data['initial_pred'].values), 'initial_pred'] = 1.

Define features


In [19]:
data.columns


Out[19]:
Index([u'run', u'event', u'No', u'mult', u'nnkrec', u'Bmass', u'ptB', u'vflag',
       u'ptmean', u'ipsmean', u'vcharge', u'svm', u'svp', u'M_BDphiDir',
       u'W_M_BDphiDir', u'M_svtau', u'W_M_svtau', u'M_pointtheta',
       u'W_M_pointtheta', u'docamax', u'signB', u'signVtx', u'N_sig_sw',
       u'BOosc', u'group_column', u'event_id', u'initial_pred'],
      dtype='object')

In [20]:
features =  ['mult', 'nnkrec', 'ptB', 'ipsmean', 'ptmean', 'vcharge', 
             'svm', 'svp', 'M_BDphiDir', 'M_svtau', 'docamax']

In [15]:
data_lds = LabeledDataStorage(data, 1 * (data.signB * data.signVtx > 0), data.N_sig_sw)

Training


In [16]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction

In [17]:
from hep_ml.losses import HessianLossFunction
from hep_ml.commonutils import check_sample_weight
from scipy.special import expit

class LogLossFunctionTagging(HessianLossFunction):
    """Logistic loss function (logloss), aka binomial deviance, aka cross-entropy,
       aka log-likelihood loss.
    """  
    def fit(self, X, y, sample_weight):
        self.sample_weight = check_sample_weight(y, sample_weight=sample_weight,
                                                 normalize=True, normalize_by_class=True)
        self.initial_pred = numpy.log(X['initial_pred'].values)
        self.y_signed = 2 * y - 1
        self.minus_y_signed = - self.y_signed
        self.y_signed_times_weights = self.y_signed * self.sample_weight
        HessianLossFunction.fit(self, X, y, sample_weight=self.sample_weight)
        return self

    def __call__(self, y_pred):
        y_pred = y_pred + self.initial_pred
        return numpy.sum(self.sample_weight * numpy.logaddexp(0, self.minus_y_signed * y_pred))

    def negative_gradient(self, y_pred):
        y_pred = y_pred + self.initial_pred
        return self.y_signed_times_weights * expit(self.minus_y_signed * y_pred)

    def hessian(self, y_pred):
        y_pred = y_pred + self.initial_pred
        expits = expit(y_pred)
        return self.sample_weight * expits * (1 - expits)

    def prepare_tree_params(self, y_pred):
        y_pred = y_pred + self.initial_pred
        return self.y_signed * expit(self.minus_y_signed * y_pred), self.sample_weight

In [45]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6, 
                                  max_features=8,
                                  loss=LogLossFunctionTagging(regularization=100), train_features=features)
tt_folding = FoldingClassifier(tt_base, n_folds=2, random_state=11,
                               features=features + ['initial_pred'])
%time tt_folding.fit_lds(data_lds)
pass


CPU times: user 4min 34s, sys: 472 ms, total: 4min 35s
Wall time: 3min 36s

In [46]:
from scipy.special import expit, logit

In [47]:
p = tt_folding.predict_proba(data)[:, 1]
roc_auc_score(data.signB.values,
              log(data['initial_pred'].values) + logit(p) * data.signB.values,
              sample_weight=data.N_sig_sw.values)


KFold prediction using folds column
Out[47]:
0.68290766983758322

Report for all vtx


In [48]:
report = ClassificationReport({'tt': tt_folding}, data_lds)


KFold prediction using folds column

In [49]:
report.learning_curve(RocAuc())


KFold prediction using folds column
Out[49]:

In [50]:
report.compute_metric(RocAuc())


Out[50]:
OrderedDict([('tt', 0.57122015644127921)])

In [51]:
report.roc()


Out[51]:

In [53]:
tt_folding.estimators[0].estimators = tt_folding.estimators[0].estimators[:2500]
tt_folding.estimators[1].estimators = tt_folding.estimators[1].estimators[:2500]

Calibrating results $p(\text{vrt same sign}|B)$ and combining them


In [29]:
models = []

In [30]:
from utils import get_result_with_bootstrap_for_given_part

In [32]:
data['label'] =  1 * (data.signB * data.signVtx > 0)

In [ ]:
result = get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding, 
                                                  [data], 'tt-log', logistic=True, N_B_events=N_B_events, 
                                                  n_calibrations=30, sign_part_column='signVtx', part_name='vertex')
result


KFold prediction using folds column
AUC for tagged: 0.605187162372 AUC with untag: 0.546709330325

In [33]:
result = get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding, 
                                                  [data], 'tt-log', logistic=True, N_B_events=N_B_events, 
                                                  n_calibrations=30, sign_part_column='signVtx', part_name='vertex')
result


KFold prediction using folds column
AUC for tagged: 0.60511096162 AUC with untag: 0.546616378757
mean AUC after calibration: 0.604944329219 2.68784269792e-09
Out[33]:
name $\epsilon_{tag}, \%$ $\Delta \epsilon_{tag}, \%$ $D^2$ $\Delta D^2$ $\epsilon, \%$ $\Delta \epsilon, \%$ AUC, with untag $\Delta$ AUC, with untag
0 tt-log 36.988604 0.049843 0.038955 0.000025 1.440907 0.002147 54.661638 0

Implementing the best vertex model

and saving its predictions


In [34]:
from utils import prepare_B_data_for_given_part

In [36]:
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data], N_B_events, logistic=True,
                                               sign_part_column='signVtx', part_name='vertex')


KFold prediction using folds column
AUC for tagged: 0.60511096162 AUC with untag: 0.546616378757

In [37]:
Bdata_prepared.to_csv('models/Bdata_vertex_MC_loss.csv', header=True, index=False)