In [1]:
%pylab inline
In [ ]:
import sys
sys.path.insert(0, "../")
In [2]:
import pandas
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
In [3]:
from utils import get_events_number, get_events_statistics
In [4]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('datasets/MC/csv/Bu_JPsiK/Vertices_Mike.root', selection='vcharge > 0.2'))
In [5]:
event_id_column = 'event_id'
event_id = data.run.apply(str) + '_' + data.event.apply(str)
data['group_column'] = numpy.unique(event_id, return_inverse=True)[1]
# all weights are 1, because this is MC
data['N_sig_sw'] = 1
data[event_id_column] = event_id
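A toy illustration (hypothetical values, not taken from the dataset) of what the numpy.unique(..., return_inverse=True)[1] trick above does: it maps each distinct run/event string to a compact integer id that is identical for all rows belonging to the same event.
ids = numpy.array(['20_1001', '20_1002', '20_1001', '21_1001'])
print(numpy.unique(ids, return_inverse=True)[1])   # -> [0 1 0 2]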
In [6]:
data.head()
Out[6]:
In [7]:
get_events_statistics(data)
Out[7]:
In [8]:
import json
with open('models/JPsiKMC.json', 'r') as f:
    N_B_events = json.load(f)['N_B_events']
N_B_events
Out[8]:
In [9]:
N_pass = get_events_number(data)
tagging_efficiency = 1. * N_pass / N_B_events
tagging_efficiency_delta = sqrt(N_pass) / N_B_events
print tagging_efficiency, tagging_efficiency_delta
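Spelled out, and treating $N_{\mathrm{pass}}$ as Poisson-distributed (a standard assumption, not stated explicitly above), the two numbers printed are
$$\epsilon_{\mathrm{tag}} = \frac{N_{\mathrm{pass}}}{N_B}, \qquad \Delta\epsilon_{\mathrm{tag}} = \frac{\sqrt{N_{\mathrm{pass}}}}{N_B}.$$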
In [10]:
Bdata_tracks = pandas.read_csv('models/Bdata_tracks_MC.csv')
In [11]:
Bdata_tracks.index = Bdata_tracks.event_id
In [12]:
data['initial_pred'] = Bdata_tracks.ix[data.event_id, 'track_relation_prob'].values
In [13]:
data.ix[numpy.isnan(data['initial_pred'].values), 'initial_pred'] = 1.
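Events with no entry in Bdata_tracks come out of the reindexing above as NaN; setting them to 1 makes log(initial_pred) = 0, i.e. no prior offset in the custom loss defined below. A fillna-based equivalent (an assumption about the intent, not the author's code):
# same effect as the .ix assignment above: missing prior -> neutral offset (log 1 == 0)
data['initial_pred'] = data['initial_pred'].fillna(1.)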
In [19]:
data.columns
Out[19]:
In [20]:
features = ['mult', 'nnkrec', 'ptB', 'ipsmean', 'ptmean', 'vcharge',
            'svm', 'svp', 'M_BDphiDir', 'M_svtau', 'docamax']
In [15]:
data_lds = LabeledDataStorage(data, 1 * (data.signB * data.signVtx > 0), data.N_sig_sw)
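The training target is whether the reconstructed vertex charge sign agrees with the true B sign. As a quick sanity sketch (a hypothetical check, not in the original), the class balance of that target can be inspected directly:
# fraction of events where the vertex sign matches the B sign (the positive class above)
print((1 * (data.signB * data.signVtx > 0)).mean())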
In [16]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction
In [17]:
from hep_ml.losses import HessianLossFunction
from hep_ml.commonutils import check_sample_weight
from scipy.special import expit
class LogLossFunctionTagging(HessianLossFunction):
    """Logistic loss function (logloss, aka binomial deviance, cross-entropy,
    log-likelihood loss) with a per-event prior offset: predictions are shifted
    by log(X['initial_pred']) before the loss and its derivatives are evaluated.
    """
    def fit(self, X, y, sample_weight):
        self.sample_weight = check_sample_weight(y, sample_weight=sample_weight,
                                                 normalize=True, normalize_by_class=True)
        # prior offset taken from the track-based prediction column
        self.initial_pred = numpy.log(X['initial_pred'].values)
        self.y_signed = 2 * y - 1
        self.minus_y_signed = - self.y_signed
        self.y_signed_times_weights = self.y_signed * self.sample_weight
        HessianLossFunction.fit(self, X, y, sample_weight=self.sample_weight)
        return self

    def __call__(self, y_pred):
        # loss value: sum_i w_i * log(1 + exp(-y_i * (d_i + b_i)))
        y_pred = y_pred + self.initial_pred
        return numpy.sum(self.sample_weight * numpy.logaddexp(0, self.minus_y_signed * y_pred))

    def negative_gradient(self, y_pred):
        y_pred = y_pred + self.initial_pred
        return self.y_signed_times_weights * expit(self.minus_y_signed * y_pred)

    def hessian(self, y_pred):
        y_pred = y_pred + self.initial_pred
        expits = expit(y_pred)
        return self.sample_weight * expits * (1 - expits)

    def prepare_tree_params(self, y_pred):
        # residuals and weights handed to the tree-building step
        y_pred = y_pred + self.initial_pred
        return self.y_signed * expit(self.minus_y_signed * y_pred), self.sample_weight
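In formulas, with $b_i = \ln(\mathrm{initial\_pred}_i)$ the per-event prior offset, $y_i \in \{-1, +1\}$ the signed label, $w_i$ the normalized weights and $\sigma$ the logistic sigmoid (expit), the class above implements
$$L(d) = \sum_i w_i \ln\!\left(1 + e^{-y_i (d_i + b_i)}\right), \qquad
-\frac{\partial L}{\partial d_i} = w_i\, y_i\, \sigma\!\bigl(-y_i (d_i + b_i)\bigr), \qquad
\frac{\partial^2 L}{\partial d_i^2} = w_i\, \sigma(d_i + b_i)\bigl(1 - \sigma(d_i + b_i)\bigr),$$
i.e. the usual logloss, except that every prediction is shifted by the logarithm of the track-based prior before the loss and its derivatives are computed.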
In [45]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=3000, depth=6,
                                  max_features=8,
                                  loss=LogLossFunctionTagging(regularization=100), train_features=features)
tt_folding = FoldingClassifier(tt_base, n_folds=2, random_state=11,
                               features=features + ['initial_pred'])
%time tt_folding.fit_lds(data_lds)
pass
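FoldingClassifier trains one DecisionTrain classifier per fold (2 here) and, when asked to predict the training sample, returns predictions from the complementary fold, so the ROC estimates below are not in-sample. A minimal sketch of that idea with plain numpy and any scikit-learn classifier (the helper name is hypothetical, and rep's actual implementation differs in details):
# Illustration only (simplified sketch, not rep's implementation): out-of-fold
# predictions mean every event is scored by a classifier that never saw it in training.
def out_of_fold_proba(make_clf, X, y, n_folds=2, random_state=11):
    rng = numpy.random.RandomState(random_state)
    fold = rng.randint(0, n_folds, size=len(X))           # assign each row to a fold
    proba = numpy.zeros(len(X))
    for k in range(n_folds):
        train, test = fold != k, fold == k
        clf = make_clf().fit(X[train], y[train])          # train on the other fold(s)
        proba[test] = clf.predict_proba(X[test])[:, 1]    # score the held-out fold
    return proba

# e.g.: out_of_fold_proba(lambda: RandomForestClassifier(n_estimators=50),
#                         data[features].values, (data.signB * data.signVtx > 0).values)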
In [46]:
from scipy.special import expit, logit
In [47]:
p = tt_folding.predict_proba(data)[:, 1]
roc_auc_score(data.signB.values,
              log(data['initial_pred'].values) + logit(p) * data.signB.values,
              sample_weight=data.N_sig_sw.values)
Out[47]:
In [48]:
report = ClassificationReport({'tt': tt_folding}, data_lds)
In [49]:
report.learning_curve(RocAuc())
Out[49]:
In [50]:
report.compute_metric(RocAuc())
Out[50]:
In [51]:
report.roc()
Out[51]:
In [53]:
tt_folding.estimators[0].estimators = tt_folding.estimators[0].estimators[:2500]
tt_folding.estimators[1].estimators = tt_folding.estimators[1].estimators[:2500]
In [29]:
models = []
In [30]:
from utils import get_result_with_bootstrap_for_given_part
In [32]:
data['label'] = 1 * (data.signB * data.signVtx > 0)
In [33]:
result = get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
                                                  [data], 'tt-log', logistic=True, N_B_events=N_B_events,
                                                  n_calibrations=30, sign_part_column='signVtx', part_name='vertex')
result
Out[33]:
In [34]:
from utils import prepare_B_data_for_given_part
In [36]:
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data], N_B_events, logistic=True,
                                               sign_part_column='signVtx', part_name='vertex')
In [37]:
Bdata_prepared.to_csv('models/Bdata_vertex_MC_loss.csv', header=True, index=False)