In [2]:
%pylab inline
In [3]:
import pandas
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
In [4]:
from utils import get_N_B_events, get_events_number, get_events_statistics
In [5]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('datasets/1016_vtx.root'))
In [6]:
event_id_column = 'event_id'
data[event_id_column] = data.runNum.apply(str) + '_' + (data.evtNum.apply(int)).apply(str)
# reconstructing sign of B
data['signB'] = data.tagAnswer * (2 * data.iscorrect - 1)
# assure sign is +1 or -1
data['signVtx'] = (data.signVtx.values > 0) * 2 - 1
data['label'] = (data.signVtx.values * data.signB.values > 0) * 1
In [7]:
data.head()
Out[7]:
In [8]:
get_events_statistics(data)
Out[8]:
In [9]:
N_pass = get_events_number(data)
tagging_efficiency = 1. * N_pass / get_N_B_events()
tagging_efficiency_delta = sqrt(N_pass) / get_N_B_events()
print tagging_efficiency, tagging_efficiency_delta
In [10]:
Bdata_tracks = pandas.read_csv('models/Bdata_tracks_PID_less.csv')
In [11]:
Bdata_tracks.index = Bdata_tracks.event_id
In [12]:
data['initial_pred'] = Bdata_tracks.ix[data.event_id, 'track_relation_prob'].values
In [13]:
sweight_threshold = 1.
data_sw_passed = data[data.N_sig_sw > sweight_threshold]
data_sw_not_passed = data[data.N_sig_sw <= sweight_threshold]
get_events_statistics(data_sw_passed)
Out[13]:
In [14]:
features = ['mult', 'nnkrec', 'ptB', 'vflag', 'ipsmean', 'ptmean', 'vcharge',
'svm', 'svp', 'BDphiDir', 'svtau', 'docamax']
In [15]:
data_sw_passed_lds = LabeledDataStorage(data_sw_passed, data_sw_passed.label, data_sw_passed.N_sig_sw)
In [17]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction
In [39]:
from hep_ml.losses import HessianLossFunction
from hep_ml.commonutils import check_sample_weight
from scipy.special import expit
class LogLossFunctionTagging(HessianLossFunction):
"""Logistic loss function (logloss), aka binomial deviance, aka cross-entropy,
aka log-likelihood loss.
"""
def fit(self, X, y, sample_weight):
self.sample_weight = check_sample_weight(y, sample_weight=sample_weight,
normalize=True, normalize_by_class=True)
self.initial_pred = numpy.log(X['initial_pred'].values)
self.y_signed = 2 * y - 1
self.minus_y_signed = - self.y_signed
self.y_signed_times_weights = self.y_signed * self.sample_weight
HessianLossFunction.fit(self, X, y, sample_weight=self.sample_weight)
return self
def __call__(self, y_pred):
y_pred = y_pred + self.initial_pred
return numpy.sum(self.sample_weight * numpy.logaddexp(0, self.minus_y_signed * y_pred))
def negative_gradient(self, y_pred):
y_pred = y_pred + self.initial_pred
return self.y_signed_times_weights * expit(self.minus_y_signed * y_pred)
def hessian(self, y_pred):
y_pred = y_pred + self.initial_pred
expits = expit(y_pred)
return self.sample_weight * expits * (1 - expits)
def prepare_tree_params(self, y_pred):
y_pred = y_pred + self.initial_pred
return self.y_signed * expit(self.minus_y_signed * y_pred), self.sample_weight
In [58]:
Out[58]:
In [40]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=1500, depth=6,
max_features=8, loss=LogLossFunctionTagging(regularization=100), train_features=features)
tt_folding = FoldingClassifier(tt_base, n_folds=2, random_state=11,
features=features + ['initial_pred'])
%time tt_folding.fit_lds(data_sw_passed_lds)
pass
In [61]:
In [62]:
from scipy.special import expit, logit
In [72]:
data_temp = data_sw_not_passed
In [73]:
print roc_auc_score(data_temp.signB.values, data_temp['initial_pred'].values, sample_weight=data_temp.N_sig_sw.values)
In [75]:
p = tt_folding.predict_proba(data_temp)[:, 1]
print roc_auc_score(data_temp.signB.values,
log(data_temp['initial_pred'].values) + logit(p) * data_temp.signB.values,
sample_weight=data_temp.N_sig_sw.values)
In [66]:
hist(tt_folding.estimators[0].loss.initial_pred, )
pass
In [43]:
report = ClassificationReport({'tt': tt_folding}, data_sw_passed_lds)
In [44]:
report.learning_curve(RocAuc())
Out[44]:
In [45]:
report.compute_metric(RocAuc())
Out[45]:
In [46]:
report.roc()
Out[46]:
In [47]:
models = []
In [48]:
from utils import get_result_with_bootstrap_for_given_part
In [49]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
[data_sw_passed, data_sw_not_passed],
logistic=True, name="tt-log",
sign_part_column='signVtx', part_name='vertex'))
In [50]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
[data_sw_passed, data_sw_not_passed],
logistic=False, name="tt-iso",
sign_part_column='signVtx', part_name='vertex'))
In [51]:
pandas.concat(models)
Out[51]:
In [31]:
pandas.concat(models)
Out[31]:
In [52]:
from utils import prepare_B_data_for_given_part
In [53]:
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data_sw_passed, data_sw_not_passed], logistic=False,
sign_part_column='signVtx', part_name='vertex')
In [54]:
Bdata_prepared.to_csv('models/Bdata_vertex_new_loss.csv', header=True, index=False)
In [ ]: