In [1]:
# NOTE(review): %pylab star-imports numpy and matplotlib names into the notebook
# namespace; any bare numeric/plotting name used later without an explicit
# import resolves through this line.
%pylab inline
In [ ]:
import sys
# Put the parent directory first on the module search path.
# NOTE(review): presumably this is what makes the bare `utils`, `folding_group`
# and `decisiontrain` imports used later resolvable — confirm against the repo layout.
sys.path.insert(0, "../")
In [2]:
import pandas
import numpy
from sklearn.metrics import roc_curve, roc_auc_score
from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
In [3]:
from utils import get_N_B_events, get_events_number, get_events_statistics
In [4]:
import root_numpy
# Read the ROOT file into an array and wrap it in a DataFrame.
vertices_root_path = '../datasets/MC/csv/Bu_JPsiK/Vertices_Mike.root'
data = pandas.DataFrame(root_numpy.root2array(vertices_root_path))
In [5]:
# Preview the first rows of the loaded frame.
data.head()
Out[5]:
In [6]:
# Number of rows in the loaded frame.
len(data)
Out[6]:
In [7]:
# Column that stores the per-row event key.
event_id_column = 'event_id'

# Event key is the string "<run>_<event>".
run_as_text = data['run'].apply(str)
event_as_text = data['event'].apply(str)
event_id = run_as_text + '_' + event_as_text

# Integer code of each row's event key among the distinct keys.
event_group_codes = numpy.unique(event_id, return_inverse=True)[1]
data['group_column'] = event_group_codes

# all weights are 1, because this is MC
data['N_sig_sw'] = 1

data[event_id_column] = event_id
In [8]:
# Display whatever summary utils.get_events_statistics produces for the frame.
get_events_statistics(data)
Out[8]:
In [9]:
import json

# Read the 'N_B_events' entry from the model's JSON file.
# The assignment must sit inside the `with` body: left unindented, the cell
# fails with an IndentationError before the file is even opened.
with open('../models/JPsiKMC.json', 'r') as f:
    N_B_events = json.load(f)['N_B_events']
In [10]:
# Show the value read from the JSON file.
N_B_events
Out[10]:
In [11]:
# Event count returned by utils.get_events_number for the frame.
N_pass = get_events_number(data)

# Efficiency is N_pass / N_B_events; `1. *` forces float division under Python 2.
tagging_efficiency = 1. * N_pass / N_B_events

# Uncertainty is taken as sqrt(N_pass) / N_B_events.
# numpy.sqrt is called explicitly so the cell does not depend on the bare
# `sqrt` that only exists through the `%pylab` star import.
tagging_efficiency_delta = numpy.sqrt(N_pass) / N_B_events

# A single pre-formatted string prints the same "<eff> <delta>" text under the
# Python 2 print statement and the Python 3 print function.
print('%s %s' % (tagging_efficiency, tagging_efficiency_delta))
In [12]:
# List the column names available in the frame.
data.columns
Out[12]:
In [13]:
# Column names passed to the folding classifier as its training features.
features = [
    'mult',
    u'nnkrec',
    u'ptB',
    u'vflag',
    u'ptmean',
    u'ipsmean',
    u'vcharge',
    u'svm',
    u'svp',
    u'M_BDphiDir',
    u'W_M_BDphiDir',
    u'M_svtau',
    u'W_M_svtau',
    u'M_pointtheta',
    u'W_M_pointtheta',
    u'docamax',
]
In [14]:
from utils import compute_sum_of_charges
In [15]:
# Run utils.compute_sum_of_charges on the frame, keyed by the event-id column
# and using 'signVtx' as the sign column; the returned value is the cell output.
compute_sum_of_charges(
    data,
    'Vertices',
    bins=100,
    event_id_column=event_id_column,
    sign_part_column='signVtx')
Out[15]:
In [16]:
# Label is 1 where signB * signVtx is strictly positive, 0 otherwise.
signs_agree = (data['signB'].values * data['signVtx'].values) > 0
data.loc[:, 'label'] = signs_agree * 1
In [17]:
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
from folding_group import FoldingGroupClassifier
In [18]:
# Pair the full frame with its 'label' column as the target.
label_values = data['label'].values
data_sw_passed_lds = LabeledDataStorage(data, label_values)
In [19]:
tt_base = DecisionTrainClassifier(learning_rate=0.01, n_estimators=10000, depth=6,
max_features=12, n_threads=12)
tt_folding = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
train_features=features, group_feature='group_column')
%time tt_folding.fit_lds(data_sw_passed_lds)
pass
In [20]:
# Build a REP classification report for the folded model, registered as 'dt'.
classifiers_to_report = {'dt': tt_folding}
report = ClassificationReport(classifiers_to_report, data_sw_passed_lds)
In [21]:
# Learning curve of the ROC AUC metric.
# NOTE(review): steps=1 presumably evaluates every stage — confirm in the REP docs.
report.learning_curve(RocAuc(), steps=1)
Out[21]:
In [22]:
# Compute the ROC AUC metric through the report.
report.compute_metric(RocAuc())
Out[22]:
In [23]:
# Draw the ROC curve from the report.
report.roc()
Out[23]:
In [24]:
# Show the feature importances from the report.
report.feature_importance()
Out[24]:
In [25]:
# Keep only the first N_TREES_KEPT members of every fold's ensemble.
# Looping over all fold estimators replaces the copy-pasted statements for
# folds 0 and 1, so no fold is silently left untruncated if n_folds changes.
N_TREES_KEPT = 5000
for fold_index in range(len(tt_folding.estimators)):
    fold_clf = tt_folding.estimators[fold_index].clf
    # Slicing is idempotent: re-running the cell leaves the ensembles unchanged.
    fold_clf.estimators = fold_clf.estimators[:N_TREES_KEPT]
In [26]:
# Empty accumulator for the per-configuration result tables.
models = list()
In [27]:
from utils import get_result_with_bootstrap_for_given_part
In [ ]:
# Evaluate the vertex tagger via utils.get_result_with_bootstrap_for_given_part
# with logistic_combined=False, then store the result.
# Note: every run of this cell appends one more entry to `models`.
inclusive_vertex_result = get_result_with_bootstrap_for_given_part(
    tagging_efficiency,
    tagging_efficiency_delta,
    tt_folding,
    [data],
    N_B_events=N_B_events,
    logistic=True,
    name="inclusive vertex",
    n_calibrations=30,
    sign_part_column='signVtx',
    part_name='vertex',
    logistic_combined=False)
models.append(inclusive_vertex_result)
In [33]:
# Evaluate the vertex tagger via utils.get_result_with_bootstrap_for_given_part
# with logistic_combined=True, then store the result.
# Note: every run of this cell appends one more entry to `models`.
inclusive_vertex_logistic_result = get_result_with_bootstrap_for_given_part(
    tagging_efficiency,
    tagging_efficiency_delta,
    tt_folding,
    [data],
    N_B_events=N_B_events,
    logistic=True,
    name="inclusive vertex, logistic",
    n_calibrations=30,
    sign_part_column='signVtx',
    part_name='vertex',
    logistic_combined=True)
models.append(inclusive_vertex_logistic_result)
In [34]:
# Stack the collected results into one table for display.
# Assumes every entry in `models` is a pandas object — TODO confirm.
pandas.concat(models)
Out[34]:
In [35]:
from utils import prepare_B_data_for_given_part
In [36]:
# Build the per-B table for the vertex part via utils.prepare_B_data_for_given_part.
Bdata_prepared = prepare_B_data_for_given_part(
    tt_folding,
    [data],
    logistic=True,
    sign_part_column='signVtx',
    N_B_events=N_B_events,
    part_name='vertex')
In [37]:
# Write the prepared table to CSV with a header row and without the index column.
bdata_csv_path = '../models/Bdata_vertices_MC_jets.csv'
Bdata_prepared.to_csv(bdata_csv_path, header=True, index=False)