In [1]:
%pylab inline
In [2]:
import pandas
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
In [3]:
from utils import get_N_B_events, get_events_number, get_events_statistics
In [4]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('datasets/1016_vtx.root'))
In [5]:
event_id_column = 'event_id'
data[event_id_column] = data.runNum.apply(str) + '_' + (data.evtNum.apply(int)).apply(str)
# reconstructing sign of B
data['signB'] = data.tagAnswer * (2 * data.iscorrect - 1)
# assure sign is +1 or -1
data['signVtx'] = (data.signVtx.values > 0) * 2 - 1
data['label'] = (data.signVtx.values * data.signB.values > 0) * 1
In [6]:
data.head()
Out[6]:
In [7]:
get_events_statistics(data)
Out[7]:
In [8]:
N_pass = get_events_number(data)
tagging_efficiency = 1. * N_pass / get_N_B_events()
tagging_efficiency_delta = sqrt(N_pass) / get_N_B_events()
print tagging_efficiency, tagging_efficiency_delta
In [9]:
sweight_threshold = 1.
data_sw_passed = data[data.N_sig_sw > sweight_threshold]
data_sw_not_passed = data[data.N_sig_sw <= sweight_threshold]
get_events_statistics(data_sw_passed)
Out[9]:
In [10]:
features = ['mult', 'nnkrec', 'ptB', 'vflag', 'ipsmean', 'ptmean', 'vcharge',
'svm', 'svp', 'BDphiDir', 'svtau', 'docamax']
In [11]:
data_sw_passed_lds = LabeledDataStorage(data_sw_passed, data_sw_passed.label, data_sw_passed.N_sig_sw)
In [12]:
base = RandomForestClassifier(n_estimators=300, max_depth=8, min_samples_leaf=50, n_jobs=4)
est_choose_RT = FoldingClassifier(base, features=features, random_state=13)
%time est_choose_RT.fit_lds(data_sw_passed_lds)
pass
In [13]:
report = ClassificationReport({'rf': est_choose_RT}, data_sw_passed_lds)
In [14]:
report.compute_metric(RocAuc())
Out[14]:
In [15]:
plot([0, 1], [1, 0], 'k--')
report.roc()
Out[15]:
In [16]:
imp = numpy.sum([est.feature_importances_ for est in est_choose_RT.estimators], axis=0)
imp = pandas.DataFrame({'importance': imp, 'feature': est_choose_RT.features})
imp.sort('importance', ascending=False)
Out[16]:
In [17]:
from utils import plot_flattened_probs
In [18]:
probs = est_choose_RT.predict_proba(data_sw_passed)
flat_ss = plot_flattened_probs(probs, data_sw_passed.label.values, data_sw_passed.N_sig_sw.values, label=1)
flat_os = plot_flattened_probs(probs, data_sw_passed.label.values, data_sw_passed.N_sig_sw.values, label=0)
In [19]:
hist(probs[data_sw_passed.label.values == 1][:, 1], bins=60, normed=True, alpha=0.4)
hist(probs[data_sw_passed.label.values == 0][:, 1], bins=60, normed=True, alpha=0.4)
pass
In [20]:
mask = ((flat_ss(probs[:, 1]) < 0.6) & (data_sw_passed.label == 0)) | \
((flat_os(probs[:, 1]) > 0.2) & (data_sw_passed.label == 1))
data_sw_passed_selected = data_sw_passed[mask]
data_sw_passed_not_selected = data_sw_passed[~mask]
In [21]:
get_events_statistics(data_sw_passed_selected)
Out[21]:
In [22]:
data_sw_passed_selected_lds = LabeledDataStorage(data_sw_passed_selected,
data_sw_passed_selected.label,
data_sw_passed_selected.N_sig_sw)
In [23]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=9000, depth=6, pretransform_needed=True,
max_features=8, loss=LogLossFunction(regularization=100))
tt_folding_rf = FoldingClassifier(tt_base, n_folds=2, random_state=11, ipc_profile='ssh-ipy',
features=features)
%time tt_folding_rf.fit_lds(data_sw_passed_selected_lds)
pass
In [24]:
report = ClassificationReport({'tt': tt_folding_rf}, data_sw_passed_selected_lds)
In [25]:
report.learning_curve(RocAuc())
Out[25]:
In [26]:
report.compute_metric(RocAuc())
Out[26]:
In [27]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=1500, depth=6, pretransform_needed=True,
max_features=8, loss=LogLossFunction(regularization=100))
tt_folding = FoldingClassifier(tt_base, n_folds=2, random_state=11, ipc_profile='ssh-ipy',
features=features)
%time tt_folding.fit_lds(data_sw_passed_lds)
pass
In [28]:
report = ClassificationReport({'tt': tt_folding}, data_sw_passed_lds)
In [29]:
report.learning_curve(RocAuc())
Out[29]:
In [30]:
report.compute_metric(RocAuc())
Out[30]:
In [31]:
report.roc()
Out[31]:
In [34]:
models = []
In [35]:
from utils import get_result_with_bootstrap_for_given_part
In [36]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
[data_sw_passed, data_sw_not_passed],
logistic=True, name="tt-log",
sign_part_column='signVtx', part_name='vertex'))
In [37]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding,
[data_sw_passed, data_sw_not_passed],
logistic=False, name="tt-iso",
sign_part_column='signVtx', part_name='vertex'))
In [ ]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency, tagging_efficiency_delta, tt_folding_rf,
[data_sw_passed_selected,
data_sw_passed_not_selected,
data_sw_not_passed],
logistic=True, name="rf-tt-log",
sign_part_column='signVtx', part_name='vertex'))
In [ ]:
models.append(get_result_with_bootstrap_for_given_part(tagging_efficiency,
tagging_efficiency_delta, tt_folding_rf,
[data_sw_passed_selected,
data_sw_passed_not_selected,
data_sw_not_passed],
logistic=False, name="rf-tt-iso",
sign_part_column='signVtx', part_name='vertex'))
In [41]:
pandas.concat(models)
Out[41]:
In [42]:
from utils import prepare_B_data_for_given_part
In [43]:
Bdata_prepared = prepare_B_data_for_given_part(tt_folding, [data_sw_passed, data_sw_not_passed], logistic=False,
sign_part_column='signVtx', part_name='vertex')
In [44]:
Bdata_prepared.to_csv('models/Bdata_vertex.csv', header=True, index=False)