Using a classifier, we learn a reweighting of the data/simulation differences (removing tracks from simulation that do not appear in data, and removing from data the tracks we cannot simulate):
the classifier predicts $p(MC)$ and $p(RD)$;
for simulation, when $p(MC)>0.5$,
$$w_{MC}=\frac{p(RD)}{p(MC)},$$ otherwise $$w_{MC}=1.$$
In [260]:
%pylab inline
figsize(8, 6)
In [261]:
import sys
sys.path.insert(0, "../")
In [262]:
import pandas
import numpy
from folding_group import FoldingGroupClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
from sklearn.metrics import roc_curve, roc_auc_score
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
In [4]:
import root_numpy
MC = pandas.DataFrame(root_numpy.root2array('../datasets/MC/csv/WG/Bu_JPsiK/2012/Tracks.root', stop=5000000))
data = pandas.DataFrame(root_numpy.root2array('../datasets/data/csv/WG/Bu_JPsiK/2012/Tracks.root', stop=5000000))
In [5]:
data.head()
Out[5]:
In [6]:
MC.head()
Out[6]:
In [7]:
from utils import data_tracks_preprocessing
data = data_tracks_preprocessing(data, N_sig_sw=True)
MC = data_tracks_preprocessing(MC)
In [8]:
', '.join(data.columns)
Out[8]:
In [9]:
print sum(data.signB == 1), sum(data.signB == -1)
print sum(MC.signB == 1), sum(MC.signB == -1)
In [10]:
# Mask selecting tracks whose signal sWeight exceeds 1; used to exclude
# low/negative-sWeight entries from classifier training.
# NOTE(review): the threshold is 1, not 0 — confirm this is intentional.
mask_sw_positive = (data.N_sig_sw.values > 1) * 1
In [11]:
data.head()
Out[11]:
In [12]:
# Encode event ids as contiguous integers 0..n_events-1 so that grouping by
# event (folding, numpy.bincount aggregation) works directly on the labels.
data['group_column'] = numpy.unique(data.event_id, return_inverse=True)[1]
MC['group_column'] = numpy.unique(MC.event_id, return_inverse=True)[1]
# Reset to plain positional indices after preprocessing.
data.index = numpy.arange(len(data))
MC.index = numpy.arange(len(MC))
In [13]:
# features = ['cos_diff_phi', 'diff_pt', 'partPt', 'partP', 'nnkrec', 'diff_eta', 'EOverP',
# 'ptB', 'sum_PID_mu_k', 'proj', 'PIDNNe', 'sum_PID_k_e', 'PIDNNk', 'sum_PID_mu_e', 'PIDNNm',
# 'phi', 'IP', 'IPerr', 'IPs', 'veloch', 'max_PID_k_e', 'ghostProb',
# 'IPPU', 'eta', 'max_PID_mu_e', 'max_PID_mu_k', 'partlcs']
In [14]:
features = ['cos_diff_phi', 'partPt', 'partP', 'nnkrec', 'diff_eta', 'EOverP',
'ptB', 'sum_PID_mu_k', 'proj', 'PIDNNe', 'sum_PID_k_e', 'PIDNNk', 'sum_PID_mu_e', 'PIDNNm',
'phi', 'IP', 'IPerr', 'IPs', 'veloch', 'max_PID_k_e', 'ghostProb',
'IPPU', 'eta', 'max_PID_mu_e', 'max_PID_mu_k', 'partlcs']
In [53]:
# Positional index of the first track of each event — one representative
# row per B candidate.
b_ids_data = numpy.unique(data.group_column.values, return_index=True)[1]
b_ids_MC = numpy.unique(MC.group_column.values, return_index=True)[1]
In [54]:
# One row per B candidate (first track of each event).
Bdata = data.iloc[b_ids_data].copy()
BMC = MC.iloc[b_ids_MC].copy()
# Reconstruct B kinematics from track-relative quantities.
# NOTE(review): assumes diff_eta/diff_phi are (B − track) differences, so
# adding the track eta/phi back yields the B eta/phi — confirm in preprocessing.
Bdata['Beta'] = Bdata.diff_eta + Bdata.eta
BMC['Beta'] = BMC.diff_eta + BMC.eta
Bdata['Bphi'] = Bdata.diff_phi + Bdata.phi
BMC['Bphi'] = BMC.diff_phi + BMC.phi
In [56]:
Bfeatures = ['Beta', 'Bphi', 'ptB']
In [57]:
hist(Bdata['ptB'].values, normed=True, alpha=0.5, bins=60,
weights=Bdata['N_sig_sw'].values)
hist(BMC['ptB'].values, normed=True, alpha=0.5, bins=60);
In [58]:
hist(Bdata['Beta'].values, normed=True, alpha=0.5, bins=60,
weights=Bdata['N_sig_sw'].values)
hist(BMC['Beta'].values, normed=True, alpha=0.5, bins=60);
In [59]:
hist(Bdata['Bphi'].values, normed=True, alpha=0.5, bins=60,
weights=Bdata['N_sig_sw'].values)
hist(BMC['Bphi'].values, normed=True, alpha=0.5, bins=60);
In [60]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=1000,
n_threads=16)
In [61]:
data_vs_MC_B = pandas.concat([Bdata, BMC])
label_data_vs_MC_B = [0] * len(Bdata) + [1] * len(BMC)
weights_data_vs_MC_B = numpy.concatenate([Bdata.N_sig_sw.values * (Bdata.N_sig_sw.values > 1) * 1,
numpy.ones(len(BMC))])
weights_data_vs_MC_B_all = numpy.concatenate([Bdata.N_sig_sw.values, numpy.ones(len(BMC))])
In [62]:
tt_B = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=321,
train_features=Bfeatures, group_feature='group_column')
%time tt_B.fit(data_vs_MC_B, label_data_vs_MC_B, sample_weight=weights_data_vs_MC_B)
pass
In [63]:
roc_auc_score(label_data_vs_MC_B, tt_B.predict_proba(data_vs_MC_B)[:, 1], sample_weight=weights_data_vs_MC_B)
Out[63]:
In [64]:
roc_auc_score(label_data_vs_MC_B, tt_B.predict_proba(data_vs_MC_B)[:, 1], sample_weight=weights_data_vs_MC_B_all)
Out[64]:
In [65]:
from hep_ml.reweight import GBReweighter, FoldingReweighter
In [66]:
reweighterB = FoldingReweighter(GBReweighter(), random_state=3444)
reweighterB.fit(BMC[Bfeatures], Bdata[Bfeatures], target_weight=Bdata.N_sig_sw)
Out[66]:
In [67]:
BMC_weights = reweighterB.predict_weights(BMC[Bfeatures])
In [68]:
hist(Bdata['ptB'].values, normed=True, alpha=0.5, bins=60,
weights=Bdata['N_sig_sw'].values)
hist(BMC['ptB'].values, normed=True, alpha=0.5, bins=60, weights=BMC_weights);
In [69]:
weights_data_vs_MC_B_w = numpy.concatenate([Bdata.N_sig_sw.values * (Bdata.N_sig_sw.values > 1) * 1,
BMC_weights])
weights_data_vs_MC_B_all_w = numpy.concatenate([Bdata.N_sig_sw.values, BMC_weights])
In [70]:
tt_B = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=321,
train_features=Bfeatures, group_feature='group_column')
%time tt_B.fit(data_vs_MC_B, label_data_vs_MC_B, sample_weight=weights_data_vs_MC_B_w)
roc_auc_score(label_data_vs_MC_B, tt_B.predict_proba(data_vs_MC_B)[:, 1], sample_weight=weights_data_vs_MC_B_all_w)
Out[70]:
In [71]:
# Broadcast the per-B GBReweighter weight to every track of the corresponding
# MC event, stored under the same column name as the data sWeights.
MC['N_sig_sw'] = BMC_weights[numpy.unique(MC.group_column.values, return_inverse=True)[1]]
In [73]:
def compute_target_number_of_tracks(X):
    """Return, for every track in X, the number of tracks in its event.

    :param X: pandas.DataFrame with a 'group_column' column identifying the
        event each track belongs to (labels need not be contiguous integers).
    :return: numpy array of length len(X) with the event multiplicity per track.
    """
    # Map event labels to contiguous ids 0..n_events-1, count tracks per
    # event, then broadcast the counts back to the tracks.
    # (Previously bincount was taken over the raw labels but indexed with the
    # inverse ids, which is only correct for labels already contiguous from 0.)
    ids = numpy.unique(X.group_column, return_inverse=True)[1]
    number_of_tracks = numpy.bincount(ids)
    target = number_of_tracks[ids]
    return target
In [74]:
from decisiontrain import DecisionTrainRegressor
from rep.estimators import SklearnRegressor
from rep.metaml import FoldingRegressor
In [75]:
tt_base_reg = DecisionTrainRegressor(learning_rate=0.02, n_estimators=1000,
n_threads=16)
In [76]:
%%time
tt_data_NT = FoldingRegressor(SklearnRegressor(tt_base_reg), n_folds=2, random_state=321,
features=features)
tt_data_NT.fit(data, compute_target_number_of_tracks(data), sample_weight=data.N_sig_sw.values * mask_sw_positive)
In [77]:
from sklearn.metrics import mean_squared_error
mean_squared_error(compute_target_number_of_tracks(data), tt_data_NT.predict(data),
sample_weight=data.N_sig_sw.values) ** 0.5
Out[77]:
In [83]:
mean_squared_error(compute_target_number_of_tracks(data),
[numpy.mean(compute_target_number_of_tracks(data))] * len(data),
sample_weight=data.N_sig_sw.values) ** 0.5
Out[83]:
In [92]:
%%time
tt_MC_NT = FoldingRegressor(SklearnRegressor(tt_base_reg), n_folds=2, random_state=321,
features=features)
tt_MC_NT.fit(MC, compute_target_number_of_tracks(MC), sample_weight=MC.N_sig_sw.values)
In [93]:
mean_squared_error(compute_target_number_of_tracks(MC),
tt_MC_NT.predict(MC), sample_weight=MC.N_sig_sw.values) ** 0.5
Out[93]:
In [94]:
mean_squared_error(compute_target_number_of_tracks(MC),
[numpy.mean(compute_target_number_of_tracks(MC))] * len(MC),
sample_weight=MC.N_sig_sw.values) ** 0.5
Out[94]:
In [95]:
tt_MC_NT.get_feature_importances().sort_values(by='effect')[-5:]
Out[95]:
In [96]:
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=1000,
n_threads=16)
In [97]:
B_signs = data['signB'].groupby(data['group_column']).aggregate(numpy.mean)
B_weights = data['N_sig_sw'].groupby(data['group_column']).aggregate(numpy.mean)
In [98]:
B_signs_MC = MC['signB'].groupby(MC['group_column']).aggregate(numpy.mean)
B_weights_MC = MC['N_sig_sw'].groupby(MC['group_column']).aggregate(numpy.mean)
In [99]:
from scipy.special import logit, expit
def compute_Bprobs(X, track_proba, weights=None, normed_weights=False):
    """Aggregate per-track tagging probabilities into per-event B probabilities.

    Per-track logits, signed by the track charge, are summed within each event
    and passed through the logistic function.

    :param X: pandas.DataFrame with 'group_column' (event id) and 'signTrack'.
    :param track_proba: array of per-track probabilities; non-finite entries
        are treated as uninformative (0.5). The caller's array is not modified.
    :param weights: optional per-track weights (default: ones); not modified.
    :param normed_weights: if True, weights are renormalized to sum to 1
        within each event before aggregation.
    :return: numpy array with one probability per unique event id.
    """
    if weights is None:
        weights = numpy.ones(len(X))
    else:
        # work on a copy: the in-place renormalization below must not leak out
        weights = numpy.array(weights, dtype=float)
    # copy before patching NaN/inf so the caller's array stays intact
    track_proba = numpy.array(track_proba, dtype=float)
    _, data_ids = numpy.unique(X['group_column'], return_inverse=True)
    # logit(0.5) == 0, so uninformative tracks contribute nothing to the sum;
    # NaN is non-finite, so a separate isnan pass is unnecessary
    track_proba[~numpy.isfinite(track_proba)] = 0.5
    if normed_weights:
        weights_per_events = numpy.bincount(data_ids, weights=weights)
        weights /= weights_per_events[data_ids]
    predictions = numpy.bincount(data_ids, weights=logit(track_proba) * X['signTrack'] * weights)
    return expit(predictions)
In [100]:
tt_data = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=321,
train_features=features, group_feature='group_column')
%time tt_data.fit(data, data.label, sample_weight=data.N_sig_sw.values * mask_sw_positive)
pass
In [101]:
pandas.DataFrame({'dataset': ['MC', 'data'],
'quality': [roc_auc_score(
B_signs_MC, compute_Bprobs(MC, tt_data.predict_proba(MC)[:, 1]), sample_weight=B_weights_MC),
roc_auc_score(
B_signs, compute_Bprobs(data, tt_data.predict_proba(data)[:, 1]), sample_weight=B_weights)]})
Out[101]:
In [102]:
tt_MC = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=321,
train_features=features, group_feature='group_column')
%time tt_MC.fit(MC, MC.label)
pass
In [103]:
pandas.DataFrame({'dataset': ['MC', 'data'],
'quality': [roc_auc_score(
B_signs_MC, compute_Bprobs(MC, tt_MC.predict_proba(MC)[:, 1]), sample_weight=B_weights_MC),
roc_auc_score(
B_signs, compute_Bprobs(data, tt_MC.predict_proba(data)[:, 1]), sample_weight=B_weights)]})
Out[103]:
combine data and MC together to train a classifier
In [236]:
combined_data_MC = pandas.concat([data, MC])
# class labels: 0 = real data, 1 = simulation
combined_label = numpy.array([0] * len(data) + [1] * len(MC))
combined_weights_data = data.N_sig_sw.values #/ numpy.bincount(data.group_column)[data.group_column.values]
# training weights: exclude data tracks failing the sWeight cut
combined_weights_data_passed = combined_weights_data * mask_sw_positive
combined_weights_MC = MC.N_sig_sw.values# / numpy.bincount(MC.group_column)[MC.group_column.values]
# rescale MC so both samples carry the same total weight in training/evaluation
combined_weights = numpy.concatenate([combined_weights_data_passed,
1. * combined_weights_MC / sum(combined_weights_MC) * sum(combined_weights_data_passed)])
combined_weights_all = numpy.concatenate([combined_weights_data,
1. * combined_weights_MC / sum(combined_weights_MC) * sum(combined_weights_data)])
In [276]:
%%time
tt_base_large = DecisionTrainClassifier(learning_rate=0.3, n_estimators=1000,
n_threads=20)
tt_data_vs_MC = FoldingGroupClassifier(SklearnClassifier(tt_base_large), n_folds=2, random_state=321,
train_features=features + ['label'], group_feature='group_column')
tt_data_vs_MC.fit(combined_data_MC, combined_label, sample_weight=combined_weights)
In [277]:
a = []
for n, p in enumerate(tt_data_vs_MC.staged_predict_proba(combined_data_MC)):
a.append(roc_auc_score(combined_label, p[:, 1], sample_weight=combined_weights))
In [278]:
plot(a)
Out[278]:
quality
In [279]:
combined_p = tt_data_vs_MC.predict_proba(combined_data_MC)[:, 1]
roc_auc_score(combined_label, combined_p, sample_weight=combined_weights)
Out[279]:
In [280]:
roc_auc_score(combined_label, combined_p, sample_weight=combined_weights_all)
Out[280]:
calibrate the probabilities (the reweighting rule uses the raw probability values, so they need to be well calibrated)
In [281]:
from utils import calibrate_probs, plot_calibration
combined_p_calib = calibrate_probs(combined_label, combined_weights, combined_p)[0]
plot_calibration(combined_p, combined_label, weight=combined_weights)
plot_calibration(combined_p_calib, combined_label, weight=combined_weights)
In [282]:
# Symmetric reweighting rule (see the header formula): a track more typical of
# its own sample than of the other is down-weighted by the odds ratio,
# otherwise its weight stays 1.
used_probs = combined_p_calib
# calibrated p(track is MC) for each sample separately
data_probs_to_be_MC = used_probs[combined_label == 0]
MC_probs_to_be_MC = used_probs[combined_label == 1]
track_weights_data = numpy.ones(len(data))
# data tracks predicted as data (p(MC) < 0.5): w = p(MC) / p(RD)
mask_data = data_probs_to_be_MC < 0.5
track_weights_data[mask_data] = (data_probs_to_be_MC[mask_data]) / (1 - data_probs_to_be_MC[mask_data])
# MC tracks predicted as MC (p(MC) > 0.5): w = p(RD) / p(MC)
track_weights_MC = numpy.ones(len(MC))
mask_MC = MC_probs_to_be_MC > 0.5
track_weights_MC[mask_MC] = (1 - MC_probs_to_be_MC[mask_MC]) / (MC_probs_to_be_MC[mask_MC])
# simpler one-sided alternative: reweight every MC track by the odds ratio
track_weights_only_MC = (1 - MC_probs_to_be_MC) / MC_probs_to_be_MC
In [283]:
# data_ids = numpy.unique(data['group_column'], return_inverse=True)[1]
# MC_ids = numpy.unique(MC['group_column'], return_inverse=True)[1]
# # event_weight_data = (numpy.bincount(data_ids, weights=data.N_sig_sw) / numpy.bincount(data_ids))[data_ids]
# # event_weight_MC = (numpy.bincount(MC_ids, weights=MC.N_sig_sw) / numpy.bincount(MC_ids))[MC_ids]
# # normalize weights for tracks in a way that sum w_track = 1 per event
# track_weights_data /= numpy.bincount(data_ids, weights=track_weights_data)[data_ids]
# track_weights_MC /= numpy.bincount(MC_ids, weights=track_weights_MC)[MC_ids]
In [284]:
hist(combined_p_calib[combined_label == 1], label='MC', normed=True, alpha=0.4, bins=60,
weights=combined_weights_MC)
hist(combined_p_calib[combined_label == 0], label='data', normed=True, alpha=0.4, bins=60,
weights=combined_weights_data);
legend(loc='best')
Out[284]:
In [285]:
hist(track_weights_MC, normed=True, alpha=0.4, bins=60, label='MC')
hist(track_weights_data, normed=True, alpha=0.4, bins=60, label='RD');
legend(loc='best')
Out[285]:
In [286]:
numpy.mean(track_weights_data), numpy.mean(track_weights_MC)
Out[286]:
In [287]:
hist(combined_p_calib[combined_label == 1], label='MC', normed=True, alpha=0.4, bins=60,
weights=track_weights_MC * MC.N_sig_sw.values)
hist(combined_p_calib[combined_label == 0], label='data', normed=True, alpha=0.4, bins=60,
weights=track_weights_data * data.N_sig_sw.values);
legend(loc='best')
Out[287]:
In [288]:
roc_auc_score(combined_label, combined_p_calib,
sample_weight=numpy.concatenate([track_weights_data * data.N_sig_sw.values,
track_weights_MC * MC.N_sig_sw.values]))
Out[288]:
train a classifier to distinguish data from MC with the derived weights applied (sanity check: the reweighting should make the samples harder to separate)
In [289]:
%%time
tt_check = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=433,
train_features=features + ['label'], group_feature='group_column')
tt_check.fit(combined_data_MC, combined_label,
sample_weight=numpy.concatenate([track_weights_data * data.N_sig_sw.values * mask_sw_positive,
track_weights_MC * MC.N_sig_sw.values]))
In [290]:
roc_auc_score(combined_label, tt_check.predict_proba(combined_data_MC)[:, 1],
sample_weight=numpy.concatenate([track_weights_data * data.N_sig_sw.values * mask_sw_positive,
track_weights_MC * MC.N_sig_sw.values]))
# * sum(track_weights_data * mask_sw_positive) / sum(track_weights_MC)
Out[290]:
In [291]:
roc_auc_score(combined_label, tt_check.predict_proba(combined_data_MC)[:, 1],
sample_weight=numpy.concatenate([track_weights_data * data.N_sig_sw.values,
track_weights_MC * MC.N_sig_sw.values]))
# * sum(track_weights_data) / sum(track_weights_MC)
Out[291]:
In [292]:
tt_reweighted_MC = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=321,
train_features=features, group_feature='group_column')
%time tt_reweighted_MC.fit(MC, MC.label, sample_weight=track_weights_MC * MC.N_sig_sw.values)
pass
In [293]:
pandas.DataFrame({'dataset': ['MC', 'data'],
'quality': [roc_auc_score(
B_signs_MC,
compute_Bprobs(MC, tt_reweighted_MC.predict_proba(MC)[:, 1],
weights=track_weights_MC, normed_weights=False),
sample_weight=B_weights_MC),
roc_auc_score(
B_signs,
compute_Bprobs(data, tt_reweighted_MC.predict_proba(data)[:, 1],
weights=track_weights_data, normed_weights=False),
sample_weight=B_weights)]})
Out[293]:
In [254]:
pandas.DataFrame({'dataset': ['MC', 'data'],
'quality': [roc_auc_score(
B_signs_MC,
compute_Bprobs(MC, tt_reweighted_MC.predict_proba(MC)[:, 1],
weights=track_weights_MC, normed_weights=False),
sample_weight=B_weights_MC),
roc_auc_score(
B_signs,
compute_Bprobs(data, tt_reweighted_MC.predict_proba(data)[:, 1],
weights=track_weights_data, normed_weights=False),
sample_weight=B_weights)]})
Out[254]:
In [294]:
%%time
tt_reweighted_data = FoldingGroupClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=321,
train_features=features, group_feature='group_column')
tt_reweighted_data.fit(data, data.label,
sample_weight=track_weights_data * data.N_sig_sw.values * mask_sw_positive)
pass
In [295]:
pandas.DataFrame({'dataset': ['MC', 'data'],
'quality': [roc_auc_score(
B_signs_MC,
compute_Bprobs(MC, tt_reweighted_data.predict_proba(MC)[:, 1],
weights=track_weights_MC, normed_weights=False),
sample_weight=B_weights_MC),
roc_auc_score(
B_signs,
compute_Bprobs(data, tt_reweighted_data.predict_proba(data)[:, 1],
weights=track_weights_data, normed_weights=False),
sample_weight=B_weights)]})
Out[295]:
In [256]:
pandas.DataFrame({'dataset': ['MC', 'data'],
'quality': [roc_auc_score(
B_signs_MC,
compute_Bprobs(MC, tt_reweighted_data.predict_proba(MC)[:, 1],
weights=track_weights_MC, normed_weights=False),
sample_weight=B_weights_MC),
roc_auc_score(
B_signs,
compute_Bprobs(data, tt_reweighted_data.predict_proba(data)[:, 1],
weights=track_weights_data, normed_weights=False),
sample_weight=B_weights)]})
Out[256]:
In [58]:
numpy.mean(mc_sum_weights_per_event), numpy.mean(data_sum_weights_per_event)
Out[58]:
In [74]:
_, data_ids = numpy.unique(data['group_column'], return_inverse=True)
mc_sum_weights_per_event = numpy.bincount(MC.group_column.values, weights=track_weights_MC)
data_sum_weights_per_event = numpy.bincount(data_ids, weights=track_weights_data)
In [77]:
hist(mc_sum_weights_per_event, bins=60, normed=True, alpha=0.5)
hist(data_sum_weights_per_event, bins=60, normed=True, alpha=0.5, weights=B_weights);
In [268]:
hist(mc_sum_weights_per_event, bins=60, normed=True, alpha=0.5)
hist(data_sum_weights_per_event, bins=60, normed=True, alpha=0.5, weights=B_weights);
In [55]:
hist(numpy.bincount(MC.group_column), bins=81, normed=True, alpha=0.5, range=(0, 80))
hist(numpy.bincount(data.group_column), bins=81, normed=True, alpha=0.5, range=(0, 80));
In [261]:
hist(expit(p_tt_mc) - expit(p_data), bins=60, weights=B_weights, normed=True, label='standard approach',
alpha=0.5);
hist(expit(p_data_w_MC) - expit(p_data_w), bins=60, weights=B_weights, normed=True, label='compensate method',
alpha=0.5);
legend()
xlabel('$p_{MC}-p_{data}$')
Out[261]:
In [200]:
from utils import compute_mistag
In [87]:
bins_perc = [10, 20, 30, 40, 50, 60, 70, 80, 90]
compute_mistag(expit(p_data), B_signs, B_weights, chosen=numpy.ones(len(B_signs), dtype=bool),
bins=bins_perc,
uniform=False, label='data')
compute_mistag(expit(p_tt_mc), B_signs, B_weights, chosen=numpy.ones(len(B_signs), dtype=bool),
bins=bins_perc,
uniform=False, label='MC')
compute_mistag(expit(p_data_w), B_signs, B_weights, chosen=numpy.ones(len(B_signs), dtype=bool),
bins=bins_perc,
uniform=False, label='new')
legend(loc='best')
xlim(0.3, 0.5)
ylim(0.2, 0.5)
Out[87]:
In [82]:
bins_edg = numpy.linspace(0.3, 0.9, 10)
compute_mistag(expit(p_data), B_signs, B_weights, chosen=numpy.ones(len(B_signs), dtype=bool),
bins=bins_edg,
uniform=True, label='data')
compute_mistag(expit(p_tt_mc), B_signs, B_weights, chosen=numpy.ones(len(B_signs), dtype=bool),
bins=bins_edg,
uniform=True, label='MC')
compute_mistag(expit(p_data_w), B_signs, B_weights, chosen=numpy.ones(len(B_signs), dtype=bool),
bins=bins_edg,
uniform=True, label='new')
legend(loc='best')
In [ ]: