In [1]:
%pylab inline
figsize(8, 6)
In [2]:
import pandas
import numexpr
import numpy
from rep_ef.estimators import MatrixNetSkyGridClassifier
from rep.metaml import FoldingClassifier
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
In [3]:
from utils import get_N_B_events, get_events_number, get_events_statistics
In [4]:
data_track_nan = pandas.read_csv('datasets/Tracks.csv', sep='\t')
In [5]:
data_track_nan.head()
Out[5]:
In [6]:
event_id_column = 'event_id'
data_track_nan[event_id_column] = data_track_nan.run.apply(str) + '_' + data_track_nan.event.apply(str)
In [7]:
get_events_statistics(data_track_nan)
Out[7]:
In [8]:
get_N_B_events()
Out[8]:
In [9]:
data_track = data_track_nan.dropna()
len(data_track_nan), len(data_track), get_events_statistics(data_track)
Out[9]:
In [10]:
# add difference between the max pt in an event and the pt of each track
def add_diff_pt(data):
    max_pt = group_max(data[event_id_column].values.astype(str), data.partPt.values)
    data['diff_pt'] = max_pt - data['partPt'].values

# group_max computes the maximum over tracks in the same event for some data
def group_max(groups, data):
    assert len(groups) == len(data)
    # compute a unique integer id for each group
    _, event_id = numpy.unique(groups, return_inverse=True)
    max_over_event = numpy.zeros(max(event_id) + 1) - numpy.inf
    numpy.maximum.at(max_over_event, event_id, data)
    return max_over_event[event_id]
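A quick sanity check of group_max on made-up values (three toy events, illustrative numbers only): every track should receive the maximum pt of its own event.
# toy example: 5 tracks belonging to 3 events
toy_groups = numpy.array(['1_1', '1_1', '1_2', '2_7', '2_7'])
toy_pt = numpy.array([0.5, 1.5, 2.0, 0.3, 0.9])
print group_max(toy_groups, toy_pt)
# expected output: [ 1.5  1.5  2.   0.9  0.9]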
In [11]:
# add diff pt
add_diff_pt(data_track)
# add cos(diff_phi)
data_track['cos_diff_phi'] = numpy.cos(data_track.diff_phi.values)
In [12]:
from itertools import combinations
PIDs = {'k': data_track.PIDNNk.values,
        'e': data_track.PIDNNe.values,
        'mu': data_track.PIDNNm.values,
        }
for (pid_name1, pid_values1), (pid_name2, pid_values2) in combinations(PIDs.items(), 2):
    data_track['max_PID_{}_{}'.format(pid_name1, pid_name2)] = numpy.maximum(pid_values1, pid_values2)
    data_track['sum_PID_{}_{}'.format(pid_name1, pid_name2)] = pid_values1 + pid_values2
In [13]:
data_track['label'] = (data_track.signB.values * data_track.signTrack.values > 0) * 1
In [14]:
threshold_mistag = 0.6
initial_cut = '(PIDNNp < {tr}) & (PIDNNpi < {tr}) & (ghostProb < 0.4)'.format(tr=threshold_mistag)
data_track = data_track.query(initial_cut)
In [15]:
threshold_kaon = 0.7
threshold_muon = 0.4
threshold_electron = 0.6
cut_pid = """
((PIDNNk > {trk})
 | (PIDNNm > {trm})
 | (PIDNNe > {tre})) """.format(trk=threshold_kaon, trm=threshold_muon,
                                tre=threshold_electron).replace("\n", "")
data_track = data_track.query(cut_pid)
In [16]:
get_events_statistics(data_track)
Out[16]:
In [17]:
import root_numpy
data_vertex = pandas.DataFrame(root_numpy.root2array('datasets/1016_vtx.root'))
data_vertex[event_id_column] = data_vertex.runNum.apply(str) + '_' + (data_vertex.evtNum.apply(int)).apply(str)
# reconstructing sign of B
data_vertex['signB'] = data_vertex.tagAnswer * (2 * data_vertex.iscorrect - 1)
# ensure the sign is +1 or -1
data_vertex['signVtx'] = (data_vertex.signVtx.values > 0) * 2 - 1
data_vertex['label'] = (data_vertex.signVtx.values * data_vertex.signB.values > 0) * 1
Events are counted as sums of sPlot weights, $N = \sum_i sw_i$, where $sw_i$ is the sPlot weight of event $i$. The tagging efficiency and its statistical uncertainty are then

$$\epsilon_{tag} = \frac{N (\text{passed selection})} {N (\text{all events})}$$

$$\Delta\epsilon_{tag} = \frac{\sqrt{N (\text{passed selection})}} {N (\text{all events})}$$
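A minimal numeric illustration of these formulas (the counts below are invented, not taken from this dataset):
# invented weighted counts, for illustration only
N_passed, N_all = 600000., 1000000.
eps_tag = N_passed / N_all                      # 0.6
delta_eps_tag = numpy.sqrt(N_passed) / N_all    # ~0.00077
print eps_tag, delta_eps_tag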
In [18]:
sweight_threshold = 1.
data_track_sw_passed = data_track[data_track.N_sig_sw > sweight_threshold]
data_track_sw_not_passed = data_track[data_track.N_sig_sw <= sweight_threshold]
data_vertex_sw_passed = data_vertex[data_vertex.N_sig_sw > sweight_threshold]
data_vertex_sw_not_passed = data_vertex[data_vertex.N_sig_sw <= sweight_threshold]
get_events_statistics(data_track_sw_passed), get_events_statistics(data_vertex_sw_passed)
Out[18]:
In [19]:
data_track_sw_passed_lds = LabeledDataStorage(data_track_sw_passed, data_track_sw_passed.label,
                                              data_track_sw_passed.N_sig_sw.values)
data_vertex_sw_passed_lds = LabeledDataStorage(data_vertex_sw_passed, data_vertex_sw_passed.label,
                                               data_vertex_sw_passed.N_sig_sw.values)
In [20]:
features_track = list(set(data_track.columns) - {'run', 'event', 'i', 'signB', 'signTrack', 'N_sig_sw',
                                                 'Bmass', 'mult',
                                                 'PIDNNp', 'PIDNNpi', 'label', 'thetaMin',
                                                 'Dist_phi', event_id_column,
                                                 'mu_cut', 'e_cut', 'K_cut'})
features_track
Out[20]:
In [21]:
features_vertex = ['mult', 'nnkrec', 'ptB', 'vflag', 'ipsmean', 'ptmean', 'vcharge',
                   'svm', 'svp', 'BDphiDir', 'svtau', 'docamax']
In [22]:
from rep_ef.estimators import MatrixNetSkyGridClassifier
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction
In [23]:
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
tt_base = DecisionTrainClassifier(learning_rate=0.02, n_estimators=1500, depth=6, pretransform_needed=True,
                                  max_features=15, loss=LogLossFunction(regularization=100))
mn_base = MatrixNetSkyGridClassifier(connection='skygrid', user_name='antares',
                                     iterations=1000, regularization=0.02, sync=False)
In [24]:
from utils import prepare_B_data_for_given_part, calibrate_probs, calculate_auc_with_and_without_untag_events
In [25]:
def tagging(base_estimator_track, base_estimator_vertex,
            track_logistic=False, vertex_logistic=False, random_state=None):
    # track-based tagging: 2-fold scheme gives out-of-fold (unbiased) predictions
    folding_track = FoldingClassifier(base_estimator_track, n_folds=2,
                                      random_state=random_state, ipc_profile='ssh-ipy',
                                      features=features_track)
    folding_track.fit_lds(data_track_sw_passed_lds)
    # compute B data from track data
    Bdata_tracks = prepare_B_data_for_given_part(folding_track,
                                                 [data_track_sw_passed, data_track_sw_not_passed],
                                                 logistic=track_logistic, random_state=random_state)

    # vertex-based tagging
    folding_vertex = FoldingClassifier(base_estimator_vertex, n_folds=2, random_state=random_state,
                                       features=features_vertex)
    folding_vertex.fit_lds(data_vertex_sw_passed_lds)
    Bdata_vertex = prepare_B_data_for_given_part(folding_vertex,
                                                 [data_vertex_sw_passed, data_vertex_sw_not_passed],
                                                 logistic=vertex_logistic,
                                                 sign_part_column='signVtx', part_name='vertex',
                                                 random_state=random_state)

    # combine track and vertex taggers on the event level
    Bdata = pandas.merge(Bdata_tracks, Bdata_vertex, how='outer', on=['event_id', 'Bsign'])
    Bdata['Bweight'] = Bdata['Bweight_x'].copy()
    Bdata.ix[numpy.isnan(Bdata['Bweight'].values), 'Bweight'] = Bdata.ix[numpy.isnan(Bdata['Bweight'].values), 'Bweight_y']
    Bdata = Bdata.drop(['Bweight_x', 'Bweight_y'], axis=1)

    # for NaN put 1 as a non-influencing factor
    Bdata.ix[numpy.isnan(Bdata.track_relation_prob.values), 'track_relation_prob'] = 1.
    Bdata.ix[numpy.isnan(Bdata.vertex_relation_prob.values), 'vertex_relation_prob'] = 1.
    # multiply the per-part relation probabilities, then map the product back to a probability
    relation_prob = Bdata['track_relation_prob'].values * Bdata['vertex_relation_prob'].values
    Bprob = relation_prob / (1 + relation_prob)

    Bweight = Bdata.Bweight.values
    Bsign = Bdata.Bsign.values
    Bprob[~numpy.isfinite(Bprob)] = 0.5
    Bprob_calibrated = calibrate_probs(Bsign, Bweight, Bprob, random_state=random_state)

    figure(figsize=(15, 5))
    subplot(1, 2, 1)
    hist(Bprob[Bsign == 1], bins=60, alpha=0.2, normed=True, label='$B^+$')
    hist(Bprob[Bsign == -1], bins=60, alpha=0.2, normed=True, label='$B^-$')
    legend(), title('B probs')
    subplot(1, 2, 2)
    hist(Bprob_calibrated[Bsign == 1], bins=60, alpha=0.2, normed=True, range=(0, 1), label='$B^+$')
    hist(Bprob_calibrated[Bsign == -1], bins=60, alpha=0.2, normed=True, range=(0, 1), label='$B^-$')
    legend(), title('B probs calibrated')

    auc, auc_full = calculate_auc_with_and_without_untag_events(Bsign, Bprob_calibrated, Bweight)
    print 'AUC for tagged:', auc, 'AUC with untag:', auc_full

    # tagging efficiency and dilution squared
    N_B_passed = Bweight.sum()
    tagging_efficiency = N_B_passed / get_N_B_events()
    tagging_efficiency_delta = numpy.sqrt(N_B_passed) / get_N_B_events()
    D2 = numpy.average((2 * (Bprob_calibrated - 0.5)) ** 2, weights=Bweight)
    return tagging_efficiency, tagging_efficiency_delta, D2, auc_full
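The combination step inside tagging can be seen in isolation: assuming each *_relation_prob is an odds-like ratio $p / (1 - p)$ (consistent with a missing part being replaced by 1, which leaves the product unchanged), multiplying the parts and mapping back through $r / (1 + r)$ gives the combined probability. A sketch with toy values:
# toy per-part probabilities that the tag is correct (values are made up)
p_track, p_vertex = 0.7, 0.6
odds = (p_track / (1 - p_track)) * (p_vertex / (1 - p_vertex))
combined = odds / (1 + odds)
print combined   # ~0.78, more confident than either part alone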
In [26]:
numpy.random.seed(13)
In [27]:
random_states = numpy.random.randint(1, high=10000, size=30)
In [28]:
random_states
Out[28]:
In [50]:
D2_step = []
tagging_efficiency_step = []
tagging_efficiency_delta_step = []
auc_step = []
for random_state in random_states:
    tagging_efficiency, tagging_efficiency_delta, D2, auc = tagging(tt_base, mn_base, track_logistic=True,
                                                                    vertex_logistic=False,
                                                                    random_state=random_state)
    D2_step.append(D2)
    tagging_efficiency_step.append(tagging_efficiency)
    tagging_efficiency_delta_step.append(tagging_efficiency_delta)
    auc_step.append(auc)
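Before building the final table, the spread over the 30 runs can be inspected directly; a small sketch using only what is already in memory:
# mean tagging efficiency with its statistical error, plus seed-to-seed spread
print 'efficiency: %.4f +- %.4f (spread over seeds: %.4f)' % (
    numpy.mean(tagging_efficiency_step), numpy.mean(tagging_efficiency_delta_step),
    numpy.std(tagging_efficiency_step))
print 'D2: %.4f +- %.4f' % (numpy.mean(D2_step), numpy.std(D2_step))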
In [51]:
from utils import result_table
In [52]:
result = result_table(numpy.mean(tagging_efficiency_step), numpy.mean(tagging_efficiency_delta_step),
                      D2_step, auc_step, name='NEW + full systematic')
result
Out[52]:
In [53]:
result = result_table(numpy.mean(tagging_efficiency_step), numpy.mean(tagging_efficiency_delta_step),
                      D2_step, auc_step, name='NEW + full systematic')
result.to_csv('img/new-tagging-systematic.csv', header=True, index=False)