In [1]:
%pylab inline
In [2]:
import pandas
import numpy
from sklearn.metrics import roc_curve, roc_auc_score
In [3]:
import root_numpy
data_full = pandas.DataFrame(root_numpy.root2array('datasets/tracks.root', 'tracks',
branches=['run', 'event', 'N_sig_sw', 'signB', 'signTrack', 'IPs',
'diff_eta', 'diff_phi', 'partPt']))
In [4]:
data_full.head()
Out[4]:
In [5]:
event_id_column = 'event_id'
event_id = data_full.run.apply(str) + '_' + data_full.event.apply(str)
data_full[event_id_column] = event_id
In [6]:
data_full = data_full.dropna()
len(data_full)
Out[6]:
In [7]:
import utils
In [8]:
def compute_sum_of_charges(data, name, bins=60, use_pt=False):
result_event_id, event_positions, data_ids = numpy.unique(data[event_id_column].values, return_index=True, return_inverse=True)
used_weights = data.signTrack.values
if use_pt:
used_weights *= data.partPt.values
used_weights /= sum(data.partPt.values)
result_probs = -numpy.bincount(data_ids, weights=used_weights)
result_label = numpy.bincount(data_ids, weights=data.signB.values) / numpy.bincount(data_ids)
result_weight = numpy.bincount(data_ids, weights=data.N_sig_sw.values) / numpy.bincount(data_ids)
min_max = 10
result = {}
result['ROC $-\sum_i charge_i$'] = [roc_auc_score(result_label, result_probs, sample_weight=result_weight)]
figure(figsize=(16, 7))
subplot(1, 2, 1)
fpr, tpr, _ = roc_curve(result_label, result_probs, sample_weight=result_weight)
plot(fpr, tpr)
plot([0, 1], [0, 1], 'k--')
grid(True), xlim(0, 1), ylim(0, 1), title('ROC $-\sum_i charge_i$')
subplot(1, 2, 2)
hist(result_probs * (result_label == 1), bins=bins, weights=result_weight * (result_label == 1),
range=(-min_max, min_max), alpha=0.2, normed=True, label='$B^+$')
hist(result_probs * (result_label == -1), bins=bins, weights=result_weight * (result_label == -1),
range=(-min_max, min_max), alpha=0.2, normed=True, label='$B^-$')
legend(), title(name + ', $-\sum_i charge_i$'), xlim(-min_max, min_max)
plt.savefig('img/assymetry_tracks_{}.png'.format(name), format='png')
show()
figure(figsize=(16, 7))
subplot(1, 2, 1)
fpr, tpr, _ = roc_curve(result_label, result_probs - result_label, sample_weight=result_weight)
plot(fpr, tpr)
plot([0, 1], [0, 1], 'k--')
grid(True), xlim(0, 1), ylim(0, 1), title('ROC $-\sum_i charge_i$ - signal track sign')
subplot(1, 2, 2)
hist((result_probs - result_label) * (result_label == 1), bins=bins, weights=result_weight * (result_label == 1),
range=(-min_max, min_max), alpha=0.2, normed=True, label='$B^+$')
hist((result_probs - result_label) * (result_label == -1), bins=bins, weights=result_weight * (result_label == -1),
range=(-min_max, min_max), alpha=0.2, normed=True, label='$B^-$')
legend(), title(name + ', $-\sum_i charge_i$ - signal track sign'), xlim(-min_max, min_max)
plt.savefig('img/assymetry_tracks_with_sig_part_{}.png'.format(name), format='png')
show()
result = {}
for mask, bname in zip([result_label == 1, result_label == -1], ['$B^+$', '$B^-$']):
result[bname] = [numpy.sum(result_probs * result_weight * mask) / sum(result_weight * mask)]
result[bname + ', with signal part'] = [numpy.sum((result_probs - result_label) * \
result_weight * mask) / sum(result_weight * mask)]
result['ROC AUC'] = roc_auc_score(result_label, result_probs, sample_weight=result_weight)
result['ROC AUC, with signal part'] = 1 - roc_auc_score(result_label, result_probs - result_label,
sample_weight=result_weight)
result['name'] = [name]
return pandas.DataFrame(result)
In [9]:
means = [compute_sum_of_charges(data_full[mask], name, bins) for mask, name, bins in \
zip([data_full.signB > -100,
(data_full.IPs > 3) & ((abs(data_full.diff_eta) > 0.6) | (abs(data_full.diff_phi) > 0.825)),
(abs(data_full.diff_eta) < 0.6) & (abs(data_full.diff_phi) < 0.825) & (data_full.IPs < 3)],
['full', 'OS', 'SS'], [21, 21, 21])]
In [10]:
means = pandas.concat(means)
In [11]:
means
Out[11]:
In [12]:
means.to_csv('img/track_signs_assymetry_means.csv', index=False, header=True)
In [13]:
means_pt = [compute_sum_of_charges(data_full[mask], name, bins, use_pt=True) for mask, name, bins in \
zip([data_full.signB > -100,
(data_full.IPs > 3) * ((abs(data_full.diff_eta) > 0.6) | (abs(data_full.diff_phi) > 0.825)),
(abs(data_full.diff_eta) < 0.6) & (abs(data_full.diff_phi) < 0.825) * (data_full.IPs < 3)],
['full_pt', 'OS_pt', 'SS_pt'], [21, 21, 21])]
In [14]:
means_pt = pandas.concat(means_pt)
In [15]:
means_pt
Out[15]:
In [18]:
data_mc = pandas.read_csv('datasets/MCTracks.csv', sep='\t')
print len(data_mc)
event_id = data_mc.run.apply(str) + '_' + data_mc.event.apply(str)
data_mc[event_id_column] = event_id
data_mc['N_sig_sw'] = 1.
Out[18]:
In [20]:
means_mc = [compute_sum_of_charges(data_mc[mask], name, bins) for mask, name, bins in \
zip([data_mc.signB > -100,
(data_mc.IPs > 3) & ((abs(data_mc.diff_eta) > 0.6) | (abs(data_mc.diff_phi) > 0.825)),
(abs(data_mc.diff_eta) < 0.6) & (abs(data_mc.diff_phi) < 0.825) & (data_mc.IPs < 3)],
['full_mc', 'OS_mc', 'SS_mc'], [21, 21, 21])]
In [21]:
means_mc = pandas.concat(means_mc)
In [22]:
means_mc
Out[22]:
In [24]:
means_mc.to_csv('img/track_signs_assymetry_means_mc.csv', index=False, header=True)
In [25]:
means_mc_pt = [compute_sum_of_charges(data_mc[mask], name, bins, use_pt=True) for mask, name, bins in \
zip([data_mc.signB > -100,
(data_mc.IPs > 3) & ((abs(data_mc.diff_eta) > 0.6) | (abs(data_mc.diff_phi) > 0.825)),
(abs(data_mc.diff_eta) < 0.6) & (abs(data_mc.diff_phi) < 0.825) & (data_mc.IPs < 3)],
['full_mc_pt', 'OS_mc_pt', 'SS_mc_pt'], [21, 21, 21])]
In [26]:
pandas.concat(means_mc_pt)
Out[26]:
In [15]:
result_event_id, event_positions, data_ids = numpy.unique(data_full[event_id_column].values, return_index=True, return_inverse=True)
In [16]:
result_probs = -numpy.bincount(data_ids, weights=data_full.signTrack.values * data_full.partPt.values / sum(data_full.partPt.values))
In [23]:
result_label = numpy.bincount(data_ids, weights=data_full.signB.values) / numpy.bincount(data_ids)
result_weight = numpy.bincount(data_ids, weights=data_full.N_sig_sw.values) / numpy.bincount(data_ids)
In [28]:
hist(result_probs[result_label == 1], bins=60, normed=True)
hist(result_probs[result_label == -1], bins=60, normed=True)
pass
In [35]:
data_ids # event indices
track_ids = numpy.arange(len(data_ids))
selected_ids = numpy.zeros(numpy.max(data_ids) + 1)
selected_ids[data_ids] = track_ids # assigns last track in each event
permutation = numpy.random.permutation(track_ids)
selected_ids[data_ids[permutation]] = track_ids[permutation]
In [36]:
len(data_full), max(selected_ids.astype(int))
Out[36]:
In [40]:
signs_random = data_full.signTrack.values[selected_ids.astype(int)]
In [41]:
roc_auc_score(result_label, signs_random)
Out[41]:
In [48]:
figsize(7, 6)
plot(*roc_curve(result_label, signs_random)[:2])
plot([0, 1], [0, 1], 'k--')
grid()
title('Random track choose, \n ROC AUC using track sign %1.4f' %(1 - roc_auc_score(result_label, signs_random)), fontsize=14)
plt.savefig('img/assymetry_tracks_random.png', format='png')