In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

Import


In [2]:
import pandas
import numpy
from sklearn.metrics import roc_curve, roc_auc_score

Reading initial data


In [3]:
import root_numpy
data_full = pandas.DataFrame(root_numpy.root2array('datasets/tracks.root', 'tracks', 
                                                  branches=['run', 'event', 'N_sig_sw', 'signB', 'signTrack', 'IPs',
                                                           'diff_eta', 'diff_phi', 'partPt']))

In [4]:
data_full.head()


Out[4]:
run event N_sig_sw signB signTrack IPs diff_eta diff_phi partPt
0 115839 204997902 0.59521 1 -1 0.816143 0.201112 -0.136253 0.300418
1 115839 204997902 0.59521 1 -1 1.375382 0.684863 -0.072971 1.103876
2 115839 204997902 0.59521 1 -1 4.338812 0.749864 -4.177158 1.182519
3 115839 204997902 0.59521 1 1 2.287509 -0.644510 -0.599056 0.905010
4 115839 204997902 0.59521 1 1 0.562424 0.736161 -3.499841 0.516123

In [5]:
event_id_column = 'event_id'
event_id = data_full.run.apply(str) + '_' + data_full.event.apply(str)
data_full[event_id_column] = event_id

Remove rows with NAN from data


In [6]:
data_full = data_full.dropna()
len(data_full)


Out[6]:
27156190

Tagging track sign independence checking


In [7]:
import utils

assign $p(B+) = -\sum signTrack_i$

assign $p(B+) = -\sum signTrack_i - signB$


In [8]:
def compute_sum_of_charges(data, name, bins=60, use_pt=False):
    result_event_id, event_positions, data_ids = numpy.unique(data[event_id_column].values, return_index=True, return_inverse=True)
    used_weights = data.signTrack.values
    if use_pt:
        used_weights *= data.partPt.values
        used_weights /= sum(data.partPt.values)
    result_probs = -numpy.bincount(data_ids, weights=used_weights)
    result_label = numpy.bincount(data_ids, weights=data.signB.values) / numpy.bincount(data_ids)
    result_weight = numpy.bincount(data_ids, weights=data.N_sig_sw.values) / numpy.bincount(data_ids)
    
    min_max = 10
    result = {} 
    result['ROC $-\sum_i charge_i$'] = [roc_auc_score(result_label, result_probs, sample_weight=result_weight)]

    figure(figsize=(16, 7))
    subplot(1, 2, 1)
    fpr, tpr, _ = roc_curve(result_label, result_probs, sample_weight=result_weight)
    plot(fpr, tpr)
    plot([0, 1], [0, 1], 'k--')
    grid(True), xlim(0, 1), ylim(0, 1), title('ROC $-\sum_i charge_i$')
    subplot(1, 2, 2)
    
    hist(result_probs * (result_label == 1), bins=bins, weights=result_weight * (result_label == 1), 
         range=(-min_max, min_max), alpha=0.2, normed=True, label='$B^+$')
    hist(result_probs * (result_label == -1), bins=bins, weights=result_weight * (result_label == -1), 
         range=(-min_max, min_max), alpha=0.2, normed=True, label='$B^-$')
    legend(), title(name + ', $-\sum_i charge_i$'), xlim(-min_max, min_max)
    
    plt.savefig('img/assymetry_tracks_{}.png'.format(name), format='png')
    
    show()
    
    figure(figsize=(16, 7))
    subplot(1, 2, 1)
    fpr, tpr, _ = roc_curve(result_label, result_probs - result_label, sample_weight=result_weight)
    plot(fpr, tpr)
    plot([0, 1], [0, 1], 'k--')
    grid(True), xlim(0, 1), ylim(0, 1), title('ROC $-\sum_i charge_i$ - signal track sign')
    subplot(1, 2, 2)
    
    hist((result_probs - result_label) * (result_label == 1), bins=bins, weights=result_weight * (result_label == 1), 
         range=(-min_max, min_max), alpha=0.2, normed=True, label='$B^+$')
    hist((result_probs - result_label) * (result_label == -1), bins=bins, weights=result_weight * (result_label == -1), 
         range=(-min_max, min_max), alpha=0.2, normed=True, label='$B^-$')
    legend(), title(name + ', $-\sum_i charge_i$ - signal track sign'), xlim(-min_max, min_max)

    plt.savefig('img/assymetry_tracks_with_sig_part_{}.png'.format(name), format='png')
    
    show()
    
    result = {}
    for mask, bname in zip([result_label == 1, result_label == -1], ['$B^+$', '$B^-$']):
        result[bname] = [numpy.sum(result_probs * result_weight * mask) / sum(result_weight * mask)]
        result[bname + ', with signal part'] = [numpy.sum((result_probs - result_label) * \
                                                         result_weight * mask) / sum(result_weight * mask)]
    result['ROC AUC'] = roc_auc_score(result_label, result_probs, sample_weight=result_weight)
    result['ROC AUC, with signal part'] = 1 - roc_auc_score(result_label, result_probs - result_label, 
                                                            sample_weight=result_weight)
    result['name'] = [name]
    return pandas.DataFrame(result)

In [9]:
means = [compute_sum_of_charges(data_full[mask], name, bins) for mask, name, bins in \
         zip([data_full.signB > -100, 
              (data_full.IPs > 3) & ((abs(data_full.diff_eta) > 0.6) | (abs(data_full.diff_phi) > 0.825)), 
              (abs(data_full.diff_eta) < 0.6) & (abs(data_full.diff_phi) < 0.825) & (data_full.IPs < 3)], 
             ['full', 'OS', 'SS'], [21, 21, 21])]



In [10]:
means = pandas.concat(means)

In [11]:
means


Out[11]:
$B^+$ $B^+$, with signal part $B^-$ $B^-$, with signal part ROC AUC ROC AUC, with signal part name
0 0.443411 -0.556589 -0.572163 0.427837 0.571579 0.569146 full
0 0.111173 -0.888827 -0.157273 0.842727 0.529527 0.684603 OS
0 0.175970 -0.824030 -0.178102 0.821898 0.567696 0.777694 SS

In [12]:
means.to_csv('img/track_signs_assymetry_means.csv', index=False, header=True)

In [13]:
means_pt = [compute_sum_of_charges(data_full[mask], name, bins, use_pt=True) for mask, name, bins in \
            zip([data_full.signB > -100, 
              (data_full.IPs > 3) * ((abs(data_full.diff_eta) > 0.6) | (abs(data_full.diff_phi) > 0.825)), 
              (abs(data_full.diff_eta) < 0.6) & (abs(data_full.diff_phi) < 0.825) * (data_full.IPs < 3)], 
             ['full_pt', 'OS_pt', 'SS_pt'], [21, 21, 21])]


/moosefs/miniconda/envs/ipython_py2/lib/python2.7/site-packages/pandas/computation/expressions.py:190: UserWarning: evaluating in Python space because the '*' operator is not supported by numexpr for the bool dtype, use '&' instead
  unsupported[op_str]))

In [14]:
means_pt = pandas.concat(means_pt)

In [15]:
means_pt


Out[15]:
$B^+$ $B^+$, with signal part $B^-$ $B^-$, with signal part ROC AUC ROC AUC, with signal part name
0 2.089463e-08 -1 -3.419350e-08 1 0.500021 1 full_pt
0 2.455944e-08 -1 -3.564637e-08 1 0.500034 1 OS_pt
0 1.521839e-07 -1 -1.682846e-07 1 0.500587 1 SS_pt

check on MC


In [18]:
data_mc = pandas.read_csv('datasets/MCTracks.csv', sep='\t')
print len(data_mc)

event_id = data_mc.run.apply(str) + '_' + data_mc.event.apply(str)
data_mc[event_id_column] = event_id
data_mc['N_sig_sw'] = 1.


Out[18]:
32154831

In [20]:
means_mc = [compute_sum_of_charges(data_mc[mask], name, bins) for mask, name, bins in \
            zip([data_mc.signB > -100, 
              (data_mc.IPs > 3) & ((abs(data_mc.diff_eta) > 0.6) | (abs(data_mc.diff_phi) > 0.825)), 
              (abs(data_mc.diff_eta) < 0.6) & (abs(data_mc.diff_phi) < 0.825) & (data_mc.IPs < 3)], 
             ['full_mc', 'OS_mc', 'SS_mc'], [21, 21, 21])]



In [21]:
means_mc = pandas.concat(means_mc)

In [22]:
means_mc


Out[22]:
$B^+$ $B^+$, with signal part $B^-$ $B^-$, with signal part ROC AUC ROC AUC, with signal part name
0 0.317782 -0.682218 -0.770055 0.229945 0.584897 0.571588 full_mc
0 0.047280 -0.952720 -0.284975 0.715025 0.540302 0.694391 OS_mc
0 0.155203 -0.844797 -0.184142 0.815858 0.568065 0.788316 SS_mc

In [24]:
means_mc.to_csv('img/track_signs_assymetry_means_mc.csv', index=False, header=True)

In [25]:
means_mc_pt = [compute_sum_of_charges(data_mc[mask], name, bins, use_pt=True) for mask, name, bins in \
               zip([data_mc.signB > -100, 
              (data_mc.IPs > 3) & ((abs(data_mc.diff_eta) > 0.6) | (abs(data_mc.diff_phi) > 0.825)), 
              (abs(data_mc.diff_eta) < 0.6) & (abs(data_mc.diff_phi) < 0.825) & (data_mc.IPs < 3)], 
             ['full_mc_pt', 'OS_mc_pt', 'SS_mc_pt'], [21, 21, 21])]



In [26]:
pandas.concat(means_mc_pt)


Out[26]:
$B^+$ $B^+$, with signal part $B^-$ $B^-$, with signal part ROC AUC ROC AUC, with signal part name
0 2.174432e-08 -1 -3.615733e-08 1 0.500015 1 full_mc_pt
0 3.207049e-08 -1 -5.181189e-08 1 0.500032 1 OS_mc_pt
0 1.202127e-07 -1 -1.382307e-07 1 0.500353 1 SS_mc_pt

random track strategy for event


In [15]:
result_event_id, event_positions, data_ids = numpy.unique(data_full[event_id_column].values, return_index=True, return_inverse=True)

In [16]:
result_probs = -numpy.bincount(data_ids, weights=data_full.signTrack.values * data_full.partPt.values / sum(data_full.partPt.values))

In [23]:
result_label = numpy.bincount(data_ids, weights=data_full.signB.values) / numpy.bincount(data_ids)
result_weight = numpy.bincount(data_ids, weights=data_full.N_sig_sw.values) / numpy.bincount(data_ids)

In [28]:
hist(result_probs[result_label == 1], bins=60, normed=True)
hist(result_probs[result_label == -1], bins=60, normed=True)
pass



In [35]:
data_ids # event indices
track_ids = numpy.arange(len(data_ids))
selected_ids = numpy.zeros(numpy.max(data_ids) + 1)
selected_ids[data_ids] = track_ids # assigns last track in each event
permutation = numpy.random.permutation(track_ids)
selected_ids[data_ids[permutation]] = track_ids[permutation]

In [36]:
len(data_full), max(selected_ids.astype(int))


Out[36]:
(27156190, 27156186)

In [40]:
signs_random = data_full.signTrack.values[selected_ids.astype(int)]

In [41]:
roc_auc_score(result_label, signs_random)


Out[41]:
0.48935024021414875

In [48]:
figsize(7, 6)
plot(*roc_curve(result_label, signs_random)[:2])
plot([0, 1], [0, 1], 'k--')
grid()
title('Random track choose, \n ROC AUC using track sign %1.4f' %(1 - roc_auc_score(result_label, signs_random)), fontsize=14)
plt.savefig('img/assymetry_tracks_random.png', format='png')