In [8]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn.cross_validation as cv

In [51]:
df = pd.read_csv("training.csv", index_col=0, na_values=[-999.])
weights = df['Weight']
labels = df['Label']
df = df.drop(['Weight', 'Label'], axis=1)
df.head()


Out[51]:
DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot DER_sum_pt DER_pt_ratio_lep_tau DER_met_phi_centrality DER_lep_eta_centrality PRI_tau_pt PRI_tau_eta PRI_tau_phi PRI_lep_pt PRI_lep_eta PRI_lep_phi PRI_met
EventId
100000 138.470 51.655 97.827 27.980 0.91 124.711 2.666 3.064 41.928 197.760 1.582 1.396 0.2 32.638 1.017 0.381 51.626 2.273 -2.414 16.824 ...
100001 160.937 68.768 103.235 48.146 NaN NaN NaN 3.473 2.078 125.157 0.879 1.414 NaN 42.014 2.039 -3.011 36.918 0.501 0.103 44.704 ...
100002 NaN 162.172 125.953 35.635 NaN NaN NaN 3.148 9.336 197.814 3.776 1.414 NaN 32.154 -0.705 -2.093 121.409 -0.953 1.052 54.283 ...
100003 143.905 81.417 80.943 0.414 NaN NaN NaN 3.310 0.414 75.968 2.354 -1.285 NaN 22.647 -1.655 0.010 53.321 -0.522 -3.100 31.082 ...
100004 175.864 16.915 134.805 16.405 NaN NaN NaN 3.891 16.405 57.983 1.056 -1.385 NaN 28.209 -2.197 -2.231 29.774 0.798 1.569 2.723 ...

5 rows × 30 columns


In [52]:
(n_points, n_features) = df.shape

In [53]:
df = df + np.random.normal(0, .001, df.shape)

In [54]:
sSelector = labels == 's'
bSelector = ~sSelector

In [55]:
weights
wsum = weights.sum()
ss_wsum = sSelector.sum()
bs_wsum = bSelector.sum()

In [56]:
np.random.seed(42)
idx = np.asarray(df.index)
np.random.shuffle(idx)
idx


Out[56]:
array([138683, 164939, 103954, ..., 231932, 246867, 221958])

In [57]:
train_idx, test_idx = cv.train_test_split(df.index, train_size=.9)
train, test = df.loc[train_idx], df.loc[test_idx]

s_train, s_test = sSelector.loc[train_idx], bSelector.loc[test_idx]
b_train, b_test = bSelector.loc[train_idx], bSelector.loc[test_idx]

w_train, w_test = weights.loc[train_idx], weights.loc[test_idx]
ss_w_train = w_train.loc[s_train].sum()
bs_w_train = w_train.loc[b_train].sum()

In [58]:
w_balanced_train = (.5 * (s_train * w_train / ss_w_train) +
                    .5 * (b_train * w_train / bs_w_train))
w_balanced_train.sum()


Out[58]:
0.99999999999987665

In [59]:
n_bins = 10

In [60]:
log_Ps = np.empty([n_features, n_bins])  # convert to frame
bin_max = np.empty([n_features, n_bins])
bin_idx = np.array(range(0, len(train)+1, len(train)//n_bins))

In [72]:
train[train.columns[0]].hist()


Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0x105bb23d0>

In [34]:
idxs = np.apply_along_axis(np.argsort, 0, train.values)


for fI in range(train.shape[1]):  # columns
    # index permutation of sorted feature column
    idx = idxs[:, fI]

    for bI in range(numBins):
        # upper bin limits
        binMaxs[fI, bI] = xsTrainTranspose[fI, indexes[binIndexes[bI+1]-1]]
        # training indices of points in a bin
        indexesInBin = indexes[binIndexes[bI]:binIndexes[bI+1]]
        # sum of signal weights in bin
        wS = np.sum(weightsBalancedTrain[indexesInBin]
                    [sSelectorTrain[indexesInBin]])
        # sum of background weights in bin
        wB = np.sum(weightsBalancedTrain[indexesInBin]
                    [bSelectorTrain[indexesInBin]])
        # log probability of being a signal in the bin
        logPs[fI, bI] = math.log(wS/(wS+wB))


Out[34]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [ ]:


In [ ]:


In [ ]: