In [8]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.cross_validation as cv
In [51]:
# Read the training table (first CSV column is the index; -999. is parsed as NaN),
# then separate the per-row weight and label from the feature columns.
raw_df = pd.read_csv("training.csv", index_col=0, na_values=[-999.])
weights = raw_df['Weight']
labels = raw_df['Label']
# `df` keeps every column except the two extracted above.
df = raw_df.drop(['Weight', 'Label'], axis=1)
df.head()
Out[51]:
In [52]:
# Row and column counts of the feature frame.
n_points, n_features = df.shape
In [53]:
# Add small Gaussian jitter (sigma = 0.001) to every feature value.
# NOTE(review): presumably this breaks ties between identical values before
# the rank-based binning further down -- confirm.
# The noise comes from a locally seeded generator so that Restart & Run All
# reproduces the same jitter; the global seed is only set in a later cell.
# Caveat: re-running this cell alone adds the jitter to `df` a second time.
_noise_rng = np.random.RandomState(42)
df = df + _noise_rng.normal(0, .001, df.shape)
In [54]:
# Boolean row masks over `labels`: rows whose label is 's', and all other rows.
sSelector = labels.eq('s')
bSelector = labels.ne('s')
In [55]:
# Total weight of all rows, and of the rows picked by each class mask.
# The per-class totals must sum the *weights* under the mask (the same pattern
# used for ss_w_train / bs_w_train); summing the boolean mask itself would
# only count rows.
wsum = weights.sum()
ss_wsum = weights[sSelector].sum()
bs_wsum = weights[bSelector].sum()
In [56]:
# Seed the global NumPy RNG, then build a shuffled copy of the row index.
np.random.seed(42)
# np.array always copies. np.asarray may hand back the Index's own buffer,
# in which case the in-place shuffle below would scramble df.index itself
# (or fail if that buffer is read-only).
idx = np.array(df.index)
np.random.shuffle(idx)
idx
Out[56]:
In [57]:
# Split the row index 90% / 10% and slice every per-row object with the same
# index labels so features, masks and weights stay aligned.
train_idx, test_idx = cv.train_test_split(df.index, train_size=.9)
train, test = df.loc[train_idx], df.loc[test_idx]
# Signal masks come from sSelector on both sides of the split
# (the test mask was previously taken from bSelector, duplicating b_test).
s_train, s_test = sSelector.loc[train_idx], sSelector.loc[test_idx]
b_train, b_test = bSelector.loc[train_idx], bSelector.loc[test_idx]
w_train, w_test = weights.loc[train_idx], weights.loc[test_idx]
# Total training weight under each class mask.
ss_w_train = w_train.loc[s_train].sum()
bs_w_train = w_train.loc[b_train].sum()
In [58]:
# Rescale the training weights so each class mask contributes half of the
# total: a row's weight is divided by its class's weight sum and halved.
# Multiplying by the boolean mask zeroes the rows of the other class.
s_share = s_train * w_train / ss_w_train
b_share = b_train * w_train / bs_w_train
w_balanced_train = .5 * s_share + .5 * b_share
# Displayed as a sanity check on the normalisation.
w_balanced_train.sum()
Out[58]:
In [59]:
# Number of bins per feature; sizes the second axis of log_Ps / bin_max and
# sets the step used to build bin_idx.
n_bins = 10
In [60]:
# Per-feature, per-bin tables. np.empty leaves them uninitialised, so they
# hold arbitrary values until written.
log_Ps = np.empty((n_features, n_bins))  # TODO: convert to frame
bin_max = np.empty((n_features, n_bins))
# n_bins + 1 integer edges into the sorted training rows, always running from
# 0 to len(train). Integer arithmetic keeps the edges exact, and unlike a
# fixed-step range() it neither drops trailing rows nor emits extra edges when
# len(train) is not a multiple of n_bins. For exact multiples the edges are
# identical to range(0, len(train) + 1, len(train) // n_bins).
bin_idx = np.arange(n_bins + 1) * len(train) // n_bins
In [72]:
# Histogram of the first feature column of the training frame.
first_feature = train.columns[0]
train[first_feature].hist()
Out[72]:
In [34]:
# For every feature: sort the training rows by that feature, cut the sorted
# rows into n_bins consecutive slices delimited by bin_idx, and record for
# each slice
#   * bin_max[f, b] -- the feature value of the last row in the slice,
#   * log_Ps[f, b]  -- log( wS / (wS + wB) ), where wS / wB are the balanced
#                      weights in the slice under the signal / background mask.
#
# Everything below is indexed by *position* in `train`, so the pandas objects
# are converted to plain arrays first (their index holds row labels, not
# positions).
train_values = train.values
w_bal = w_balanced_train.values
is_signal = s_train.values
is_background = b_train.values

# Column-wise argsort: sorted_rows_all[:, f] lists row positions in ascending
# order of feature f. NaN entries are placed last by np.argsort.
sorted_rows_all = np.argsort(train_values, axis=0)

for fI in range(train.shape[1]):  # columns
    # index permutation of sorted feature column
    sorted_rows = sorted_rows_all[:, fI]
    for bI in range(n_bins):
        lo, hi = bin_idx[bI], bin_idx[bI + 1]
        # upper bin limit: feature value of the last row in the bin
        bin_max[fI, bI] = train_values[sorted_rows[hi - 1], fI]
        # training positions of points in the bin
        rows_in_bin = sorted_rows[lo:hi]
        w_in_bin = w_bal[rows_in_bin]
        # sum of signal weights in bin
        wS = w_in_bin[is_signal[rows_in_bin]].sum()
        # sum of background weights in bin
        wB = w_in_bin[is_background[rows_in_bin]].sum()
        # log probability of being a signal in the bin; np.log yields -inf
        # (with a RuntimeWarning) for a bin without signal weight rather than
        # raising.
        log_Ps[fI, bI] = np.log(wS / (wS + wB))
Out[34]:
In [ ]:
In [ ]:
In [ ]: