notebook.community

Edit and run



In [8]:

    
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn.cross_validation as cv



In [51]:

    
df = pd.read_csv("training.csv", index_col=0, na_values=[-999.])
weights = df['Weight']
labels = df['Label']
df = df.drop(['Weight', 'Label'], axis=1)
df.head()









    Out[51]:






  
    
      
      DER_mass_MMC
      DER_mass_transverse_met_lep
      DER_mass_vis
      DER_pt_h
      DER_deltaeta_jet_jet
      DER_mass_jet_jet
      DER_prodeta_jet_jet
      DER_deltar_tau_lep
      DER_pt_tot
      DER_sum_pt
      DER_pt_ratio_lep_tau
      DER_met_phi_centrality
      DER_lep_eta_centrality
      PRI_tau_pt
      PRI_tau_eta
      PRI_tau_phi
      PRI_lep_pt
      PRI_lep_eta
      PRI_lep_phi
      PRI_met
      
    
    
      EventId
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      100000
       138.470
        51.655
        97.827
       27.980
       0.91
       124.711
       2.666
       3.064
       41.928
       197.760
       1.582
       1.396
       0.2
       32.638
       1.017
       0.381
        51.626
       2.273
      -2.414
       16.824
      ...
    
    
      100001
       160.937
        68.768
       103.235
       48.146
        NaN
           NaN
         NaN
       3.473
        2.078
       125.157
       0.879
       1.414
       NaN
       42.014
       2.039
      -3.011
        36.918
       0.501
       0.103
       44.704
      ...
    
    
      100002
           NaN
       162.172
       125.953
       35.635
        NaN
           NaN
         NaN
       3.148
        9.336
       197.814
       3.776
       1.414
       NaN
       32.154
      -0.705
      -2.093
       121.409
      -0.953
       1.052
       54.283
      ...
    
    
      100003
       143.905
        81.417
        80.943
        0.414
        NaN
           NaN
         NaN
       3.310
        0.414
        75.968
       2.354
      -1.285
       NaN
       22.647
      -1.655
       0.010
        53.321
      -0.522
      -3.100
       31.082
      ...
    
    
      100004
       175.864
        16.915
       134.805
       16.405
        NaN
           NaN
         NaN
       3.891
       16.405
        57.983
       1.056
      -1.385
       NaN
       28.209
      -2.197
      -2.231
        29.774
       0.798
       1.569
        2.723
      ...
    
  

5 rows × 30 columns



In [52]:

    
(n_points, n_features) = df.shape



In [53]:

    
df = df + np.random.normal(0, .001, df.shape)



In [54]:

    
sSelector = labels == 's'
bSelector = ~sSelector



In [55]:

    
weights
wsum = weights.sum()
ss_wsum = sSelector.sum()
bs_wsum = bSelector.sum()



In [56]:

    
np.random.seed(42)
idx = np.asarray(df.index)
np.random.shuffle(idx)
idx









    Out[56]:





array([138683, 164939, 103954, ..., 231932, 246867, 221958])



In [57]:

    
train_idx, test_idx = cv.train_test_split(df.index, train_size=.9)
train, test = df.loc[train_idx], df.loc[test_idx]

s_train, s_test = sSelector.loc[train_idx], bSelector.loc[test_idx]
b_train, b_test = bSelector.loc[train_idx], bSelector.loc[test_idx]

w_train, w_test = weights.loc[train_idx], weights.loc[test_idx]
ss_w_train = w_train.loc[s_train].sum()
bs_w_train = w_train.loc[b_train].sum()



In [58]:

    
w_balanced_train = (.5 * (s_train * w_train / ss_w_train) +
                    .5 * (b_train * w_train / bs_w_train))
w_balanced_train.sum()









    Out[58]:





0.99999999999987665



In [59]:

    
n_bins = 10



In [60]:

    
log_Ps = np.empty([n_features, n_bins])  # convert to frame
bin_max = np.empty([n_features, n_bins])
bin_idx = np.array(range(0, len(train)+1, len(train)//n_bins))



In [72]:

    
train[train.columns[0]].hist()









    Out[72]:





<matplotlib.axes._subplots.AxesSubplot at 0x105bb23d0>



In [34]:

    
idxs = np.apply_along_axis(np.argsort, 0, train.values)


for fI in range(train.shape[1]):  # columns
    # index permutation of sorted feature column
    idx = idxs[:, fI]

    for bI in range(numBins):
        # upper bin limits
        binMaxs[fI, bI] = xsTrainTranspose[fI, indexes[binIndexes[bI+1]-1]]
        # training indices of points in a bin
        indexesInBin = indexes[binIndexes[bI]:binIndexes[bI+1]]
        # sum of signal weights in bin
        wS = np.sum(weightsBalancedTrain[indexesInBin]
                    [sSelectorTrain[indexesInBin]])
        # sum of background weights in bin
        wB = np.sum(weightsBalancedTrain[indexesInBin]
                    [bSelectorTrain[indexesInBin]])
        # log probability of being a signal in the bin
        logPs[fI, bI] = math.log(wS/(wS+wB))









    Out[34]:





array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])



In [ ]:



In [ ]:



In [ ]:

	DER_mass_MMC	DER_mass_transverse_met_lep	DER_mass_vis	DER_pt_h	DER_deltaeta_jet_jet	DER_mass_jet_jet	DER_prodeta_jet_jet	DER_deltar_tau_lep	DER_pt_tot	DER_sum_pt	DER_pt_ratio_lep_tau	DER_met_phi_centrality	DER_lep_eta_centrality	PRI_tau_pt	PRI_tau_eta	PRI_tau_phi	PRI_lep_pt	PRI_lep_eta	PRI_lep_phi	PRI_met
EventId
100000	138.470	51.655	97.827	27.980	0.91	124.711	2.666	3.064	41.928	197.760	1.582	1.396	0.2	32.638	1.017	0.381	51.626	2.273	-2.414	16.824	...
100001	160.937	68.768	103.235	48.146	NaN	NaN	NaN	3.473	2.078	125.157	0.879	1.414	NaN	42.014	2.039	-3.011	36.918	0.501	0.103	44.704	...
100002	NaN	162.172	125.953	35.635	NaN	NaN	NaN	3.148	9.336	197.814	3.776	1.414	NaN	32.154	-0.705	-2.093	121.409	-0.953	1.052	54.283	...
100003	143.905	81.417	80.943	0.414	NaN	NaN	NaN	3.310	0.414	75.968	2.354	-1.285	NaN	22.647	-1.655	0.010	53.321	-0.522	-3.100	31.082	...
100004	175.864	16.915	134.805	16.405	NaN	NaN	NaN	3.891	16.405	57.983	1.056	-1.385	NaN	28.209	-2.197	-2.231	29.774	0.798	1.569	2.723	...