In [1]:
import GAN.cms_datasets as cms
import GAN.plotting as plotting
import GAN.preprocessing as preprocessing


Using Theano backend.

In [2]:
import GAN.utils as utils

# reload(utils)

class Parameters(utils.Parameters):
    
    load_datasets=utils.param(["moriond_v9","abs(ScEta) < 1.5"])

    c_names = utils.param(['Phi','ScEta'])
    x_names = utils.param(['EtaWidth','R9','SigmaIeIe','S4','PhiWidth','mass'])#,'Pt','mass'])

#     # reweight = utils.param('rewei_zee_barrel.npy')
#     reweight = utils.param(['rewei_zee_barrel.npy','rewei_zee_pt_barrel.npy'])#,'rewei_zee_pu_barrel.npy'])

    mcweight = utils.param('weight')
    
    feat_transform = utils.param('minmax')
#    feat_transform = utils.param('gaus')

class MyApp(utils.MyApp):
    classes = utils.List([Parameters])

# Read all parameters above from command line. 
# Note: names are all converted to be all capital
notebook_parameters = Parameters(MyApp()).get_params()

# copy parameters to global scope
globals().update(notebook_parameters)

notebook_parameters


Out[2]:
{'C_NAMES': ['Phi', 'ScEta'],
 'FEAT_TRANSFORM': u'minmax',
 'LOAD_DATASETS': ['moriond_v9', 'abs(ScEta) < 1.5'],
 'MCWEIGHT': u'weight',
 'X_NAMES': ['EtaWidth', 'R9', 'SigmaIeIe', 'S4', 'PhiWidth', 'mass']}

In [3]:
# reload(cms)
data,mc = cms.load_zee(*LOAD_DATASETS)

In [4]:
data.columns


Out[4]:
Index([u'index', u'run', u'rho', u'nvtx', u'mass', u'weight', u'SigMoM', u'Pt',
       u'ScEta', u'Phi', u'R9', u'S4', u'SigmaIeIe', u'EtaWidth', u'PhiWidth',
       u'CovarianceIphiIphi', u'SigmaRR', u'ScEnergy', u'CovarianceIetaIphi',
       u'PhoIso03', u'ChIso03', u'ChIso03worst', u'ScPreshowerEnergy',
       u'PhoIDMVA', u'SigEOverE', u'run_quantile'],
      dtype='object')

In [5]:
mc.columns


Out[5]:
Index([u'index', u'run', u'rho', u'nvtx', u'mass', u'weight', u'SigMoM', u'Pt',
       u'ScEta', u'Phi', u'R9', u'S4', u'SigmaIeIe', u'EtaWidth', u'PhiWidth',
       u'CovarianceIphiIphi', u'SigmaRR', u'ScEnergy', u'CovarianceIetaIphi',
       u'PhoIso03', u'ChIso03', u'ChIso03worst', u'ScPreshowerEnergy',
       u'PhoIDMVA', u'SigEOverE'],
      dtype='object')

In [6]:
c_names = C_NAMES
x_names = X_NAMES

data_c = data[c_names]
data_x = data[x_names]

mc_c = mc[c_names]
mc_x = mc[x_names]

In [7]:
data_x.columns, data_x.shape, data_c.columns, data_c.shape


Out[7]:
(Index([u'EtaWidth', u'R9', u'SigmaIeIe', u'S4', u'PhiWidth', u'mass'], dtype='object'),
 (17520217, 6),
 Index([u'Phi', u'ScEta'], dtype='object'),
 (17520217, 2))

In [8]:
data_x.columns, data_c.columns


Out[8]:
(Index([u'EtaWidth', u'R9', u'SigmaIeIe', u'S4', u'PhiWidth', u'mass'], dtype='object'),
 Index([u'Phi', u'ScEta'], dtype='object'))

In [9]:
mc_x.columns, mc_c.columns


Out[9]:
(Index([u'EtaWidth', u'R9', u'SigmaIeIe', u'S4', u'PhiWidth', u'mass'], dtype='object'),
 Index([u'Phi', u'ScEta'], dtype='object'))

In [15]:
# # reload(preprocessing)
# if MCWEIGHT is None:
#     mc_w = np.ones(mc_x.shape[0])
# else:
#     mc_w = mc[MCWEIGHT].values

# if not REWEIGHT is None:
#     for fil in REWEIGHT:
#         info = np.load(fil)
#         inputs = info[0]
#         weights = info[1]
#         bins = info[2:]
#         # print(bins[1])
#         print('weighting',inputs)
#         mc_w *= preprocessing.reweight(mc,inputs,bins,weights,base=None)
    
# data_w = np.ones(data_x.shape[0])

In [16]:
q5 = data_x.quantile(0.05)
q95 = data_x.quantile(0.95)
data_min = data_x.min()
data_max = data_x.max()

iqr = data_x.quantile(0.75) - data_x.quantile(0.25)

In [17]:
data_min


Out[17]:
EtaWidth      0.001511
R9            0.096056
SigmaIeIe     0.000000
S4            0.115319
PhiWidth      0.001289
mass         70.000038
dtype: float32

In [18]:
q5


Out[18]:
EtaWidth      0.006840
R9            0.521665
SigmaIeIe     0.007390
S4            0.593159
PhiWidth      0.009082
mass         81.228424
dtype: float64

In [19]:
q5 - iqr


Out[19]:
EtaWidth      0.004387
R9            0.323129
SigmaIeIe     0.006670
S4            0.463765
PhiWidth     -0.025666
mass         76.532669
dtype: float64

In [20]:
(q5 - data_min)/iqr


Out[20]:
EtaWidth      2.172245
R9            2.143735
SigmaIeIe    10.253435
S4            3.692925
PhiWidth      0.224259
mass          2.391178
dtype: float64

In [21]:
data_max


Out[21]:
EtaWidth        0.772673
R9              9.882552
SigmaIeIe       0.029891
S4              0.988874
PhiWidth        0.782234
mass         1230.203613
dtype: float32

In [22]:
q95


Out[22]:
EtaWidth       0.013756
R9             0.966613
SigmaIeIe      0.010354
S4             0.916355
PhiWidth       0.077592
mass         109.754976
dtype: float64

In [23]:
q95 + iqr


Out[23]:
EtaWidth       0.016209
R9             1.165149
SigmaIeIe      0.011074
S4             1.045748
PhiWidth       0.112340
mass         114.450731
dtype: float64

In [24]:
(data_max - q95)/iqr


Out[24]:
EtaWidth     309.341930
R9            44.908390
SigmaIeIe     27.105768
S4             0.560458
PhiWidth      20.279064
mass         238.608836
dtype: float64

In [25]:
thr = 1.5

In [26]:
reject_low = data_x < q5 - thr*iqr
reject_high = data_x > q95 + thr*iqr

In [27]:
n_reject_low = reject_low.any(axis=1).sum()
n_reject_high = reject_high.any(axis=1).sum()

In [28]:
n_reject_low/data_x.shape[0], n_reject_high/data_x.shape[0]


Out[28]:
(0, 0)

In [29]:
accept_data = ((data_x >= q5 - thr*iqr) & ( data_x <= q95 + thr*iqr )).all(axis=1)
accept_mc   = ((mc_x   >= q5 - thr*iqr) & ( mc_x <= q95 + thr*iqr )).all(axis=1)

In [39]:
accept_data.sum() / float(accept_data.shape[0])


Out[39]:
0.91360175504675545

In [40]:
accept_mc.sum() / float(accept_mc.shape[0])


Out[40]:
0.94725635677823405

In [41]:
data_x[accept_data].describe()


Out[41]:
EtaWidth R9 SigmaIeIe S4 PhiWidth mass
count 1.600650e+07 1.600650e+07 1.600650e+07 1.600650e+07 1.600650e+07 1.600650e+07
mean 9.132168e-03 9.078916e-01 8.118966e-03 9.151140e-01 3.054186e-02 8.629321e+01
std 1.834475e-03 1.536037e-01 9.883147e-04 1.327628e-01 2.184211e-02 6.117148e+00
min 3.180516e-03 2.238724e-01 6.309268e-03 3.990729e-01 2.515367e-03 7.418481e+01
25% 7.909074e-03 7.631535e-01 8.528570e-03 7.777047e-01 1.229331e-02 8.759052e+01
50% 9.023006e-03 8.989664e-01 8.878669e-03 8.652633e-01 2.499229e-02 8.987257e+01
75% 1.024940e-02 9.479852e-01 9.210394e-03 8.969558e-01 4.568314e-02 9.190556e+01
max 1.743610e-02 1.264299e+00 1.143475e-02 9.888741e-01 1.297113e-01 1.167986e+02

In [42]:
mc_x[accept_mc].describe()


Out[42]:
EtaWidth R9 SigmaIeIe S4 PhiWidth mass
count 9.108999e+06 9.108999e+06 9.108999e+06 9.108999e+06 9.108999e+06 9.108999e+06
mean 8.740167e-03 8.732899e-01 8.335227e-03 8.667808e-01 2.976822e-02 8.386697e+01
std 1.738866e-03 1.408023e-01 8.353423e-04 1.046282e-01 2.086873e-02 7.714291e+00
min 3.494020e-03 2.239490e-01 6.309276e-03 3.990765e-01 1.274645e-06 7.418497e+01
25% 7.538203e-03 7.720036e-01 8.521758e-03 7.826572e-01 1.150721e-02 8.754033e+01
50% 8.561344e-03 9.062228e-01 8.873915e-03 8.691059e-01 2.228022e-02 8.985310e+01
75% 9.803479e-03 9.537584e-01 9.200254e-03 8.998990e-01 4.241024e-02 9.189629e+01
max 1.743612e-02 1.263729e+00 1.143472e-02 9.940255e-01 1.297105e-01 1.167986e+02

In [43]:
thr_up = q95+thr*iqr
thr_down = q5-thr*iqr

In [44]:
thr_up


Out[44]:
EtaWidth       0.017436
R9             1.264417
SigmaIeIe      0.011435
S4             1.110445
PhiWidth       0.129713
mass         116.798608
dtype: float64

In [45]:
thr_down


Out[45]:
EtaWidth      0.003160
R9            0.223861
SigmaIeIe     0.006309
S4            0.399069
PhiWidth     -0.043039
mass         74.184792
dtype: float64

In [46]:
thr_up['mass'] = 110.
thr_down['mass'] = 70.
# thr_down['Pt'] = 25.
# thr_up['Pt'] = 100.

In [47]:
thr_up.to_hdf('cleaning_zee_m_barrel.hd5','thr_up',mode='w')
thr_down.to_hdf('cleaning_zee_m_barrel.hd5','thr_down',mode='a')

In [ ]:


In [48]:
#data_x = data_x[accept_data]
mc_x = mc_x[accept_mc]

data_c = data_c[accept_data]
mc_c = mc_c[accept_mc]

data_w = data_w[accept_data]
mc_w = mc_w[accept_mc]


data_x,data_c,mc_x,mc_c,scaler_x,scaler_c = preprocessing.transform(data_x,data_c,mc_x,mc_c,FEAT_TRANSFORM)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-48-02d976d7f8c4> in <module>()
      5 mc_c = mc_c[accept_mc]
      6 
----> 7 data_w = data_w[accept_data]
      8 mc_w = mc_w[accept_mc]
      9 

NameError: name 'data_w' is not defined

In [49]:
for ix in range(len(x_names)):
    plotting.plot_hists(data_x[:,0,ix],mc_x[:,0,ix],bins=100)#,range=[-3,3])
    plt.xlabel(x_names[ix])
    plt.show()

for ic in range(len(c_names)):
    plotting.plot_hists(data_c[:,0,ic],mc_c[:,0,ic],bins=100)#,range=[-3,3])
    plt.xlabel(c_names[ic])
    plt.show()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-49-0d8debaf1b2f> in <module>()
      1 for ix in range(len(x_names)):
----> 2     plotting.plot_hists(data_x[:,0,ix],mc_x[:,0,ix],bins=100)#,range=[-3,3])
      3     plt.xlabel(x_names[ix])
      4     plt.show()
      5 

/swshare/anaconda/lib/python2.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

/swshare/anaconda/lib/python2.7/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

/swshare/anaconda/lib/python2.7/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1346         """Return the cached item, item represents a label indexer."""
   1347         cache = self._item_cache
-> 1348         res = cache.get(item)
   1349         if res is None:
   1350             values = self._data.get(item)

TypeError: unhashable type

In [ ]: