notebook.community

Edit and run



In [1]:

    
import GAN.cms_datasets as cms
import GAN.plotting as plotting
import GAN.preprocessing as preprocessing









    



Using Theano backend.



In [2]:

    
import GAN.utils as utils

# reload(utils)

class Parameters(utils.Parameters):
    
    load_datasets=utils.param(["moriond_v9","abs(ScEta) < 1.5"])

    c_names = utils.param(['Phi','ScEta'])
    x_names = utils.param(['EtaWidth','R9','SigmaIeIe','S4','PhiWidth','mass'])#,'Pt','mass'])

#     # reweight = utils.param('rewei_zee_barrel.npy')
#     reweight = utils.param(['rewei_zee_barrel.npy','rewei_zee_pt_barrel.npy'])#,'rewei_zee_pu_barrel.npy'])

    mcweight = utils.param('weight')
    
    feat_transform = utils.param('minmax')
#    feat_transform = utils.param('gaus')

class MyApp(utils.MyApp):
    classes = utils.List([Parameters])

# Read all parameters above from command line. 
# Note: names are all converted to be all capital
notebook_parameters = Parameters(MyApp()).get_params()

# copy parameters to global scope
globals().update(notebook_parameters)

notebook_parameters









    Out[2]:





{'C_NAMES': ['Phi', 'ScEta'],
 'FEAT_TRANSFORM': u'minmax',
 'LOAD_DATASETS': ['moriond_v9', 'abs(ScEta) < 1.5'],
 'MCWEIGHT': u'weight',
 'X_NAMES': ['EtaWidth', 'R9', 'SigmaIeIe', 'S4', 'PhiWidth', 'mass']}



In [3]:

    
# reload(cms)
data,mc = cms.load_zee(*LOAD_DATASETS)



In [4]:

    
data.columns









    Out[4]:





Index([u'index', u'run', u'rho', u'nvtx', u'mass', u'weight', u'SigMoM', u'Pt',
       u'ScEta', u'Phi', u'R9', u'S4', u'SigmaIeIe', u'EtaWidth', u'PhiWidth',
       u'CovarianceIphiIphi', u'SigmaRR', u'ScEnergy', u'CovarianceIetaIphi',
       u'PhoIso03', u'ChIso03', u'ChIso03worst', u'ScPreshowerEnergy',
       u'PhoIDMVA', u'SigEOverE', u'run_quantile'],
      dtype='object')



In [5]:

    
mc.columns









    Out[5]:





Index([u'index', u'run', u'rho', u'nvtx', u'mass', u'weight', u'SigMoM', u'Pt',
       u'ScEta', u'Phi', u'R9', u'S4', u'SigmaIeIe', u'EtaWidth', u'PhiWidth',
       u'CovarianceIphiIphi', u'SigmaRR', u'ScEnergy', u'CovarianceIetaIphi',
       u'PhoIso03', u'ChIso03', u'ChIso03worst', u'ScPreshowerEnergy',
       u'PhoIDMVA', u'SigEOverE'],
      dtype='object')



In [6]:

    
c_names = C_NAMES
x_names = X_NAMES

data_c = data[c_names]
data_x = data[x_names]

mc_c = mc[c_names]
mc_x = mc[x_names]



In [7]:

    
data_x.columns, data_x.shape, data_c.columns, data_c.shape









    Out[7]:





(Index([u'EtaWidth', u'R9', u'SigmaIeIe', u'S4', u'PhiWidth', u'mass'], dtype='object'),
 (17520217, 6),
 Index([u'Phi', u'ScEta'], dtype='object'),
 (17520217, 2))



In [8]:

    
data_x.columns, data_c.columns









    Out[8]:





(Index([u'EtaWidth', u'R9', u'SigmaIeIe', u'S4', u'PhiWidth', u'mass'], dtype='object'),
 Index([u'Phi', u'ScEta'], dtype='object'))



In [9]:

    
mc_x.columns, mc_c.columns









    Out[9]:





(Index([u'EtaWidth', u'R9', u'SigmaIeIe', u'S4', u'PhiWidth', u'mass'], dtype='object'),
 Index([u'Phi', u'ScEta'], dtype='object'))



In [15]:

    
# # reload(preprocessing)
# if MCWEIGHT is None:
#     mc_w = np.ones(mc_x.shape[0])
# else:
#     mc_w = mc[MCWEIGHT].values

# if not REWEIGHT is None:
#     for fil in REWEIGHT:
#         info = np.load(fil)
#         inputs = info[0]
#         weights = info[1]
#         bins = info[2:]
#         # print(bins[1])
#         print('weighting',inputs)
#         mc_w *= preprocessing.reweight(mc,inputs,bins,weights,base=None)
    
# data_w = np.ones(data_x.shape[0])



In [16]:

    
q5 = data_x.quantile(0.05)
q95 = data_x.quantile(0.95)
data_min = data_x.min()
data_max = data_x.max()

iqr = data_x.quantile(0.75) - data_x.quantile(0.25)



In [17]:

    
data_min









    Out[17]:





EtaWidth      0.001511
R9            0.096056
SigmaIeIe     0.000000
S4            0.115319
PhiWidth      0.001289
mass         70.000038
dtype: float32



In [18]:

    
q5









    Out[18]:





EtaWidth      0.006840
R9            0.521665
SigmaIeIe     0.007390
S4            0.593159
PhiWidth      0.009082
mass         81.228424
dtype: float64



In [19]:

    
q5 - iqr









    Out[19]:





EtaWidth      0.004387
R9            0.323129
SigmaIeIe     0.006670
S4            0.463765
PhiWidth     -0.025666
mass         76.532669
dtype: float64



In [20]:

    
(q5 - data_min)/iqr









    Out[20]:





EtaWidth      2.172245
R9            2.143735
SigmaIeIe    10.253435
S4            3.692925
PhiWidth      0.224259
mass          2.391178
dtype: float64



In [21]:

    
data_max









    Out[21]:





EtaWidth        0.772673
R9              9.882552
SigmaIeIe       0.029891
S4              0.988874
PhiWidth        0.782234
mass         1230.203613
dtype: float32



In [22]:

    
q95









    Out[22]:





EtaWidth       0.013756
R9             0.966613
SigmaIeIe      0.010354
S4             0.916355
PhiWidth       0.077592
mass         109.754976
dtype: float64



In [23]:

    
q95 + iqr









    Out[23]:





EtaWidth       0.016209
R9             1.165149
SigmaIeIe      0.011074
S4             1.045748
PhiWidth       0.112340
mass         114.450731
dtype: float64



In [24]:

    
(data_max - q95)/iqr









    Out[24]:





EtaWidth     309.341930
R9            44.908390
SigmaIeIe     27.105768
S4             0.560458
PhiWidth      20.279064
mass         238.608836
dtype: float64



In [25]:

    
thr = 1.5



In [26]:

    
reject_low = data_x < q5 - thr*iqr
reject_high = data_x > q95 + thr*iqr



In [27]:

    
n_reject_low = reject_low.any(axis=1).sum()
n_reject_high = reject_high.any(axis=1).sum()



In [28]:

    
n_reject_low/data_x.shape[0], n_reject_high/data_x.shape[0]









    Out[28]:





(0, 0)



In [29]:

    
accept_data = ((data_x >= q5 - thr*iqr) & ( data_x <= q95 + thr*iqr )).all(axis=1)
accept_mc   = ((mc_x   >= q5 - thr*iqr) & ( mc_x <= q95 + thr*iqr )).all(axis=1)



In [39]:

    
accept_data.sum() / float(accept_data.shape[0])









    Out[39]:





0.91360175504675545



In [40]:

    
accept_mc.sum() / float(accept_mc.shape[0])









    Out[40]:





0.94725635677823405



In [41]:

    
data_x[accept_data].describe()









    Out[41]:






  
    
      
      EtaWidth
      R9
      SigmaIeIe
      S4
      PhiWidth
      mass
    
  
  
    
      count
      1.600650e+07
      1.600650e+07
      1.600650e+07
      1.600650e+07
      1.600650e+07
      1.600650e+07
    
    
      mean
      9.132168e-03
      9.078916e-01
      8.118966e-03
      9.151140e-01
      3.054186e-02
      8.629321e+01
    
    
      std
      1.834475e-03
      1.536037e-01
      9.883147e-04
      1.327628e-01
      2.184211e-02
      6.117148e+00
    
    
      min
      3.180516e-03
      2.238724e-01
      6.309268e-03
      3.990729e-01
      2.515367e-03
      7.418481e+01
    
    
      25%
      7.909074e-03
      7.631535e-01
      8.528570e-03
      7.777047e-01
      1.229331e-02
      8.759052e+01
    
    
      50%
      9.023006e-03
      8.989664e-01
      8.878669e-03
      8.652633e-01
      2.499229e-02
      8.987257e+01
    
    
      75%
      1.024940e-02
      9.479852e-01
      9.210394e-03
      8.969558e-01
      4.568314e-02
      9.190556e+01
    
    
      max
      1.743610e-02
      1.264299e+00
      1.143475e-02
      9.888741e-01
      1.297113e-01
      1.167986e+02



In [42]:

    
mc_x[accept_mc].describe()









    Out[42]:






  
    
      
      EtaWidth
      R9
      SigmaIeIe
      S4
      PhiWidth
      mass
    
  
  
    
      count
      9.108999e+06
      9.108999e+06
      9.108999e+06
      9.108999e+06
      9.108999e+06
      9.108999e+06
    
    
      mean
      8.740167e-03
      8.732899e-01
      8.335227e-03
      8.667808e-01
      2.976822e-02
      8.386697e+01
    
    
      std
      1.738866e-03
      1.408023e-01
      8.353423e-04
      1.046282e-01
      2.086873e-02
      7.714291e+00
    
    
      min
      3.494020e-03
      2.239490e-01
      6.309276e-03
      3.990765e-01
      1.274645e-06
      7.418497e+01
    
    
      25%
      7.538203e-03
      7.720036e-01
      8.521758e-03
      7.826572e-01
      1.150721e-02
      8.754033e+01
    
    
      50%
      8.561344e-03
      9.062228e-01
      8.873915e-03
      8.691059e-01
      2.228022e-02
      8.985310e+01
    
    
      75%
      9.803479e-03
      9.537584e-01
      9.200254e-03
      8.998990e-01
      4.241024e-02
      9.189629e+01
    
    
      max
      1.743612e-02
      1.263729e+00
      1.143472e-02
      9.940255e-01
      1.297105e-01
      1.167986e+02



In [43]:

    
thr_up = q95+thr*iqr
thr_down = q5-thr*iqr



In [44]:

    
thr_up









    Out[44]:





EtaWidth       0.017436
R9             1.264417
SigmaIeIe      0.011435
S4             1.110445
PhiWidth       0.129713
mass         116.798608
dtype: float64



In [45]:

    
thr_down









    Out[45]:





EtaWidth      0.003160
R9            0.223861
SigmaIeIe     0.006309
S4            0.399069
PhiWidth     -0.043039
mass         74.184792
dtype: float64



In [46]:

    
thr_up['mass'] = 110.
thr_down['mass'] = 70.
# thr_down['Pt'] = 25.
# thr_up['Pt'] = 100.



In [47]:

    
thr_up.to_hdf('cleaning_zee_m_barrel.hd5','thr_up',mode='w')
thr_down.to_hdf('cleaning_zee_m_barrel.hd5','thr_down',mode='a')



In [ ]:



In [48]:

    
#data_x = data_x[accept_data]
mc_x = mc_x[accept_mc]

data_c = data_c[accept_data]
mc_c = mc_c[accept_mc]

data_w = data_w[accept_data]
mc_w = mc_w[accept_mc]


data_x,data_c,mc_x,mc_c,scaler_x,scaler_c = preprocessing.transform(data_x,data_c,mc_x,mc_c,FEAT_TRANSFORM)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-48-02d976d7f8c4> in <module>()
      5 mc_c = mc_c[accept_mc]
      6 
----> 7 data_w = data_w[accept_data]
      8 mc_w = mc_w[accept_mc]
      9 

NameError: name 'data_w' is not defined



In [49]:

    
for ix in range(len(x_names)):
    plotting.plot_hists(data_x[:,0,ix],mc_x[:,0,ix],bins=100)#,range=[-3,3])
    plt.xlabel(x_names[ix])
    plt.show()

for ic in range(len(c_names)):
    plotting.plot_hists(data_c[:,0,ic],mc_c[:,0,ic],bins=100)#,range=[-3,3])
    plt.xlabel(c_names[ic])
    plt.show()









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-49-0d8debaf1b2f> in <module>()
      1 for ix in range(len(x_names)):
----> 2     plotting.plot_hists(data_x[:,0,ix],mc_x[:,0,ix],bins=100)#,range=[-3,3])
      3     plt.xlabel(x_names[ix])
      4     plt.show()
      5 

/swshare/anaconda/lib/python2.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

/swshare/anaconda/lib/python2.7/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

/swshare/anaconda/lib/python2.7/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1346         """Return the cached item, item represents a label indexer."""
   1347         cache = self._item_cache
-> 1348         res = cache.get(item)
   1349         if res is None:
   1350             values = self._data.get(item)

TypeError: unhashable type



In [ ]:

	EtaWidth	R9	SigmaIeIe	S4	PhiWidth	mass
count	1.600650e+07	1.600650e+07	1.600650e+07	1.600650e+07	1.600650e+07	1.600650e+07
mean	9.132168e-03	9.078916e-01	8.118966e-03	9.151140e-01	3.054186e-02	8.629321e+01
std	1.834475e-03	1.536037e-01	9.883147e-04	1.327628e-01	2.184211e-02	6.117148e+00
min	3.180516e-03	2.238724e-01	6.309268e-03	3.990729e-01	2.515367e-03	7.418481e+01
25%	7.909074e-03	7.631535e-01	8.528570e-03	7.777047e-01	1.229331e-02	8.759052e+01
50%	9.023006e-03	8.989664e-01	8.878669e-03	8.652633e-01	2.499229e-02	8.987257e+01
75%	1.024940e-02	9.479852e-01	9.210394e-03	8.969558e-01	4.568314e-02	9.190556e+01
max	1.743610e-02	1.264299e+00	1.143475e-02	9.888741e-01	1.297113e-01	1.167986e+02

	EtaWidth	R9	SigmaIeIe	S4	PhiWidth	mass
count	9.108999e+06	9.108999e+06	9.108999e+06	9.108999e+06	9.108999e+06	9.108999e+06
mean	8.740167e-03	8.732899e-01	8.335227e-03	8.667808e-01	2.976822e-02	8.386697e+01
std	1.738866e-03	1.408023e-01	8.353423e-04	1.046282e-01	2.086873e-02	7.714291e+00
min	3.494020e-03	2.239490e-01	6.309276e-03	3.990765e-01	1.274645e-06	7.418497e+01
25%	7.538203e-03	7.720036e-01	8.521758e-03	7.826572e-01	1.150721e-02	8.754033e+01
50%	8.561344e-03	9.062228e-01	8.873915e-03	8.691059e-01	2.228022e-02	8.985310e+01
75%	9.803479e-03	9.537584e-01	9.200254e-03	8.998990e-01	4.241024e-02	9.189629e+01
max	1.743612e-02	1.263729e+00	1.143472e-02	9.940255e-01	1.297105e-01	1.167986e+02