Data / MC cluster shape corrections using $Z\rightarrow ee$ sample.


In [1]:
import GAN.models as models
import GAN.cms_datasets as cms
import GAN.plotting as plotting
import GAN.preprocessing as preprocessing
import GAN.base as base


Using TensorFlow backend.

Configuration parameters


In [2]:
import GAN.utils as utils

# reload(utils)

class Parameters(utils.Parameters):
    
    # dataset to be loaded
    load_datasets=utils.param(["moriond_v9","abs(ScEta) < 1.5"])

    c_names = utils.param(['Phi','ScEta','Pt','rho','run'])
    x_names = utils.param(['EtaWidth','R9','SigmaIeIe','S4','PhiWidth','CovarianceIetaIphi',
                           'CovarianceIphiIphi'])
    
    # generate variables in MC to be distributed like in data
    sample_from_data = ['run']
    
    # MC reweighting
    reweight = ['rewei_zee_5d_barrel_m_cleaning_xgb']
    mcweight = utils.param('') # if empty ignore original weight
    use_abs_weight = utils.param(True) # negative weights are notoriously nasty

    # cleaning cuts
    cleaning = utils.param('cleaning_zee_m_barrel.hd5')
    
    # input features transformation
    feat_transform = utils.param('minmax')
    global_match = utils.param(False) 
    
    # generator and discriminator (aka as 'critic' in wGAN)
    g_opts=utils.param(dict(name="G_512to64x8",
                            kernel_sizes=[64,64,128,128,256,256,512,512],
                            do_weight_reg=1e-5,do_last_l1reg=1e-5
                            ))
    pretrain_g = utils.param(False)
    d_opts=utils.param(dict(name="D_1024to32x6",kernel_sizes=[1024,512,128,64,32],do_bn=False,
                        do_dropout=False,hidden_activation="relu",
                        clip_weights=2.e-2,activation=None)) # weights clipping and no actication
    # optimizers
    dm_opts=utils.param(dict(optimizer="RMSprop",opt_kwargs=dict(lr=1e-5,decay=1e-6)))
    am_opts=utils.param(dict(optimizer="RMSprop",opt_kwargs=dict(lr=1e-5,decay=1e-6)))
        
    loss = utils.param("wgan_loss") # use WGAN loss 
    gan_targets = utils.param('gan_targets_hinge') # hinge targets are 1, -1 instead of 0, 1
    schedule = utils.param([0]*5+[1]*1) # number of critic iterations per generator iterations

    # schedule training
    epochs=utils.param(200)
    batch_size=utils.param(4096)
    plot_every=utils.param(10)
    frac_data=utils.param(10) # fraction of data to use
    
    monitor_dir = utils.param('log') # folder for logging
    
    batch = utils.param(False) # are we running in batch?

class MyApp(utils.MyApp):
    classes = utils.List([Parameters])

# Read all parameters above from command line. 
# Note: names are all converted to be all capital
notebook_parameters = Parameters(MyApp()).get_params()

# copy parameters to global scope
globals().update(notebook_parameters)
DM_OPTS.update( {"loss":LOSS} )
AM_OPTS.update( {"loss":LOSS} )


plotting.batch = BATCH 

notebook_parameters


Out[2]:
{'AM_OPTS': {'loss': 'wgan_loss',
  'opt_kwargs': {'decay': 1e-06, 'lr': 1e-05},
  'optimizer': 'RMSprop'},
 'BATCH': False,
 'BATCH_SIZE': 4096,
 'CLEANING': 'cleaning_zee_m_barrel.hd5',
 'C_NAMES': ['Phi', 'ScEta', 'Pt', 'rho', 'run'],
 'DM_OPTS': {'loss': 'wgan_loss',
  'opt_kwargs': {'decay': 1e-06, 'lr': 1e-05},
  'optimizer': 'RMSprop'},
 'D_OPTS': {'activation': None,
  'clip_weights': 0.02,
  'do_bn': False,
  'kernel_sizes': [1024, 512, 128, 64, 32],
  'name': 'D_1024to32x6'},
 'EPOCHS': 200,
 'FEAT_TRANSFORM': 'gaus',
 'FRAC_DATA': 10,
 'GAN_TARGETS': 'gan_targets_hinge',
 'GLOBAL_MATCH': True,
 'G_OPTS': {'do_last_l1reg': 1e-06,
  'do_weight_reg': 1e-05,
  'kernel_sizes': [64, 64, 128, 128, 256, 256, 512, 512],
  'name': 'G_512to64x8'},
 'LOAD_DATASETS': ['moriond_v9', 'abs(ScEta) < 1.5'],
 'LOSS': 'wgan_loss',
 'MCWEIGHT': '',
 'MONITOR_DIR': 'log',
 'PLOT_EVERY': 10,
 'PRETRAIN_G': False,
 'REWEIGHT': ['rewei_zee_5d_barrel_m_cleaning_xgb'],
 'SAMPLE_FROM_DATA': ['run'],
 'SCHEDULE': [0, 0, 0, 0, 0, 1],
 'USE_ABS_WEIGHT': True,
 'X_NAMES': ['EtaWidth',
  'R9',
  'SigmaIeIe',
  'S4',
  'PhiWidth',
  'CovarianceIetaIphi',
  'CovarianceIphiIphi']}

Load datasets

Apply cleaning and reweight MC to match data


In [3]:
data,mc = cms.load_zee(*LOAD_DATASETS)

In [4]:
data.columns


Out[4]:
Index(['index', 'run', 'rho', 'nvtx', 'mass', 'weight', 'SigMoM', 'Pt',
       'ScEta', 'Phi', 'R9', 'S4', 'SigmaIeIe', 'EtaWidth', 'PhiWidth',
       'CovarianceIphiIphi', 'SigmaRR', 'ScEnergy', 'CovarianceIetaIphi',
       'PhoIso03', 'ChIso03', 'ChIso03worst', 'ScPreshowerEnergy', 'PhoIDMVA',
       'SigEOverE', 'run_quantile'],
      dtype='object')

In [5]:
mc.columns


Out[5]:
Index(['index', 'run', 'rho', 'nvtx', 'mass', 'weight', 'SigMoM', 'Pt',
       'ScEta', 'Phi', 'R9', 'S4', 'SigmaIeIe', 'EtaWidth', 'PhiWidth',
       'CovarianceIphiIphi', 'SigmaRR', 'ScEnergy', 'CovarianceIetaIphi',
       'PhoIso03', 'ChIso03', 'ChIso03worst', 'ScPreshowerEnergy', 'PhoIDMVA',
       'SigEOverE'],
      dtype='object')

Cleaning


In [7]:
if not CLEANING is None:
    print('cleaning data and mc')
    thr_up = pd.read_hdf(CLEANING,'thr_up')
    thr_down = pd.read_hdf(CLEANING,'thr_down')
    nevts_data = data.shape[0]
    nevts_mc = mc.shape[0]
    data = data[ ((data[thr_down.index] >= thr_down) & (data[thr_up.index] <= thr_up)).all(axis=1) ]
    mc = mc[ ((mc[thr_down.index] >= thr_down) & (mc[thr_up.index] <= thr_up)).all(axis=1) ]
    print('cleaning eff (data,mc): %1.2f % 1.2f' % (  data.shape[0] / nevts_data, mc.shape[0] / nevts_mc  ))


cleaning data and mc
cleaning eff (data,mc): 0.91  0.95

Generate extra variables for MC (eg run number) distributed like data


In [8]:
for col in SAMPLE_FROM_DATA:
    print('sampling', col)
    mc[col] = data[col].sample(mc.shape[0]).values


sampling run

Select target and conditional features


In [10]:
c_names = C_NAMES
x_names = X_NAMES

data_c = data[c_names]
data_x = data[x_names]

mc_c = mc[c_names]
mc_x = mc[x_names]

In [11]:
data_x.columns, data_x.shape, data_c.columns, data_c.shape


Out[11]:
(Index(['EtaWidth', 'R9', 'SigmaIeIe', 'S4', 'PhiWidth', 'CovarianceIetaIphi',
        'CovarianceIphiIphi'],
       dtype='object'),
 (16024468, 7),
 Index(['Phi', 'ScEta', 'Pt', 'rho', 'run'], dtype='object'),
 (16024468, 5))

In [12]:
data_x.columns, data_c.columns


Out[12]:
(Index(['EtaWidth', 'R9', 'SigmaIeIe', 'S4', 'PhiWidth', 'CovarianceIetaIphi',
        'CovarianceIphiIphi'],
       dtype='object'),
 Index(['Phi', 'ScEta', 'Pt', 'rho', 'run'], dtype='object'))

In [13]:
mc_x.columns, mc_c.columns


Out[13]:
(Index(['EtaWidth', 'R9', 'SigmaIeIe', 'S4', 'PhiWidth', 'CovarianceIetaIphi',
        'CovarianceIphiIphi'],
       dtype='object'),
 Index(['Phi', 'ScEta', 'Pt', 'rho', 'run'], dtype='object'))

Reweight MC


In [14]:
# reload(preprocessing)
# reload(utils)

# iniliatlize weights to default
if MCWEIGHT == '':
    mc_w = np.ones(mc_x.shape[0])
else:
    mc_w = mc[MCWEIGHT].values

print(mc_w[:10])
    
# take care of negative weights
if USE_ABS_WEIGHT:
    mc_w = np.abs(mc_w)

# reweighting
if not REWEIGHT is None:
    for fil in REWEIGHT:
        # apply weights from n-dimnesional histograms
        if 'npy' in fil:
            info = np.load(fil)
            inputs = info[0]
            weights = info[1]
            bins = info[2:]
            # print(bins[1])
            print('weighting',inputs)
            mc_w *= preprocessing.reweight(mc,inputs,bins,weights,base=None)
        else:
            # or use a classifier
            clf = utils.read_pickle(fil,encoding='latin1')
            if hasattr(clf,'nthread'):
                clf.nthread = min(4,clf.nthread)
                clf.n_jobs = clf.nthread
            mc_w *= preprocessing.reweight_multidim(mc[clf.features_],clf)
            
data_w = np.ones(data_x.shape[0])


[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]

Features transformation


In [24]:
# reload(preprocessing)

if GLOBAL_MATCH:
    _,data_c,mc_x,mc_c,scaler_x,scaler_c = preprocessing.transform(None,data_c,mc_x,mc_c,FEAT_TRANSFORM)
    _,_,data_x,_,scaler_x_data,_ = preprocessing.transform(None,None,data_x,None,FEAT_TRANSFORM)
    
else:
    data_x,data_c,mc_x,mc_c,scaler_x,scaler_c = preprocessing.transform(data_x,data_c,mc_x,mc_c,FEAT_TRANSFORM)

In [25]:
data_x.shape,mc_x.shape


Out[25]:
((16024468, 1, 7), (9131160, 1, 7))

In [26]:
data_x[2].max(),mc_x[2].max()


Out[26]:
(1.2261241192018459, 1.3815607851977525)

In [27]:
print(mc_c.max(axis=0),mc_c.min(axis=0))
print(data_c.max(axis=0),data_c.min(axis=0))
print(mc_w.shape,mc_c.shape)


[[ 5.19933758  5.19933758  5.19933758  5.19933758  5.19933758]] [[-5.19933758 -5.19933758 -5.19933758 -5.19933758 -5.19933758]]
[[ 5.19933758  5.19933758  5.19933758  3.48306001  5.19933758]] [[-5.19933758 -5.19933758 -5.19933758 -5.19933758 -5.19933758]]
(9131160,) (9131160, 1, 5)

In [28]:
for ic in range(len(c_names)):
    plotting.plot_hists(data_c[:,0,ic],mc_c[:,0,ic],generated_w=mc_w,bins=100,range=[-2.0,2.0])
    plt.xlabel(c_names[ic])
    plt.show()

for ix in range(len(x_names)):
    plotting.plot_hists(data_x[:,0,ix],mc_x[:,0,ix],generated_w=mc_w,bins=100,range=[-3,3])
    plt.xlabel(x_names[ix])
    plt.show()


Prepare train and test sample


In [ ]:
nmax = min(data_x.shape[0]//FRAC_DATA,mc_x.shape[0])

data_x_train,data_x_test,data_c_train,data_c_test,data_w_train,data_w_test = cms.train_test_split(data_x[:nmax],data_c[:nmax],data_w[:nmax],test_size=0.1)
mc_x_train,mc_x_test,mc_c_train,mc_c_test,mc_w_train,mc_w_test = cms.train_test_split(mc_x[:nmax],mc_c[:nmax],mc_w[:nmax],test_size=0.1)

wscl = data_w[:nmax].sum() / mc_w[:nmax].sum()
mc_w_train *= wscl
mc_w_test *= wscl

In [ ]:
print(nmax,mc_w_train.sum()/data_w_train.sum())

Instantiate the model

Create the model and compile it.


In [15]:
# reload(models)

xz_shape = (1,len(x_names))
c_shape = (1,len(c_names))
gan = models.MyFFGAN( xz_shape, xz_shape, c_shape=c_shape,
                     g_opts=G_OPTS,
                     d_opts=D_OPTS,
                     dm_opts=DM_OPTS,
                     am_opts=AM_OPTS,
                     gan_targets=GAN_TARGETS
                    )

This will actually trigger the instantiation of the generator (if not done here, it will happen befor compilation).


In [16]:
gan.get_generator()


(1, 7)
Out[16]:
<keras.engine.training.Model at 0x2b2c3ec2f240>

Same as above for the discriminator.
Two discriminator models are created: one for the discriminator training phase and another for the generator trainin phase.
The difference between the two is that the second does not contain dropout layers.


In [17]:
gan.get_discriminator()


WeightClip 0.02
WeightClip 0.02
WeightClip 0.02
WeightClip 0.02
WeightClip 0.02
WeightClip 0.02
Out[17]:
(<keras.engine.training.Model at 0x2b2c3eecba20>,
 <keras.engine.training.Model at 0x2b2c3eecbc18>)

model compilation


In [18]:
gan.adversarial_compile(loss=LOSS,schedule=SCHEDULE)


wgan_loss

In [19]:
gan.get_generator().summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
G_512to64x8_c_input (InputLayer) (None, 1, 5)          0                                            
____________________________________________________________________________________________________
G_512to64x8_input (InputLayer)   (None, 1, 7)          0                                            
____________________________________________________________________________________________________
G_512to64x8_all_inputs (Concaten (None, 1, 12)         0           G_512to64x8_c_input[0][0]        
                                                                   G_512to64x8_input[0][0]          
____________________________________________________________________________________________________
G_512to64x8_up1_dense (Dense)    (None, 1, 512)        6656        G_512to64x8_all_inputs[0][0]     
____________________________________________________________________________________________________
G_512to64x8_up1_activ (PReLU)    (None, 1, 512)        512         G_512to64x8_up1_dense[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up2_dense (Dense)    (None, 1, 512)        262656      G_512to64x8_up1_activ[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up2_activ (PReLU)    (None, 1, 512)        512         G_512to64x8_up2_dense[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up3_dense (Dense)    (None, 1, 256)        131328      G_512to64x8_up2_activ[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up3_activ (PReLU)    (None, 1, 256)        256         G_512to64x8_up3_dense[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up4_dense (Dense)    (None, 1, 256)        65792       G_512to64x8_up3_activ[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up4_activ (PReLU)    (None, 1, 256)        256         G_512to64x8_up4_dense[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up5_dense (Dense)    (None, 1, 128)        32896       G_512to64x8_up4_activ[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up5_activ (PReLU)    (None, 1, 128)        128         G_512to64x8_up5_dense[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up6_dense (Dense)    (None, 1, 128)        16512       G_512to64x8_up5_activ[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up6_activ (PReLU)    (None, 1, 128)        128         G_512to64x8_up6_dense[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up7_dense (Dense)    (None, 1, 64)         8256        G_512to64x8_up6_activ[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up7_activ (PReLU)    (None, 1, 64)         64          G_512to64x8_up7_dense[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up8_dense (Dense)    (None, 1, 64)         4160        G_512to64x8_up7_activ[0][0]      
____________________________________________________________________________________________________
G_512to64x8_up8_activ (PReLU)    (None, 1, 64)         64          G_512to64x8_up8_dense[0][0]      
____________________________________________________________________________________________________
G_512to64x8_output (Dense)       (None, 1, 7)          455         G_512to64x8_up8_activ[0][0]      
____________________________________________________________________________________________________
G_512to64x8_add (Add)            (None, 1, 7)          0           G_512to64x8_input[0][0]          
                                                                   G_512to64x8_output[0][0]         
====================================================================================================
Total params: 530,631
Trainable params: 530,631
Non-trainable params: 0
____________________________________________________________________________________________________

In [20]:
gan.get_discriminator()[0].summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
D_1024to32x6_c_input (InputLayer (None, 1, 5)          0                                            
____________________________________________________________________________________________________
D_1024to32x6_input (InputLayer)  (None, 1, 7)          0                                            
____________________________________________________________________________________________________
D_1024to32x6_all_inputs (Concate (None, 1, 12)         0           D_1024to32x6_c_input[0][0]       
                                                                   D_1024to32x6_input[0][0]         
____________________________________________________________________________________________________
D_1024to32x6_down1_dense (Dense) (None, 1, 1024)       13312       D_1024to32x6_all_inputs[0][0]    
____________________________________________________________________________________________________
D_1024to32x6_down1_activ (Activa (None, 1, 1024)       0           D_1024to32x6_down1_dense[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_down2_dense (Dense) (None, 1, 512)        524800      D_1024to32x6_down1_activ[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_down2_activ (Activa (None, 1, 512)        0           D_1024to32x6_down2_dense[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_down3_dense (Dense) (None, 1, 128)        65664       D_1024to32x6_down2_activ[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_down3_activ (Activa (None, 1, 128)        0           D_1024to32x6_down3_dense[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_down4_dense (Dense) (None, 1, 64)         8256        D_1024to32x6_down3_activ[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_down4_activ (Activa (None, 1, 64)         0           D_1024to32x6_down4_dense[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_down5_dense (Dense) (None, 1, 32)         2080        D_1024to32x6_down4_activ[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_down5_activ (Activa (None, 1, 32)         0           D_1024to32x6_down5_dense[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_flat (Flatten)      (None, 32)            0           D_1024to32x6_down5_activ[0][0]   
____________________________________________________________________________________________________
D_1024to32x6_output (Dense)      (None, 1)             33          D_1024to32x6_flat[0][0]          
====================================================================================================
Total params: 614,145
Trainable params: 614,145
Non-trainable params: 0
____________________________________________________________________________________________________

In [21]:
gan.get_discriminator()[1].summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
D_1024to32x6_c_input (InputLayer (None, 1, 5)          0                                            
____________________________________________________________________________________________________
D_1024to32x6_input (InputLayer)  (None, 1, 7)          0                                            
____________________________________________________________________________________________________
D_1024to32x6_all_inputs (Concate (None, 1, 12)         0           D_1024to32x6_c_input[0][0]       
                                                                   D_1024to32x6_input[0][0]         
____________________________________________________________________________________________________
D_1024to32x6_down1_dense (Dense) (None, 1, 1024)       13312       D_1024to32x6_all_inputs[0][0]    
____________________________________________________________________________________________________
D_1024to32x6_down1_activ (Activa (None, 1, 1024)       0           D_1024to32x6_down1_dense[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_down2_dense (Dense) (None, 1, 512)        524800      D_1024to32x6_down1_activ[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_down2_activ (Activa (None, 1, 512)        0           D_1024to32x6_down2_dense[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_down3_dense (Dense) (None, 1, 128)        65664       D_1024to32x6_down2_activ[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_down3_activ (Activa (None, 1, 128)        0           D_1024to32x6_down3_dense[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_down4_dense (Dense) (None, 1, 64)         8256        D_1024to32x6_down3_activ[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_down4_activ (Activa (None, 1, 64)         0           D_1024to32x6_down4_dense[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_down5_dense (Dense) (None, 1, 32)         2080        D_1024to32x6_down4_activ[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_down5_activ (Activa (None, 1, 32)         0           D_1024to32x6_down5_dense[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_flat (Flatten)      (None, 32)            0           D_1024to32x6_down5_activ[1][0]   
____________________________________________________________________________________________________
D_1024to32x6_output (Dense)      (None, 1)             33          D_1024to32x6_flat[1][0]          
====================================================================================================
Total params: 614,145
Trainable params: 614,145
Non-trainable params: 0
____________________________________________________________________________________________________

In [22]:
# sanity check
set(gan.am[0].trainable_weights)-set(gan.am[1].trainable_weights)


Out[22]:
set()

Training

Everything is ready. We start training.


In [ ]:
from keras.optimizers import RMSprop

if PRETRAIN_G:
    generator = gan.get_generator()
    generator.compile(loss="mse",optimizer=RMSprop(lr=1.e-3))
    generator.fit( [mc_c_train,mc_x_train], [mc_c_train,mc_x_train], sample_weight=[mc_w_train,mc_w_train], epochs=1, batch_size=BATCH_SIZE  )

In [ ]:
# reload(base)

initial_epoch = 0
# if hasattr(gan.model,"history"):
#     initial_epoch = gan.model.history.epoch[-1] + 1

do = dict(
    x_train=data_x_train,
    z_train=mc_x_train,
    c_x_train=data_c_train,
    c_z_train=mc_c_train,
    w_x_train = data_w_train,
    w_z_train = mc_w_train,
          
    x_test=data_x_test,
    z_test=mc_x_test,
    c_x_test=data_c_test,
    c_z_test=mc_c_test,
    w_x_test = data_w_test,
    w_z_test = mc_w_test,
    
    n_epochs=EPOCHS + initial_epoch - 1,
    initial_epoch=initial_epoch,
    batch_size=BATCH_SIZE,
    plot_every=PLOT_EVERY,
    
    monitor_dir=MONITOR_DIR
)

exc = None
try:
    gan.adversarial_fit(**do)
except Exception as e:
    exc = e

In [ ]:
if not exc is None:
    print(exc)

In [ ]: