In [1]:
import sys
sys.path.append("..")

In [2]:
import numpy as np
import pickle
from recnn.preprocessing import load_from_pickle

W vs QCD

The original splits were made as 180k for training and 20k for test.

We ended up rebalancing the splits as 100k for training and 100k for test. This rebalanced split is produced in the last cell of 03-preprocessing.


In [3]:
# Build the anti-kt train/test pickles for W vs QCD.
# For each class, the first N_TRAIN_PER_CLASS entries go to the training set and
# the following N_TEST_PER_CLASS entries go to the test set.
# Labels: 0 = background (QCD file), 1 = signal (W file).
N_TRAIN_PER_CLASS = 90000
N_TEST_PER_CLASS = 10000
N_PER_CLASS = N_TRAIN_PER_CLASS + N_TEST_PER_CLASS

# NOTE(review): the input paths are absolute and machine-specific, while the
# outputs are relative to the notebook directory -- confirm whether both are
# meant to point at the same data folder.
background = load_from_pickle("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-qcd.pickle", N_PER_CLASS)
signal = load_from_pickle("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-w.pickle", N_PER_CLASS)

# Explicit indexing (rather than slicing) keeps the IndexError if either input
# holds fewer than N_PER_CLASS entries, instead of silently writing a short split.
# Ordering inside each split: all background entries first, then all signal entries.
X_train = [background[i] for i in range(N_TRAIN_PER_CLASS)]
X_train += [signal[i] for i in range(N_TRAIN_PER_CLASS)]
y_train = [0] * N_TRAIN_PER_CLASS + [1] * N_TRAIN_PER_CLASS

X_test = [background[i] for i in range(N_TRAIN_PER_CLASS, N_PER_CLASS)]
X_test += [signal[i] for i in range(N_TRAIN_PER_CLASS, N_PER_CLASS)]
y_test = [0] * N_TEST_PER_CLASS + [1] * N_TEST_PER_CLASS

# `with` guarantees the output files are closed even if pickling fails midway.
with open("../data/w-vs-qcd/anti-kt/antikt-train.pickle", "wb") as fd:
    pickle.dump((X_train, y_train), fd, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/w-vs-qcd/anti-kt/antikt-test.pickle", "wb") as fd:
    pickle.dump((X_test, y_test), fd, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# soft
# Build the "soft" anti-kt train/test pickles for W vs QCD.
# For each class, the first N_TRAIN_PER_CLASS entries go to the training set and
# the following N_TEST_PER_CLASS entries go to the test set.
# Labels: 0 = background (QCD file), 1 = signal (W file).
N_TRAIN_PER_CLASS = 90000
N_TEST_PER_CLASS = 10000
N_PER_CLASS = N_TRAIN_PER_CLASS + N_TEST_PER_CLASS

# NOTE(review): the input paths are absolute and machine-specific, while the
# outputs are relative to the notebook directory -- confirm whether both are
# meant to point at the same data folder.
background = load_from_pickle("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-soft-qcd.pickle", N_PER_CLASS)
signal = load_from_pickle("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-soft-w.pickle", N_PER_CLASS)

# Explicit indexing (rather than slicing) keeps the IndexError if either input
# holds fewer than N_PER_CLASS entries, instead of silently writing a short split.
# Ordering inside each split: all background entries first, then all signal entries.
X_train = [background[i] for i in range(N_TRAIN_PER_CLASS)]
X_train += [signal[i] for i in range(N_TRAIN_PER_CLASS)]
y_train = [0] * N_TRAIN_PER_CLASS + [1] * N_TRAIN_PER_CLASS

X_test = [background[i] for i in range(N_TRAIN_PER_CLASS, N_PER_CLASS)]
X_test += [signal[i] for i in range(N_TRAIN_PER_CLASS, N_PER_CLASS)]
y_test = [0] * N_TEST_PER_CLASS + [1] * N_TEST_PER_CLASS

# `with` guarantees the output files are closed even if pickling fails midway.
with open("../data/w-vs-qcd/anti-kt/antikt-soft-train.pickle", "wb") as fd:
    pickle.dump((X_train, y_train), fd, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/w-vs-qcd/anti-kt/antikt-soft-test.pickle", "wb") as fd:
    pickle.dump((X_test, y_test), fd, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# delphes data
# Build the Delphes anti-kt train/test pickles for W vs QCD.
# For each class, the first N_TRAIN_PER_CLASS entries go to the training set and
# the following N_TEST_PER_CLASS entries go to the test set.
# Labels: 0 = background (QCD file), 1 = signal (W file).
N_TRAIN_PER_CLASS = 90000
N_TEST_PER_CLASS = 10000
N_PER_CLASS = N_TRAIN_PER_CLASS + N_TEST_PER_CLASS

# NOTE(review): the input paths are absolute and machine-specific, while the
# outputs are relative to the notebook directory -- confirm whether both are
# meant to point at the same data folder.
background = load_from_pickle("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-qcd-delphes.pickle", N_PER_CLASS)
signal = load_from_pickle("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-w-delphes.pickle", N_PER_CLASS)

# Explicit indexing (rather than slicing) keeps the IndexError if either input
# holds fewer than N_PER_CLASS entries, instead of silently writing a short split.
# Ordering inside each split: all background entries first, then all signal entries.
X_train = [background[i] for i in range(N_TRAIN_PER_CLASS)]
X_train += [signal[i] for i in range(N_TRAIN_PER_CLASS)]
y_train = [0] * N_TRAIN_PER_CLASS + [1] * N_TRAIN_PER_CLASS

X_test = [background[i] for i in range(N_TRAIN_PER_CLASS, N_PER_CLASS)]
X_test += [signal[i] for i in range(N_TRAIN_PER_CLASS, N_PER_CLASS)]
y_test = [0] * N_TEST_PER_CLASS + [1] * N_TEST_PER_CLASS

# `with` guarantees the output files are closed even if pickling fails midway.
with open("../data/w-vs-qcd/anti-kt/antikt-delphes-train.pickle", "wb") as fd:
    pickle.dump((X_train, y_train), fd, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/w-vs-qcd/anti-kt/antikt-delphes-test.pickle", "wb") as fd:
    pickle.dump((X_test, y_test), fd, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# images data
# Build the image train/test pickles for W vs QCD.
# This cell uses the rebalanced split: per class, the first N_TRAIN_PER_CLASS
# entries go to training and the remaining N_TEST_PER_CLASS entries go to test.
# Labels: 0 = background (QCD file), 1 = signal (W file).
N_TRAIN_PER_CLASS = 50000
N_TEST_PER_CLASS = 50000
N_PER_CLASS = N_TRAIN_PER_CLASS + N_TEST_PER_CLASS

# NOTE(review): the input paths are absolute and machine-specific, while the
# outputs are relative to the notebook directory -- confirm whether both are
# meant to point at the same data folder.
background = load_from_pickle("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/images-qcd.pickle", N_PER_CLASS)
signal = load_from_pickle("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/images-w.pickle", N_PER_CLASS)

# Explicit indexing (rather than slicing) keeps the IndexError if either input
# holds fewer than N_PER_CLASS entries, instead of silently writing a short split.
# Ordering inside each split: all background entries first, then all signal entries.
X_train = [background[i] for i in range(N_TRAIN_PER_CLASS)]
X_train += [signal[i] for i in range(N_TRAIN_PER_CLASS)]
y_train = [0] * N_TRAIN_PER_CLASS + [1] * N_TRAIN_PER_CLASS

X_test = [background[i] for i in range(N_TRAIN_PER_CLASS, N_PER_CLASS)]
X_test += [signal[i] for i in range(N_TRAIN_PER_CLASS, N_PER_CLASS)]
y_test = [0] * N_TEST_PER_CLASS + [1] * N_TEST_PER_CLASS

# These files are written with pickle protocol 2 (unlike the anti-kt cells,
# which use HIGHEST_PROTOCOL). `with` guarantees the files are closed even if
# pickling fails midway.
with open("../data/w-vs-qcd/anti-kt/images-train.pickle", "wb") as fd:
    pickle.dump((X_train, y_train), fd, protocol=2)
with open("../data/w-vs-qcd/anti-kt/images-test.pickle", "wb") as fd:
    pickle.dump((X_test, y_test), fd, protocol=2)


(100000, 100000)

In [4]:
# event-level data
# Stream events from the background and signal pickle files into train/test
# files without loading everything in memory. Each input file is read as a
# sequence of consecutively pickled objects (one pickle.load per event); each
# output file receives alternating (event, 0) background and (event, 1) signal
# records, written with pickle protocol 2.
N_TRAIN_EVENTS_PER_CLASS = 40000
N_TEST_EVENTS_PER_CLASS = 10000

background_path = "/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-qcd-event.pickle"
signal_path = "/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-w-event.pickle"
train_path = "../data/w-vs-qcd/anti-kt/antikt-event-train.pickle"
test_path = "../data/w-vs-qcd/anti-kt/antikt-event-test.pickle"

# Delphes variant -- swap in these paths to process the Delphes event files:
# background_path = "/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-delphes-qcd-event.pickle"
# signal_path = "/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/w-vs-qcd/anti-kt/antikt-delphes-w-event.pickle"
# train_path = "../data/w-vs-qcd/anti-kt/antikt-delphes-event-train.pickle"
# test_path = "../data/w-vs-qcd/anti-kt/antikt-delphes-event-test.pickle"

# The input handles stay open across both loops so that the test events are
# the ones that follow the training events in each stream. `with` closes every
# handle (inputs included) even if a load/dump raises, e.g. EOFError when an
# input holds fewer events than requested.
with open(background_path, "rb") as fd_background, open(signal_path, "rb") as fd_signal:
    with open(train_path, "wb") as fd_train:
        for i in range(N_TRAIN_EVENTS_PER_CLASS):
            event = pickle.load(fd_background)
            pickle.dump((event, 0), fd_train, protocol=2)
            event = pickle.load(fd_signal)
            pickle.dump((event, 1), fd_train, protocol=2)

    with open(test_path, "wb") as fd_test:
        for i in range(N_TEST_EVENTS_PER_CLASS):
            event = pickle.load(fd_background)
            pickle.dump((event, 0), fd_test, protocol=2)
            event = pickle.load(fd_signal)
            pickle.dump((event, 1), fd_test, protocol=2)