In [1]:
import sys
sys.path.append("..")
In [2]:
%load_ext cython
import pickle
import numpy as np
In [3]:
%%cython -f -+ -I/home/gilles/anaconda3/envs/hep/include --link-args=-Wl,-rpath,/home/gilles/anaconda3/envs/hep/lib -lm -L/home/gilles/anaconda3/envs/hep/lib -lfastjettools -lfastjet -lfastjetplugins -lsiscone_spherical -lsiscone
import numpy as np
cimport numpy as np
np.import_array()
from libcpp.pair cimport pair
from libcpp.vector cimport vector
cdef extern from "/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/notebooks/fj.cc":
void fj(vector[double]& a,
vector[vector[int]]& trees,
vector[vector[double]]& contents,
vector[double]& masses,
vector[double]& pts,
double R, int jet_algorithm)
cpdef cluster(np.ndarray[np.double_t, ndim=2, mode="c"] a,
R=0.3, jet_algorithm=0):
cdef vector[double] v
cdef vector[vector[int]] trees
cdef vector[vector[double]] contents
cdef vector[double] masses
cdef vector[double] pts
for value in a.ravel():
v.push_back(value)
fj(v, trees, contents, masses, pts, R=R, jet_algorithm=jet_algorithm)
jets = []
for tree, content, mass, pt in zip(trees, contents, masses, pts):
tree = np.array([e for e in tree]).reshape(-1, 2)
content = np.array([e for e in content]).reshape(-1, 4)
jets.append((tree, content, mass, pt))
return jets
In [4]:
import copy
from rootpy.vector import LorentzVector
from recnn.preprocessing import _pt
# Preprocessing algorithm:
# 1. j = the highest pt anti-kt jet (R=1)
# 2. run kt (R=0.3) on the constituents c of j, resulting in subjets sj1, sj2, ..., sjN
# 3. phi = sj1.phi(); for all c, do c.rotate_z(-phi)
# 4. bv = sj1.boost_vector(); bv.set_perp(0); for all c, do c.boost(-bv)
# 5. deltaz = sj1.pz - sj2.pz; deltay = sj1.py - sj2.py; alpha = -atan2(deltaz, deltay); for all c, do c.rotate_x(alpha)
# 6. if sj3.pz < 0: for all c, do c.set_pz(-c.pz)
# 7. finally recluster all transformed constituents c into a single jet (using kt or anti-kt? r?)
def preprocess(jet, output="kt", colinear_splits=0, trimming=0.0):
jet = copy.deepcopy(jet)
constituents = jet["content"][jet["tree"][:, 0] == -1]
for i in range(colinear_splits):
#j = np.random.randint(len(constituents))
j = np.argmax([_pt(c) for c in constituents])
v = LorentzVector(constituents[j])
eps = np.random.rand()
p1 = LorentzVector()
p2 = LorentzVector()
p1.set_pt_eta_phi_m(v.pt() * eps, v.eta(), v.phi(), v.m() * eps ** 0.5)
p2.set_pt_eta_phi_m(v.pt() * (1. - eps), v.eta(), v.phi(), 0.0)
constituents[j][0] = p1.px
constituents[j][1] = p1.py
constituents[j][2] = p1.pz
constituents[j][3] = p1.e
constituents = np.vstack([constituents,
np.array([[p2.px, p2.py, p2.pz, p2.e]])])
# run kt (R=0.3) on the constituents c of j, resulting in subjets sj1, sj2, ..., sjN
subjets = cluster(constituents, R=0.3, jet_algorithm=0)
# trimming
if trimming > 0.0:
subjets = [(tree, content, mass, pt) for tree, content, mass, pt in subjets if pt > trimming * jet["pt"]]
else:
subjets = [(tree, content, mass, pt) for tree, content, mass, pt in subjets]
# phi = sj1.phi()
# for all c, do c.rotate_z(-phi)
v = subjets[0][1][0]
v = LorentzVector(v)
phi = v.phi()
for _, content, _, _ in subjets:
for i in range(len(content)):
v = LorentzVector(content[i])
v.rotate_z(-phi)
content[i, 0] = v[0]
content[i, 1] = v[1]
content[i, 2] = v[2]
content[i, 3] = v[3]
# bv = sj1.boost_vector()
# bv.set_perp(0)
# for all c, do c.boost(-bv)
v = subjets[0][1][0]
v = LorentzVector(v)
bv = v.boost_vector()
bv.set_perp(0)
for _, content, _, _ in subjets:
for i in range(len(content)):
v = LorentzVector(content[i])
v.boost(-bv)
content[i, 0] = v[0]
content[i, 1] = v[1]
content[i, 2] = v[2]
content[i, 3] = v[3]
# deltaz = sj1.pz - sj2.pz
# deltay = sj1.py - sj2.py
# alpha = -atan2(deltaz, deltay)
# for all c, do c.rotate_x(alpha)
if len(subjets) >= 2:
deltaz = subjets[0][1][0, 2] - subjets[1][1][0, 2]
deltay = subjets[0][1][0, 1] - subjets[1][1][0, 1]
alpha = -np.arctan2(deltaz, deltay)
for _, content, _, _ in subjets:
for i in range(len(content)):
v = LorentzVector(content[i])
v.rotate_x(alpha)
content[i, 0] = v[0]
content[i, 1] = v[1]
content[i, 2] = v[2]
content[i, 3] = v[3]
# if sj3.pz < 0: for all c, do c.set_pz(-c.pz)
if len(subjets) >= 3 and subjets[2][1][0, 2] < 0:
for _, content, _, _ in subjets:
for i in range(len(content)):
content[i, 2] *= -1.0
# finally recluster all transformed constituents c into a single jet
constituents = []
for tree, content, _, _ in subjets:
constituents.append(content[tree[:, 0] == -1])
constituents = np.vstack(constituents)
if output == "anti-kt":
subjets = cluster(constituents, R=100., jet_algorithm=1)
elif output == "kt":
subjets = cluster(constituents, R=100., jet_algorithm=0)
elif output == "cambridge":
subjets = cluster(constituents, R=100., jet_algorithm=2)
else:
raise
jet["tree"] = subjets[0][0]
jet["content"] = subjets[0][1]
v = LorentzVector(jet["content"][0])
jet["phi"] = v.phi()
jet["eta"] = v.eta()
jet["energy"] = v.E()
jet["mass"] = v.m()
jet["pt"] = v.pt()
jet["root_id"] = 0
return jet
In [5]:
import sys
sys.path.append("..")
In [8]:
from recnn.preprocessing import randomize
from recnn.preprocessing import sequentialize_by_pt
f = "../data/w-vs-qcd/anti-kt/antikt-train.pickle-py27"
# f = "../data/w-vs-qcd/anti-kt/antikt-test.pickle-py27"
# f = "../data/w-vs-qcd/anti-kt/antikt-soft-train.pickle"
# f = "../data/w-vs-qcd/anti-kt/antikt-soft-test.pickle"
fd = open(f, "rb")
X, y = pickle.load(fd)
fd.close()
In [ ]:
# 1. anti-kt highest pt -> preprocessing -> anti-kt
X_ = []
for j in X:
X_.append(preprocess(j, output="anti-kt"))
fd = open("%s-anti-kt" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
In [9]:
# 2. anti-kt highest pt -> preprocessing -> kt
X_ = []
for j in X:
X_.append(preprocess(j, output="kt"))
fd = open("%s-kt" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
In [ ]:
# 3. anti-kt highest pt -> preprocessing -> random
X_ = []
for j in X:
X_.append(randomize(preprocess(j, output="anti-kt")))
fd = open("%s-random" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
In [ ]:
# 4. anti-kt highest pt -> preprocessing -> seq by pt
X_ = []
for j in X:
X_.append(sequentialize_by_pt(preprocess(j, output="anti-kt"), reverse=False))
fd = open("%s-seqpt" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
X_ = []
for j in X:
X_.append(sequentialize_by_pt(preprocess(j, output="anti-kt"), reverse=True))
fd = open("%s-seqpt-reversed" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
In [7]:
# 5. anti-kt highest pt -> preprocessing -> cambridge
X_ = []
for j in X:
X_.append(preprocess(j, output="cambridge"))
fd = open("%s-cambridge" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
In [9]:
# 6. anti-kt highest pt -> preprocessing -> kt-colinear1
X_ = []
for j in X:
X_.append(preprocess(j, output="kt", colinear_splits=10))
fd = open("%s-kt-colinear10-max" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
In [9]:
# Trimmed data
f = "../data/w-vs-qcd/anti-kt/antikt-train.pickle-py27"
# f = "../data/w-vs-qcd/anti-kt/antikt-test.pickle-py27"
fd = open(f, "rb")
X, y = pickle.load(fd)
fd.close()
# anti-kt highest pt -> preprocessing(trimming=0.05) -> kt
X_ = []
for j in X:
X_.append(preprocess(j, output="anti-kt", trimming=0.05))
fd = open("%s-anti-kt-trimmed" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
In [10]:
from recnn.preprocessing import sequentialize_by_pt
from recnn.preprocessing import randomize
# random trees, asc-pt, desc-pt
# delphes
#f = "../data/w-vs-qcd/anti-kt/antikt-delphes-train.pickle"
f = "../data/w-vs-qcd/anti-kt/antikt-delphes-test.pickle"
fd = open(f, "rb")
X, y = pickle.load(fd)
fd.close()
# anti-kt highest pt -> preprocessing -> kt
X_ = []
for j in X:
X_.append(randomize(preprocess(j, output="anti-kt"))
fd = open("%s-random" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
X_ = []
for j in X:
X_.append(sequentialize_by_pt(preprocess(j, output="anti-kt"), reverse=False))
fd = open("%s-seqpt" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
X_ = []
for j in X:
X_.append(sequentialize_by_pt(preprocess(j, output="anti-kt"), reverse=True))
fd = open("%s-seqpt-reversed" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()
In [16]:
# event-level
import pickle
# f = "../data/w-vs-qcd/anti-kt/antikt-event-train.pickle"
# f = "../data/w-vs-qcd/anti-kt/antikt-event-test.pickle"
# f = "../data/w-vs-qcd/anti-kt/antikt-delphes-event-train.pickle"
f = "../data/w-vs-qcd/anti-kt/antikt-delphes-event-test.pickle"
fd = open(f, "rb")
fd_out = open("%s-kt" % f, "wb")
for i in range(20000):
try:
event, y = pickle.load(fd)
jets = [(j["phi"], j["eta"], j["pt"], j["mass"], preprocess(j, output="kt")) for j in event]
pickle.dump((jets, y), fd_out, protocol=2)
except IndexError:
print(i)
fd.close()
fd_out.close()
In [9]:
fd = open("../data/w-vs-qcd/final/antikt-kt-test.pickle", "rb")
X, y = pickle.load(fd)
fd.close()
In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
plt.rcParams["figure.figsize"] = (6,6)
from rootpy.vector import LorentzVector
In [21]:
a = []
for j in X[10000:]:
constituents = j["content"][j["tree"][:, 0] == -1]
a.append(np.array([[LorentzVector(c).eta(),
LorentzVector(c).phi()] for c in constituents]))
a = np.vstack(a)
plt.hist2d(a[:, 0], a[:, 1], range=[(-2,2), (-2,2)],
bins=50, cmap="hsv", norm=LogNorm())
plt.show()
a = []
for j in X[:10000]:
constituents = j["content"][j["tree"][:, 0] == -1]
a.append(np.array([[LorentzVector(c).eta(),
LorentzVector(c).phi()] for c in constituents]))
a = np.vstack(a)
plt.hist2d(a[:, 0], a[:, 1], range=[(-2,2), (-2,2)],
bins=50, cmap="hsv", norm=LogNorm())
plt.show()
In [6]:
from sklearn.utils import check_random_state
# prefixes = ["antikt-antikt", "antikt-antikt-trimmed",
# "antikt-cambridge", "antikt-kt-delphes",
# "antikt-kt-images", "antikt-kt", "antikt-kt-trimmed",
# "antikt-random", "antikt-seqpt", "antikt-kt-soft",
# "antikt-kt-colinear1", "antikt-kt-colinear10",
# "antikt-antikt-delphes", "antikt-seqpt-delphes", "antikt-random-delphes",
# "antikt-cambridge-delphes", "antikt-seqpt-reversed", "antikt-seqpt-reversed-delphes", "antikt-kt-colinear1-max"]
prefixes = ["antikt-kt-colinear10-max"]
for prefix in prefixes:
print(prefix)
filename_train = "../data/w-vs-qcd/final/%s-train.pickle" % prefix
filename_test = "../data/w-vs-qcd/final/%s-test.pickle" % prefix
fd = open(filename_train, "rb")
X1, y1 = pickle.load(fd)
fd.close()
fd = open(filename_test, "rb")
X2, y2 = pickle.load(fd)
fd.close()
rng = check_random_state(1)
indices = rng.permutation(len(X1))
X_train = [X1[j] for j in indices[:100000]]
y_train = [y1[j] for j in indices[:100000]]
X_test = [X1[j] for j in indices[100000:]]
X_test.extend(X2)
y_test = [y1[j] for j in indices[100000:]]
y_test.extend(y2)
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))
fd = open(filename_train, "wb")
pickle.dump((X_train, y_train), fd, protocol=2)
fd.close()
fd = open(filename_test, "wb")
pickle.dump((X_test, y_test), fd, protocol=2)
fd.close()