In [1]:
import sys
sys.path.append("..")

In [2]:
%load_ext cython
import pickle
import numpy as np

In [3]:
%%cython -f -+ -I/home/gilles/anaconda3/envs/hep/include --link-args=-Wl,-rpath,/home/gilles/anaconda3/envs/hep/lib  -lm -L/home/gilles/anaconda3/envs/hep/lib -lfastjettools -lfastjet -lfastjetplugins -lsiscone_spherical -lsiscone
import numpy as np
cimport numpy as np
np.import_array()

from libcpp.pair cimport pair
from libcpp.vector cimport vector

cdef extern from "/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/notebooks/fj.cc":
    void fj(vector[double]& a, 
            vector[vector[int]]& trees, 
            vector[vector[double]]& contents, 
            vector[double]& masses, 
            vector[double]& pts, 
            double R, int jet_algorithm)
    
cpdef cluster(np.ndarray[np.double_t, ndim=2, mode="c"] a, 
              R=0.3, jet_algorithm=0):
    cdef vector[double] v
    cdef vector[vector[int]] trees
    cdef vector[vector[double]] contents
    cdef vector[double] masses
    cdef vector[double] pts 
    for value in a.ravel():
        v.push_back(value)
    
    fj(v, trees, contents, masses, pts, R=R, jet_algorithm=jet_algorithm)
    jets = []
    
    for tree, content, mass, pt in zip(trees, contents, masses, pts):
        tree = np.array([e for e in tree]).reshape(-1, 2)
        content = np.array([e for e in content]).reshape(-1, 4)
        jets.append((tree, content, mass, pt))
        
    return jets

In [4]:
import copy
from rootpy.vector import LorentzVector
from recnn.preprocessing import _pt

# Preprocessing algorithm:
# 1. j = the highest pt anti-kt jet (R=1)
# 2. run kt (R=0.3) on the constituents c of j, resulting in subjets sj1, sj2, ..., sjN
# 3. phi = sj1.phi(); for all c, do c.rotate_z(-phi)
# 4. bv = sj1.boost_vector(); bv.set_perp(0); for all c, do c.boost(-bv)
# 5. deltaz = sj1.pz - sj2.pz; deltay = sj1.py - sj2.py; alpha = -atan2(deltaz, deltay); for all c, do c.rotate_x(alpha)
# 6. if sj3.pz < 0: for all c, do c.set_pz(-c.pz)
# 7. finally recluster all transformed constituents c into a single jet (using kt or anti-kt? r?)

def preprocess(jet, output="kt", colinear_splits=0, trimming=0.0):
    jet = copy.deepcopy(jet)
    constituents = jet["content"][jet["tree"][:, 0] == -1] 
    
    for i in range(colinear_splits):
        #j = np.random.randint(len(constituents))
        j = np.argmax([_pt(c) for c in constituents])
        v = LorentzVector(constituents[j])
        
        eps = np.random.rand()
        
        p1 = LorentzVector()
        p2 = LorentzVector()
        p1.set_pt_eta_phi_m(v.pt() * eps, v.eta(), v.phi(), v.m() * eps ** 0.5)
        p2.set_pt_eta_phi_m(v.pt() * (1. - eps), v.eta(), v.phi(), 0.0)

        constituents[j][0] = p1.px
        constituents[j][1] = p1.py
        constituents[j][2] = p1.pz
        constituents[j][3] = p1.e
        
        constituents = np.vstack([constituents, 
                                  np.array([[p2.px, p2.py, p2.pz, p2.e]])])

    # run kt (R=0.3) on the constituents c of j, resulting in subjets sj1, sj2, ..., sjN
    subjets = cluster(constituents, R=0.3, jet_algorithm=0)
    
    # trimming
    if trimming > 0.0:
        subjets = [(tree, content, mass, pt) for tree, content, mass, pt in subjets if pt > trimming * jet["pt"]]
    else:
        subjets = [(tree, content, mass, pt) for tree, content, mass, pt in subjets]
    
    # phi = sj1.phi()
    # for all c, do c.rotate_z(-phi)
    v = subjets[0][1][0]
    v = LorentzVector(v)
    phi = v.phi()
    
    for _, content, _, _ in subjets:
        for i in range(len(content)):
            v = LorentzVector(content[i])
            v.rotate_z(-phi)
            content[i, 0] = v[0]
            content[i, 1] = v[1]
            content[i, 2] = v[2]
            content[i, 3] = v[3]
            
    # bv = sj1.boost_vector()
    # bv.set_perp(0)
    # for all c, do c.boost(-bv)
    v = subjets[0][1][0]
    v = LorentzVector(v)
    bv = v.boost_vector()
    bv.set_perp(0)
    
    for _, content, _, _ in subjets:        
        for i in range(len(content)):
            v = LorentzVector(content[i])
            v.boost(-bv)
            content[i, 0] = v[0]
            content[i, 1] = v[1]
            content[i, 2] = v[2]
            content[i, 3] = v[3]
    
    # deltaz = sj1.pz - sj2.pz
    # deltay = sj1.py - sj2.py
    # alpha = -atan2(deltaz, deltay)
    # for all c, do c.rotate_x(alpha)
    if len(subjets) >= 2:
        deltaz = subjets[0][1][0, 2] - subjets[1][1][0, 2]
        deltay = subjets[0][1][0, 1] - subjets[1][1][0, 1]
        alpha = -np.arctan2(deltaz, deltay)

        for _, content, _, _ in subjets:
            for i in range(len(content)):
                v = LorentzVector(content[i])
                v.rotate_x(alpha)
                content[i, 0] = v[0]
                content[i, 1] = v[1]
                content[i, 2] = v[2]
                content[i, 3] = v[3]
    
    # if sj3.pz < 0: for all c, do c.set_pz(-c.pz)
    if len(subjets) >= 3 and subjets[2][1][0, 2] < 0:
        for _, content, _, _ in subjets:
            for i in range(len(content)):
                content[i, 2] *= -1.0
                
    # finally recluster all transformed constituents c into a single jet 
    constituents = []
    
    for tree, content, _, _ in subjets:
        constituents.append(content[tree[:, 0] == -1])
        
    constituents = np.vstack(constituents)
    
    if output == "anti-kt":
        subjets = cluster(constituents, R=100., jet_algorithm=1)
    elif output == "kt":
        subjets = cluster(constituents, R=100., jet_algorithm=0)
    elif output == "cambridge":
        subjets = cluster(constituents, R=100., jet_algorithm=2)
    else:
        raise
    
    jet["tree"] = subjets[0][0]
    jet["content"] = subjets[0][1]
    
    v = LorentzVector(jet["content"][0])
    jet["phi"] = v.phi()
    jet["eta"] = v.eta()
    jet["energy"] = v.E()
    jet["mass"] = v.m()
    jet["pt"] = v.pt()
    jet["root_id"] = 0
    
    return jet

Convert data


In [5]:
import sys
sys.path.append("..")

In [8]:
from recnn.preprocessing import randomize
from recnn.preprocessing import sequentialize_by_pt

f = "../data/w-vs-qcd/anti-kt/antikt-train.pickle-py27"
# f = "../data/w-vs-qcd/anti-kt/antikt-test.pickle-py27"

# f = "../data/w-vs-qcd/anti-kt/antikt-soft-train.pickle"
# f = "../data/w-vs-qcd/anti-kt/antikt-soft-test.pickle"

fd = open(f, "rb")
X, y = pickle.load(fd)
fd.close()

In [ ]:
# 1. anti-kt highest pt -> preprocessing -> anti-kt
X_ = []

for j in X:
    X_.append(preprocess(j, output="anti-kt"))
    
fd = open("%s-anti-kt" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

In [9]:
# 2. anti-kt highest pt -> preprocessing -> kt
X_ = []

for j in X:
    X_.append(preprocess(j, output="kt"))
    
fd = open("%s-kt" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

In [ ]:
# 3. anti-kt highest pt -> preprocessing -> random
X_ = []

for j in X:
    X_.append(randomize(preprocess(j, output="anti-kt")))
    
fd = open("%s-random" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

In [ ]:
# 4. anti-kt highest pt -> preprocessing -> seq by pt
X_ = []

for j in X:
    X_.append(sequentialize_by_pt(preprocess(j, output="anti-kt"), reverse=False))
    
fd = open("%s-seqpt" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

X_ = []

for j in X:
    X_.append(sequentialize_by_pt(preprocess(j, output="anti-kt"), reverse=True))
    
fd = open("%s-seqpt-reversed" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

In [7]:
# 5. anti-kt highest pt -> preprocessing -> cambridge
X_ = []

for j in X:
    X_.append(preprocess(j, output="cambridge"))
    
fd = open("%s-cambridge" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()


5

In [9]:
# 6. anti-kt highest pt -> preprocessing -> kt-colinear1
X_ = []

for j in X:
    X_.append(preprocess(j, output="kt", colinear_splits=10))
    
fd = open("%s-kt-colinear10-max" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

In [9]:
# Trimmed data
f = "../data/w-vs-qcd/anti-kt/antikt-train.pickle-py27"
# f = "../data/w-vs-qcd/anti-kt/antikt-test.pickle-py27"

fd = open(f, "rb")
X, y = pickle.load(fd)
fd.close()

# anti-kt highest pt -> preprocessing(trimming=0.05) -> kt
X_ = []

for j in X:
    X_.append(preprocess(j, output="anti-kt", trimming=0.05))
    
fd = open("%s-anti-kt-trimmed" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()


2

In [10]:
from recnn.preprocessing import sequentialize_by_pt
from recnn.preprocessing import randomize

# random trees, asc-pt, desc-pt

# delphes
#f = "../data/w-vs-qcd/anti-kt/antikt-delphes-train.pickle"
f = "../data/w-vs-qcd/anti-kt/antikt-delphes-test.pickle"

fd = open(f, "rb")
X, y = pickle.load(fd)
fd.close()

# anti-kt highest pt -> preprocessing -> kt
X_ = []

for j in X:
    X_.append(randomize(preprocess(j, output="anti-kt"))
    
fd = open("%s-random" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

X_ = []

for j in X:
    X_.append(sequentialize_by_pt(preprocess(j, output="anti-kt"), reverse=False))
    
fd = open("%s-seqpt" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

X_ = []

for j in X:
    X_.append(sequentialize_by_pt(preprocess(j, output="anti-kt"), reverse=True))
    
fd = open("%s-seqpt-reversed" % f, "wb")
pickle.dump((X_, y), fd, protocol=2)
fd.close()

In [16]:
# event-level
import pickle

# f = "../data/w-vs-qcd/anti-kt/antikt-event-train.pickle"
# f = "../data/w-vs-qcd/anti-kt/antikt-event-test.pickle"
# f = "../data/w-vs-qcd/anti-kt/antikt-delphes-event-train.pickle"
f = "../data/w-vs-qcd/anti-kt/antikt-delphes-event-test.pickle"

fd = open(f, "rb")
fd_out = open("%s-kt" % f, "wb")

for i in range(20000):
    try:
        event, y = pickle.load(fd)
        jets = [(j["phi"], j["eta"], j["pt"], j["mass"], preprocess(j, output="kt")) for j in event]
        pickle.dump((jets, y), fd_out, protocol=2)
    except IndexError:
        print(i)

fd.close()
fd_out.close()

Checks


In [9]:
fd = open("../data/w-vs-qcd/final/antikt-kt-test.pickle", "rb")
X, y = pickle.load(fd)
fd.close()

In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

plt.rcParams["figure.figsize"] = (6,6)
from rootpy.vector import LorentzVector

In [21]:
a = []

for j in X[10000:]:
    constituents = j["content"][j["tree"][:, 0] == -1]
    a.append(np.array([[LorentzVector(c).eta(), 
                        LorentzVector(c).phi()] for c in constituents]))
    
a = np.vstack(a)

plt.hist2d(a[:, 0], a[:, 1], range=[(-2,2), (-2,2)], 
           bins=50, cmap="hsv", norm=LogNorm())
plt.show()

a = []

for j in X[:10000]:
    constituents = j["content"][j["tree"][:, 0] == -1]
    a.append(np.array([[LorentzVector(c).eta(), 
                        LorentzVector(c).phi()] for c in constituents]))
    
a = np.vstack(a)

plt.hist2d(a[:, 0], a[:, 1], range=[(-2,2), (-2,2)], 
           bins=50, cmap="hsv", norm=LogNorm())
plt.show()


Repartition into 100k train / 100k test


In [6]:
from sklearn.utils import check_random_state

# prefixes = ["antikt-antikt", "antikt-antikt-trimmed",
#             "antikt-cambridge", "antikt-kt-delphes",
#             "antikt-kt-images", "antikt-kt", "antikt-kt-trimmed",
#             "antikt-random", "antikt-seqpt", "antikt-kt-soft",
#             "antikt-kt-colinear1", "antikt-kt-colinear10",
#             "antikt-antikt-delphes", "antikt-seqpt-delphes", "antikt-random-delphes", 
#             "antikt-cambridge-delphes", "antikt-seqpt-reversed", "antikt-seqpt-reversed-delphes", "antikt-kt-colinear1-max"]

prefixes = ["antikt-kt-colinear10-max"]

for prefix in prefixes:
    print(prefix)

    filename_train = "../data/w-vs-qcd/final/%s-train.pickle" % prefix
    filename_test = "../data/w-vs-qcd/final/%s-test.pickle" % prefix

    fd = open(filename_train, "rb")
    X1, y1 = pickle.load(fd)
    fd.close()

    fd = open(filename_test, "rb")
    X2, y2 = pickle.load(fd)
    fd.close()

    rng = check_random_state(1)
    indices = rng.permutation(len(X1))

    X_train = [X1[j] for j in indices[:100000]]
    y_train = [y1[j] for j in indices[:100000]]
    X_test = [X1[j] for j in indices[100000:]]
    X_test.extend(X2)
    y_test = [y1[j] for j in indices[100000:]]
    y_test.extend(y2)

    print(len(X_train), len(y_train))
    print(len(X_test), len(y_test))

    fd = open(filename_train, "wb")
    pickle.dump((X_train, y_train), fd, protocol=2)
    fd.close()

    fd = open(filename_test, "wb")
    pickle.dump((X_test, y_test), fd, protocol=2)
    fd.close()


antikt-kt-colinear10-max
(100000, 100000)
(100000, 100000)