In [1]:
import sys
sys.path.append("..")

In [2]:
import numpy as np
import logging
import pickle

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
from sklearn.utils import check_random_state

from recnn.preprocessing import permute_by_pt
from recnn.preprocessing import extract
from recnn.recnn import grnn_transform_gated
from recnn.recnn import grnn_predict_gated

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,8)

In [15]:
# filename_train = "../data/w-vs-qcd/final/antikt-kt-train.pickle"
# filename_test = "../data/w-vs-qcd/final/antikt-kt-test.pickle"

filename_train = "../data/w-vs-qcd/final/antikt-antikt-train.pickle"
filename_test = "../data/w-vs-qcd/final/antikt-antikt-test.pickle"

rng = check_random_state(1)

# Make training data
print("Loading training data...")

fd = open(filename_train, "rb")
X, y = pickle.load(fd)
fd.close()
y = np.array(y)

print("\tfilename = %s" % filename_train)
print("\tX size = %d" % len(X))
print("\ty size = %d" % len(y))

# Preprocessing 
print("Preprocessing...")
X = [extract(permute_by_pt(jet)) for jet in X]
tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X]))

for jet in X:
    jet["content"] = tf.transform(jet["content"])

# Make test data 
print("Loading test data...")

fd = open(filename_test, "rb")
X, y = pickle.load(fd)
fd.close()
y = np.array(y)

print("\tfilename = %s" % filename_test)
print("\tX size = %d" % len(X))
print("\ty size = %d" % len(y))

# Preprocessing 
print("Preprocessing...")
X = [extract(permute_by_pt(jet)) for jet in X]

for jet in X:
    jet["content"] = tf.transform(jet["content"])


Loading training data...
	filename = ../data/w-vs-qcd/final/antikt-antikt-train.pickle
	X size = 100000
	y size = 100000
Preprocessing...
Loading test data...
	filename = ../data/w-vs-qcd/final/antikt-antikt-test.pickle
	X size = 100000
	y size = 100000
Preprocessing...

Visualization

Simple


In [16]:
# Loading model 
print("Loading model...")

#filename_model = "../models/jet-study-2/model-w-s-antikt-kt-1.pickle"
filename_model = "../models/jet-study-2/model-w-s-antikt-antikt-1.pickle"
fd = open(filename_model, "rb")
params = pickle.load(fd)
fd.close()

print("\tfilename = %s" % filename_model)


Loading model...
	filename = ../models/jet-study-2/model-w-s-antikt-antikt-1.pickle

In [5]:
from recnn.recnn import grnn_predict_simple
from sklearn.metrics import roc_auc_score
roc_auc_score(y, grnn_predict_simple(params, X))


Out[5]:
0.89237760716057901

In [6]:
import autograd as ag
import autograd.numpy as np
from recnn.recnn import log_loss, relu, sigmoid


def get_branch(tree, target, node_id, is_left):
    if target == node_id:
        return [(node_id, is_left)]
    elif tree[node_id, 0] == -1:
        return False
    else:
        left_result = get_branch(tree, target, tree[node_id, 0], True)
        right_result = get_branch(tree, target, tree[node_id, 1], False)
        
        if left_result:
            return [(node_id, is_left)] + left_result
        elif right_result:
            return [(node_id, is_left)] + right_result
        else:
            return False  


def grnn_embeddings(content, tree, params):
    embeddings = {}
    
    def _rec(node_id):
        u_k = relu(np.dot(params["W_u"], content[node_id].T).T + 
                   params["b_u"]).reshape(1, -1)
        
        if tree[node_id, 0] == -1:
            embeddings[node_id] = u_k
            return u_k 
        
        else:
            h_L = _rec(tree[node_id, 0])
            h_R = _rec(tree[node_id, 1])
            h = relu(np.dot(params["W_h"],
                            np.hstack((h_L, h_R, u_k)).T).T +
                     params["b_h"])
            embeddings[node_id] = h
            return h
        
    _rec(0)
    return embeddings


def make_dL_dh(content, tree, params, target):
    branch = get_branch(tree, target, 0, True)
    embeddings = grnn_embeddings(content, tree, params)
    
    def f(v):
        e = v
        
        for i, (node_id, is_left) in enumerate(branch[::-1]):
            if i == 0:
                e = v
                was_left = is_left
                
            elif i > 0:
                u_k = relu(np.dot(params["W_u"], content[node_id].T).T + 
                           params["b_u"]).reshape(1, -1)
                
                if was_left:
                    h_L = e
                    h_R = embeddings[tree[node_id, 1]]
                    
                else:
                    h_L = embeddings[tree[node_id, 0]]
                    h_R = e
                    
                h = relu(np.dot(params["W_h"], 
                                np.hstack((h_L, h_R, u_k)).T).T + 
                         params["b_h"])
                
                e = h
                was_left = is_left
                
        h = e
        h = relu(np.dot(params["W_clf"][0], h.T).T + params["b_clf"][0])
        h = relu(np.dot(params["W_clf"][1], h.T).T + params["b_clf"][1])
        h = sigmoid(np.dot(params["W_clf"][2], h.T).T + params["b_clf"][2])
        
        return h.ravel()
    
    df = ag.grad(f)
    
    return df


def make_dL_du(content, tree, params, target):
    branch = get_branch(tree, target, 0, True)
    embeddings = grnn_embeddings(content, tree, params)
    
    def f(v):
        e = v
        
        for i, (node_id, is_left) in enumerate(branch[::-1]):
            if i == 0:
                if tree[node_id, 0] == -1:
                    e = relu(np.dot(params["W_u"], v.T).T + 
                             params["b_u"]).reshape(1, -1)
                    
                else:
                    u_k = relu(np.dot(params["W_u"], v.T).T + 
                               params["b_u"]).reshape(1, -1)
                    h_L = embeddings[tree[node_id, 0]]
                    h_R = embeddings[tree[node_id, 1]]
                    h = relu(np.dot(params["W_h"], 
                                    np.hstack((h_L, h_R, u_k)).T).T + 
                             params["b_h"])
                    e = h
                
                was_left = is_left
                
            elif i > 0:
                u_k = relu(np.dot(params["W_u"], content[node_id].T).T + 
                           params["b_u"]).reshape(1, -1)
                
                if was_left:
                    h_L = e
                    h_R = embeddings[tree[node_id, 1]]
                    
                else:
                    h_L = embeddings[tree[node_id, 0]]
                    h_R = e
                    
                h = relu(np.dot(params["W_h"], 
                                np.hstack((h_L, h_R, u_k)).T).T + 
                         params["b_h"])
                
                e = h
                was_left = is_left
                
        h = e
        h = relu(np.dot(params["W_clf"][0], h.T).T + params["b_clf"][0])
        h = relu(np.dot(params["W_clf"][1], h.T).T + params["b_clf"][1])
        h = sigmoid(np.dot(params["W_clf"][2], h.T).T + params["b_clf"][2])

        return h.ravel()
    
    df = ag.grad(f)
        
    return df

In [11]:
from graphviz import Digraph

import matplotlib as mpl
import matplotlib.cm as cm


norm = mpl.colors.Normalize(vmin=0.001, vmax=2.0)
cmap = cm.viridis_r
m = cm.ScalarMappable(norm=norm, cmap=cmap)

def number_to_color(x):
    color = m.to_rgba(x)
    return "#%.2x%.2x%.2x%.2x" % (int(255*color[0]), 
                                  int(255*color[1]), 
                                  int(255*color[2]), 
                                  int(255*color[3]))

def plot_jet_simple(params, jet, label=None):   
    content = tf.inverse_transform(jet["content"])
    embeddings = grnn_embeddings(jet["content"], jet["tree"], params)
    
    # Build graph recursively
    dot = Digraph(graph_attr={"rank": "flow"}, 
                  edge_attr={"arrowsize": "0.5", "fontsize": "8.0"}, 
                  node_attr={"style": "filled"},
                  format="png")
    
    df = make_dL_dh(jet["content"], jet["tree"], params, 0)
    norm_root = (df(embeddings[0]) ** 2).sum() ** 0.5 
    
    if label:
        dot.graph_attr["label"] = label
    
    def _rec(jet, parent, node_id): 
        df = make_dL_du(jet["content"], jet["tree"], params, node_id)
        norm_u = (df(jet["content"][node_id]) ** 2).sum() ** 0.5 / norm_root
        
        df = make_dL_dh(jet["content"], jet["tree"], params, node_id)
        norm_h = (df(embeddings[node_id]) ** 2).sum() ** 0.5 / norm_root
        
        # Build subgraph
        sub = Digraph(graph_attr={"rank": "flow"}, 
                      node_attr={"fixedsize": "true", 
                                 "label": "", 
                                 "height": "0.1", 
                                 "width": "0.1",
                                 "style": "filled"},
                      edge_attr={"arrowsize": "0.5",
                                 "fontsize": "8.0"})
        
        size = "%.4f" % max(0.1, norm_h ** 0.5 / 2.0)
        sub.node("%d" % node_id, width=size, height=size, shape="circle",
                 color=number_to_color(norm_h)) 
        
        if jet["tree"][node_id, 0] == -1:
            size = "%.4f" % max(0.1, norm_u / (2.0 ** 0.5))
            sub.node("%d-N" % node_id, width=size, height=size, shape="diamond", 
                     color=number_to_color(norm_u))
            sub.edge("%d-N" % node_id, "%d" % node_id,  
                     color=number_to_color(norm_u),
                     #label="o_%d" % (1+node_id)
                     )#label="%.4f" % norm_u)
        
        dot.subgraph(sub)
        
        # Connect to parent
        if parent >= 0:            
            dot.edge("%d" % node_id, "%d" % parent,
                     color=number_to_color(norm_h),
                     #label="h_%d" % (1+node_id)
                     )
                     #label="%.4f" % norm_h)
            
        # Recursive calls
        if jet["tree"][node_id, 0] != -1:
            _rec(jet, node_id, jet["tree"][node_id, 0])
            _rec(jet, node_id, jet["tree"][node_id, 1])
        
    _rec(jet, -1, jet["root_id"]) 
    
    return dot

In [17]:
jet_id = np.argmin([len(j["tree"]) if len(j["tree"]) > 27 else np.inf for j in X]) 
print(jet_id)
dot = plot_jet_simple(params, X[jet_id],)
#                           label="y=%d, y_pred=%.4f" % (y[jet_id], 
#                                                        grnn_predict_simple(params, [X[jet_id]])[0]))


599

In [18]:
dot


Out[18]:
%3 0 1 1->0 2 2->1 3 3->2 4 4->3 5 5->4 6 6->5 7 7->6 8 8->7 9 9->8 10 10->9 11 11->10 12 12->11 13 13->12 14 14->13 14-N 14-N->14 15 15->13 15-N 15-N->15 16 16->12 16-N 16-N->16 17 17->11 17-N 17-N->17 18 18->10 18-N 18-N->18 19 19->9 19-N 19-N->19 20 20->8 20-N 20-N->20 21 21->7 21-N 21-N->21 22 22->6 22-N 22-N->22 23 23->5 23-N 23-N->23 24 24->4 24-N 24-N->24 25 25->3 25-N 25-N->25 26 26->2 26-N 26-N->26 27 27->1 27-N 27-N->27 28 28->0 28-N 28-N->28

In [19]:
dot.save("jet-antikt-%d.gv" % jet_id)


Out[19]:
'jet-antikt-599.gv'

In [32]:
np.random.seed(123)

for jet_id in np.random.permutation(len(X))[:20]:
    dot = plot_jet_simple(params, X[jet_id],
                          label="y=%d, y_pred=%.4f" % (y[jet_id], 
                                                       grnn_predict_simple(params, [X[jet_id]])[0]))
    dot.render("figures/%d-anti-kt" % jet_id)