In [116]:
import sys
sys.path.append("..")

In [117]:
import numpy as np
np.seterr(divide="ignore")
import logging
import pickle
import glob

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
from sklearn.utils import check_random_state

from scipy import interp

from recnn.preprocessing import rewrite_content
from recnn.preprocessing import permute_by_pt
from recnn.preprocessing import extract
from recnn.preprocessing import sequentialize_by_pt
from recnn.preprocessing import randomize

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (6, 6)

Plotting functions


In [118]:
from recnn.preprocessing import sequentialize_by_pt

def load_tf(filename_train, preprocess=None, n_events_train=-1):
    # Make training data
    print("Loading training data...")

    fd = open(filename_train, "rb")
    X, y = pickle.load(fd)
    fd.close()
    
    y = np.array(y)
    
    if n_events_train > 0:
        indices = check_random_state(123).permutation(len(X))[:n_events_train]
        X = [X[i] for i in indices]
        y = y[indices]

    print("\tfilename = %s" % filename_train)
    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))

    # Preprocessing 
    print("Preprocessing...")
    X = [rewrite_content(jet) for jet in X]
    
    if preprocess:
        X = [preprocess(jet) for jet in X]
        
    X = [extract(permute_by_pt(jet)) for jet in X]
    tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X]))
    
    return tf

def load_test(tf, filename_test, preprocess=None, cropping=True):
    # Make test data 
    print("Loading test data...")

    fd = open(filename_test, "rb")
    X, y = pickle.load(fd)
    fd.close()
    y = np.array(y)

    print("\tfilename = %s" % filename_test)
    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))

    # Preprocessing 
    print("Preprocessing...")
    X = [rewrite_content(jet) for jet in X]
    
    if preprocess:
        X = [preprocess(jet) for jet in X]
        
    X = [extract(permute_by_pt(jet)) for jet in X]

    for jet in X:
        jet["content"] = tf.transform(jet["content"])
    
    if not cropping:
        return X, y
        
    # Cropping
    X_ = [j for j in X if 250 < j["pt"] < 300 and 50 < j["mass"] < 110]
    y_ = [y[i] for i, j in enumerate(X) if 250 < j["pt"] < 300 and 50 < j["mass"] < 110]

    X = X_
    y = y_
    y = np.array(y)
    
    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))
    
    # Weights for flatness in pt
    w = np.zeros(len(y))    
    
    X0 = [X[i] for i in range(len(y)) if y[i] == 0]
    pdf, edges = np.histogram([j["pt"] for j in X0], density=True, range=[250, 300], bins=50)
    pts = [j["pt"] for j in X0]
    indices = np.searchsorted(edges, pts) - 1
    inv_w = 1. / pdf[indices]
    inv_w /= inv_w.sum()
    w[y==0] = inv_w
        
    X1 = [X[i] for i in range(len(y)) if y[i] == 1]
    pdf, edges = np.histogram([j["pt"] for j in X1], density=True, range=[250, 300], bins=50)
    pts = [j["pt"] for j in X1]
    indices = np.searchsorted(edges, pts) - 1
    inv_w = 1. / pdf[indices]
    inv_w /= inv_w.sum()
    w[y==1] = inv_w
        
    return X, y, w

In [111]:
from recnn.recnn import grnn_transform_simple
from recnn.recnn import grnn_predict_simple
from recnn.recnn import grnn_predict_gated
from recnn.recnn import grnn_predict_simple_join


def predict(X, filename, func=grnn_predict_simple):
    fd = open(filename, "rb")
    params = pickle.load(fd)
    fd.close()
    y_pred = func(params, X)
    return y_pred


def evaluate_models(X, y, w, pattern, func=grnn_predict_simple):
    rocs = []
    fprs = []
    tprs = []
    
    for filename in glob.glob(pattern):
        print("Loading %s" % filename),
            
        y_pred = predict(X, filename, func=func)
        
        # Roc
        rocs.append(roc_auc_score(y, y_pred, sample_weight=w))
        fpr, tpr, _ = roc_curve(y, y_pred, sample_weight=w)
        
        fprs.append(fpr)
        tprs.append(tpr)
        
        print("ROC AUC = %.4f" % rocs[-1])
         
    print("Mean ROC AUC = %.4f" % np.mean(rocs))
        
    return rocs, fprs, tprs

def build_rocs(prefix_train, prefix_test, model_pattern, preprocess=None, gated=False):
    tf = load_tf("../data/w-vs-qcd/final/%s-train.pickle" % prefix_train, preprocess=preprocess)
    X, y, w = load_test(tf, "../data/w-vs-qcd/final/%s-test.pickle" % prefix_test, preprocess=preprocess) 
    
    if not gated:
        rocs, fprs, tprs = evaluate_models(X, y, w, 
                                           "../models/jet-study-2/model-w-s-%s-[0-9]*.pickle" % model_pattern)
    else:
        rocs, fprs, tprs = evaluate_models(X, y, w, 
                                           "../models/jet-study-2/model-w-g-%s-[0-9]*.pickle" % model_pattern, func=grnn_predict_gated)
        
    return rocs, fprs, tprs

In [112]:
def remove_outliers(rocs, fprs, tprs):
    inv_fprs = []
    base_tpr = np.linspace(0.05, 1, 476)

    for fpr, tpr in zip(fprs, tprs):
        inv_fpr = interp(base_tpr, tpr, 1. / fpr)
        inv_fprs.append(inv_fpr)

    inv_fprs = np.array(inv_fprs)
    scores = inv_fprs[:, 225]
    
    p25 = np.percentile(scores, 1 / 6. * 100.)
    p75 = np.percentile(scores, 5 / 6. * 100)
    
    robust_mean = np.mean([scores[i] for i in range(len(scores)) if p25 <= scores[i] <= p75])
    robust_std = np.std([scores[i] for i in range(len(scores)) if p25 <= scores[i] <= p75])
    
    indices = [i for i in range(len(scores)) if robust_mean - 3*robust_std <= scores[i] <= robust_mean + 3*robust_std]
    
    new_r, new_f, new_t = [], [], []
    
    for i in indices:
        new_r.append(rocs[i])
        new_f.append(fprs[i])
        new_t.append(tprs[i])
    
    return new_r, new_f, new_t


def report_score(rocs, fprs, tprs, label, latex=False, input="particles", short=False):       
    inv_fprs = []
    base_tpr = np.linspace(0.05, 1, 476)
    
    for fpr, tpr in zip(fprs, tprs):
        inv_fpr = interp(base_tpr, tpr, 1. / fpr)
        inv_fprs.append(inv_fpr)
        
    inv_fprs = np.array(inv_fprs)
    mean_inv_fprs = inv_fprs.mean(axis=0)
    
    if not latex:
        print("%32s\tROC AUC=%.4f+-%.2f\t1/FPR@TPR=0.5=%.2f+-%.2f" %  (label, 
                                                                       np.mean(rocs), 
                                                                       np.std(rocs),
                                                                       np.mean(inv_fprs[:, 225]),
                                                                       np.std(inv_fprs[:, 225])))
    else:
        if not short:
            print("%10s \t& %30s \t& %.4f $\pm$ %.4f \t& %.1f $\pm$ %.1f \\\\" % 
                  (input,
                   label,
                   np.mean(rocs), 
                   np.std(rocs),
                   np.mean(inv_fprs[:, 225]),
                   np.std(inv_fprs[:, 225])))
        else:
            print("%30s \t& %.4f $\pm$ %.4f \t& %.1f $\pm$ %.1f \\\\" % 
                  (label,
                   np.mean(rocs), 
                   np.std(rocs),
                   np.mean(inv_fprs[:, 225]),
                   np.std(inv_fprs[:, 225])))
        
def plot_rocs(rocs, fprs, tprs, label="", color="r", show_all=False):
    inv_fprs = []
    base_tpr = np.linspace(0.05, 1, 476)
    
    for fpr, tpr in zip(fprs, tprs):
        inv_fpr = interp(base_tpr, tpr, 1. / fpr)
        inv_fprs.append(inv_fpr)
        if show_all:
            plt.plot(base_tpr, inv_fpr, alpha=0.1, color=color)
        
    inv_fprs = np.array(inv_fprs)
    mean_inv_fprs = inv_fprs.mean(axis=0)


    plt.plot(base_tpr, mean_inv_fprs, color, 
             label="%s" % label)
    
def plot_show(filename=None):
    plt.xlabel("Signal efficiency")
    plt.ylabel("1 / Background efficiency")
    plt.xlim([0.1, 1.0])
    plt.ylim(1, 500)
    plt.yscale("log")
    plt.legend(loc="best")
    plt.grid()
    
    if filename:
        plt.savefig(filename)
    
    plt.show()

Count parameters


In [113]:
def count(params):
    def _count(thing):
        if isinstance(thing, list):
            c = 0
            for stuff in thing:
                c += _count(stuff)
            return c 

        elif isinstance(thing, np.ndarray):
            return np.prod(thing.shape)
    
    c = 0
    for k, v in params.items():
        c += _count(v)
    return c
    
# Simple vs gated
fd = open("../models/jet-study-2/model-w-s-antikt-kt-1.pickle", "rb")
params = pickle.load(fd)
fd.close()
print("Simple =", count(params))   

fd = open("../models/jet-study-2/model-w-g-antikt-kt-1.pickle", "rb")
params = pickle.load(fd)
fd.close()
print("Gated =", count(params))


('Simple =', 8481)
('Gated =', 48761)

In [7]:
# double
# Simple vs gated
fd = open("../models/jet-study-2/model-w-sd-antikt-kt-1.pickle", "rb")
params = pickle.load(fd)
fd.close()
print("Simple =", count(params))   

fd = open("../models/jet-study-2/model-w-gd-antikt-kt-1.pickle", "rb")
params = pickle.load(fd)
fd.close()
print("Gated =", count(params))


('Simple =', 10081)
('Gated =', 50361)

Embedding visualization


In [ ]:
prefix_train = "antikt-kt"
prefix_test = prefix_train
tf = load_tf("../data/w-vs-qcd/final/%s-train.pickle" % prefix_train)
X, y, w = load_test(tf, "../data/w-vs-qcd/final/%s-test.pickle" % prefix_test)

In [7]:
fd = open("../models/jet-study-2/model-w-s-antikt-kt-1.pickle", "rb")
params = pickle.load(fd)
fd.close()

In [17]:
Xt = grnn_transform_simple(params, X[:5000])

In [23]:
from sklearn.manifold import TSNE
Xtt = TSNE(n_components=2).fit_transform(Xt)

In [27]:
for i in range(5000):
    plt.scatter(Xtt[i, 0], Xtt[i, 1], color="b" if y[i] == 1 else "r", alpha=0.5)

plt.show()



In [28]:
from sklearn.decomposition import PCA
Xtt = PCA(n_components=2).fit_transform(Xt)

for i in range(5000):
    plt.scatter(Xtt[i, 0], Xtt[i, 1], color="b" if y[i] == 1 else "r", alpha=0.5)

plt.show()


Generate all ROCs


In [ ]:
for pattern, gated in [
        # Simple
        ## Particles
        ("antikt-kt", False),
        ("antikt-cambridge", False),
        ("antikt-antikt", False),
        ("antikt-random", False),
        ("antikt-seqpt", False),
        ("antikt-seqpt-reversed", False),
        ## Towers
        ("antikt-kt-delphes", False),
        ("antikt-cambridge-delphes", False),
        ("antikt-antikt-delphes", False),
        ("antikt-random-delphes", False),
        ("antikt-seqpt-delphes", False),
        ("antikt-seqpt-reversed-delphes", False),
        ## Images
        ("antikt-kt-images", False),

        # Gated
        ## Particles
        ("antikt-kt", True),
        ("antikt-antikt", True),
        ("antikt-seqpt", True),
        ("antikt-seqpt-reversed", True),
        ("antikt-cambridge", True),
        ("antikt-random", True),
        ## Towers
        ("antikt-kt-delphes", True),
        ("antikt-antikt-delphes", True),
        ("antikt-seqpt-delphes", True),
        ("antikt-seqpt-reversed-delphes", True),
        ("antikt-cambridge-delphes", True),
        ("antikt-random-delphes", True),
        ## Images
        ("antikt-kt-images", True)
    ]:
    r, f, t = build_rocs(pattern, pattern, pattern, gated=gated)
    
    # Save
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "wb")
    pickle.dump((r, f, t), fd)
    fd.close()

In [ ]:
# sd/gd == contatenate embeddings of h1_L + h1_R
for pattern, gated in [
        # Simple
        ## Particles
        ("antikt-kt", False),
        ## Towers
        ("antikt-kt-delphes", False),
        ## Images
        ("antikt-kt-images", False),

        # Gated
        ## Particles
        ("antikt-kt", True),
        ## Towers
        ("antikt-kt-delphes", True),
        ## Images
        ("antikt-kt-images", True)
    ]:
    r, f, t = build_rocs(pattern, pattern, pattern, gated=gated)
    
    # Save
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("sd" if not gated else "gd", pattern), "wb")
    pickle.dump((r, f, t), fd)
    fd.close()

Table


In [7]:
for pattern, gated, label in [
        # Simple
        ## Particles
        ("antikt-kt", False, "RNN $k_t$"),
        ("antikt-cambridge", False, "RNN C/A"),
        ("antikt-antikt", False, "RNN anti-$k_t$"),
        ("antikt-random", False, "RNN random"),
        ("antikt-seqpt", False, "RNN asc-$p_T$"),
        ("antikt-seqpt-reversed", False, "RNN desc-$p_T$"),
        ## Towers
        ("antikt-kt-delphes", False, "RNN $k_t$"),
        ("antikt-cambridge-delphes", False, "RNN C/A"),
        ("antikt-antikt-delphes", False, "RNN anti-$k_t$"),
        ("antikt-random-delphes", False, "RNN random"),
        ("antikt-seqpt-delphes", False, "RNN asc-$p_T$"),
        ("antikt-seqpt-reversed-delphes", False, "RNN desc-$p_T$"),
        ## Images
        ("antikt-kt-images", False, "RNN $k_t$"),

        # Gated
        ## Particles
        ("antikt-kt", True, "RNN $k_t$ (gated)"),
        ("antikt-cambridge", True, "RNN C/A (gated)"),
        ("antikt-antikt", True, "RNN anti-$k_t$ (gated)"),
        ("antikt-random", True, "RNN random (gated)"),
        ("antikt-seqpt", True, "RNN asc-$p_T$ (gated)"),
        ("antikt-seqpt-reversed", True, "RNN desc-$p_T$ (gated)"),
        ## Towers
        ("antikt-kt-delphes", True, "RNN $k_t$ (gated)"),
        ("antikt-cambridge-delphes", True, "RNN C/A (gated)"),
        ("antikt-antikt-delphes", True, "RNN anti-$k_t$ (gated)"),
        ("antikt-random-delphes", True, "RNN random (gated)"),
        ("antikt-seqpt-delphes", True, "RNN asc-$p_T$ (gated)"),
        ("antikt-seqpt-reversed-delphes", True, "RNN desc-$p_T$ (gated)"),
    
        # Images
        ("antikt-kt-images", False, "RNN $k_t$"),
        ("antikt-kt-images", True, "RNN $k_t$ (gated)")
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    report_score(r, f, t, label=label, 
                 latex=True, 
                 input="particles" if "delphes" not in pattern and "images" not in pattern else "towers")


 particles 	&                      RNN $k_t$ 	& 0.9185 $\pm$ 0.0006 	& 68.3 $\pm$ 1.8 \\
 particles 	&                        RNN C/A 	& 0.9192 $\pm$ 0.0008 	& 68.3 $\pm$ 3.6 \\
 particles 	&                 RNN anti-$k_t$ 	& 0.9096 $\pm$ 0.0013 	& 51.7 $\pm$ 3.5 \\
 particles 	&                     RNN random 	& 0.9121 $\pm$ 0.0008 	& 51.1 $\pm$ 2.0 \\
 particles 	&                  RNN asc-$p_T$ 	& 0.9130 $\pm$ 0.0031 	& 52.5 $\pm$ 7.3 \\
 particles 	&                 RNN desc-$p_T$ 	& 0.9189 $\pm$ 0.0009 	& 70.4 $\pm$ 3.6 \\
    towers 	&                      RNN $k_t$ 	& 0.8807 $\pm$ 0.0010 	& 24.1 $\pm$ 0.6 \\
    towers 	&                        RNN C/A 	& 0.8831 $\pm$ 0.0010 	& 24.2 $\pm$ 0.7 \\
    towers 	&                 RNN anti-$k_t$ 	& 0.8737 $\pm$ 0.0017 	& 22.3 $\pm$ 0.8 \\
    towers 	&                     RNN random 	& 0.8704 $\pm$ 0.0011 	& 20.4 $\pm$ 0.3 \\
    towers 	&                  RNN asc-$p_T$ 	& 0.8835 $\pm$ 0.0009 	& 26.2 $\pm$ 0.7 \\
    towers 	&                 RNN desc-$p_T$ 	& 0.8838 $\pm$ 0.0010 	& 25.1 $\pm$ 0.6 \\
    towers 	&                      RNN $k_t$ 	& 0.8321 $\pm$ 0.0025 	& 12.7 $\pm$ 0.4 \\
 particles 	&              RNN $k_t$ (gated) 	& 0.9195 $\pm$ 0.0009 	& 74.3 $\pm$ 2.4 \\
 particles 	&                RNN C/A (gated) 	& 0.9222 $\pm$ 0.0007 	& 81.8 $\pm$ 3.1 \\
 particles 	&         RNN anti-$k_t$ (gated) 	& 0.9156 $\pm$ 0.0012 	& 68.3 $\pm$ 3.2 \\
 particles 	&             RNN random (gated) 	& 0.9106 $\pm$ 0.0035 	& 50.7 $\pm$ 6.7 \\
 particles 	&          RNN asc-$p_T$ (gated) 	& 0.9137 $\pm$ 0.0046 	& 54.8 $\pm$ 11.7 \\
 particles 	&         RNN desc-$p_T$ (gated) 	& 0.9212 $\pm$ 0.0005 	& 83.3 $\pm$ 3.1 \\
    towers 	&              RNN $k_t$ (gated) 	& 0.8822 $\pm$ 0.0006 	& 25.4 $\pm$ 0.4 \\
    towers 	&                RNN C/A (gated) 	& 0.8861 $\pm$ 0.0014 	& 26.2 $\pm$ 0.8 \\
    towers 	&         RNN anti-$k_t$ (gated) 	& 0.8804 $\pm$ 0.0010 	& 24.4 $\pm$ 0.4 \\
    towers 	&             RNN random (gated) 	& 0.8751 $\pm$ 0.0029 	& 22.8 $\pm$ 1.2 \\
    towers 	&          RNN asc-$p_T$ (gated) 	& 0.8849 $\pm$ 0.0012 	& 27.2 $\pm$ 0.8 \\
    towers 	&         RNN desc-$p_T$ (gated) 	& 0.8864 $\pm$ 0.0007 	& 27.5 $\pm$ 0.6 \\
    towers 	&                      RNN $k_t$ 	& 0.8321 $\pm$ 0.0025 	& 12.7 $\pm$ 0.4 \\
    towers 	&              RNN $k_t$ (gated) 	& 0.8277 $\pm$ 0.0028 	& 12.4 $\pm$ 0.3 \\

In [7]:
for pattern, gated, label in [
        # Simple
        ## Particles
        ("antikt-kt", False, "RNN $k_t$"),
        ## Towers
        ("antikt-kt-delphes", False, "RNN $k_t$"),
        ## Images
        ("antikt-kt-images", False, "RNN $k_t$"),

        # Gated
        ## Particles
        ("antikt-kt", True, "RNN $k_t$ (gated)"),
        ## Towers
        ("antikt-kt-delphes", True, "RNN $k_t$ (gated)"),
        # Images
        ("antikt-kt-images", True, "RNN $k_t$ (gated)")
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("sd" if not gated else "gd", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    report_score(r, f, t, label=label, 
                 latex=True, 
                 input="particles" if "delphes" not in pattern and "images" not in pattern else "towers")


 particles 	&                      RNN $k_t$ 	& 0.9178 $\pm$ 0.0008 	& 67.0 $\pm$ 2.6 \\
    towers 	&                      RNN $k_t$ 	& 0.8801 $\pm$ 0.0012 	& 24.1 $\pm$ 0.6 \\
    towers 	&                      RNN $k_t$ 	& 0.8332 $\pm$ 0.0029 	& 12.6 $\pm$ 0.4 \\
 particles 	&              RNN $k_t$ (gated) 	& 0.9187 $\pm$ 0.0011 	& 70.2 $\pm$ 3.8 \\
    towers 	&              RNN $k_t$ (gated) 	& 0.8810 $\pm$ 0.0012 	& 24.5 $\pm$ 0.5 \\
    towers 	&              RNN $k_t$ (gated) 	& 0.8255 $\pm$ 0.0050 	& 12.2 $\pm$ 0.6 \\

Plots


In [27]:
# Simple vs gated
for pattern, gated, label, color in [
        ("antikt-kt", False, "RNN $k_t$ (simple)", "r"),
        ("antikt-kt", True, "RNN $k_t$ (gated)", "b")
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


              RNN $k_t$ (simple)	ROC AUC=0.9185+-0.00	1/FPR@TPR=0.5=68.32+-1.80
               RNN $k_t$ (gated)	ROC AUC=0.9195+-0.00	1/FPR@TPR=0.5=74.34+-2.40
/home/gilles/anaconda3/envs/hep/lib/python2.7/site-packages/matplotlib/scale.py:101: RuntimeWarning: invalid value encountered in less_equal
  a[a <= 0.0] = 1e-300

In [34]:
# Topologies (particles, simple)
for pattern, gated, label, color in [
        ("antikt-kt", False, "$k_t$", "r"), 
        ("antikt-cambridge", False, "C/A", "g"),
        ("antikt-antikt", False, "anti-$k_t$", "b"), 
        ("antikt-seqpt", False, "asc-$p_T$", "c"),
        ("antikt-seqpt-reversed", False, "desc-$p_T$", "m"),
        ("antikt-random", False, "random", "orange")
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


                           $k_t$	ROC AUC=0.9185+-0.00	1/FPR@TPR=0.5=68.32+-1.80
                             C/A	ROC AUC=0.9192+-0.00	1/FPR@TPR=0.5=68.29+-3.62
                      anti-$k_t$	ROC AUC=0.9096+-0.00	1/FPR@TPR=0.5=51.66+-3.49
                       asc-$p_T$	ROC AUC=0.9130+-0.00	1/FPR@TPR=0.5=52.51+-7.27
                      desc-$p_T$	ROC AUC=0.9189+-0.00	1/FPR@TPR=0.5=70.38+-3.65
                          random	ROC AUC=0.9121+-0.00	1/FPR@TPR=0.5=51.06+-2.00

In [29]:
# Topologies (towers, simple)
for pattern, gated, label, color in [
        ("antikt-kt-delphes", False, "RNN $k_t$", "r"), 
        ("antikt-cambridge-delphes", False, "RNN C/A", "g"),
        ("antikt-antikt-delphes", False, "RNN anti-$k_t$", "b"), 
        ("antikt-seqpt-delphes", False, "RNN asc-$p_T$", "c"),
        ("antikt-seqpt-reversed-delphes", False, "RNN desc-$p_T$", "m"),
        ("antikt-random-delphes", False, "RNN random", "orange")
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


                       RNN $k_t$	ROC AUC=0.8807+-0.00	1/FPR@TPR=0.5=24.07+-0.55
                         RNN C/A	ROC AUC=0.8831+-0.00	1/FPR@TPR=0.5=24.15+-0.69
                  RNN anti-$k_t$	ROC AUC=0.8737+-0.00	1/FPR@TPR=0.5=22.33+-0.85
                   RNN asc-$p_T$	ROC AUC=0.8835+-0.00	1/FPR@TPR=0.5=26.15+-0.70
                  RNN desc-$p_T$	ROC AUC=0.8838+-0.00	1/FPR@TPR=0.5=25.13+-0.58
                      RNN random	ROC AUC=0.8704+-0.00	1/FPR@TPR=0.5=20.35+-0.34

In [30]:
# Topologies (particles, gated)
for pattern, gated, label, color in [
        ("antikt-kt", True, "RNN $k_t$", "r"), 
        ("antikt-antikt", True, "RNN anti-$k_t$", "b"), 
        ("antikt-seqpt", True, "RNN asc-$p_T$", "c"),
        ("antikt-seqpt-reversed", True, "RNN desc-$p_T$", "m"),
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


                       RNN $k_t$	ROC AUC=0.9195+-0.00	1/FPR@TPR=0.5=74.34+-2.40
                  RNN anti-$k_t$	ROC AUC=0.9156+-0.00	1/FPR@TPR=0.5=68.25+-3.18
                   RNN asc-$p_T$	ROC AUC=0.9137+-0.00	1/FPR@TPR=0.5=54.79+-11.70
                  RNN desc-$p_T$	ROC AUC=0.9212+-0.00	1/FPR@TPR=0.5=83.27+-3.08

In [31]:
# Topologies (towers, gated)
for pattern, gated, label, color in [
        ("antikt-kt-delphes", True, "RNN $k_t$", "r"), 
        ("antikt-antikt-delphes", True, "RNN anti-$k_t$", "b"), 
        ("antikt-seqpt-delphes", True, "RNN asc-$p_T$", "c"),
        ("antikt-seqpt-reversed-delphes", True, "RNN desc-$p_T$", "m"),
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


                       RNN $k_t$	ROC AUC=0.8822+-0.00	1/FPR@TPR=0.5=25.41+-0.36
                  RNN anti-$k_t$	ROC AUC=0.8804+-0.00	1/FPR@TPR=0.5=24.37+-0.42
                   RNN asc-$p_T$	ROC AUC=0.8849+-0.00	1/FPR@TPR=0.5=27.16+-0.82
                  RNN desc-$p_T$	ROC AUC=0.8864+-0.00	1/FPR@TPR=0.5=27.50+-0.55

In [10]:
# Particles vs towers vs images (simple)
for pattern, gated, label, color in [
        ("antikt-kt", False, "particles", "r"), 
        ("antikt-kt-delphes", False, "towers", "g"),
        ("antikt-kt-images", False, "images", "b"), 
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show(filename="particles-towers-images.pdf")


                       particles	ROC AUC=0.9185+-0.00	1/FPR@TPR=0.5=68.32+-1.80
                          towers	ROC AUC=0.8807+-0.00	1/FPR@TPR=0.5=24.07+-0.55
                          images	ROC AUC=0.8321+-0.00	1/FPR@TPR=0.5=12.67+-0.43

In [33]:
# Particles vs towers vs images (gated)
for pattern, gated, label, color in [
        ("antikt-kt", True, "particles", "r"), 
        ("antikt-kt-delphes", True, "towers", "g"),
        ("antikt-kt-images", True, "images", "b"), 
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s.pickle" % ("s" if not gated else "g", pattern), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


                       particles	ROC AUC=0.9195+-0.00	1/FPR@TPR=0.5=74.34+-2.40
                          towers	ROC AUC=0.8822+-0.00	1/FPR@TPR=0.5=25.41+-0.36
                          images	ROC AUC=0.8277+-0.00	1/FPR@TPR=0.5=12.36+-0.30

Trimming


In [ ]:
for pattern_train, pattern_test, gated in [
        ("antikt-kt", "antikt-kt", False),
        ("antikt-kt", "antikt-kt-trimmed", False),
        ("antikt-kt-trimmed", "antikt-kt-trimmed", False),
        ("antikt-kt-trimmed", "antikt-kt", False),
    ]:
    r, f, t = build_rocs(pattern_train, pattern_test, pattern_train, gated=gated)
    
    # Save
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s-%s.pickle" % 
              ("s" if not gated else "g", pattern_train, pattern_test), "wb")
    pickle.dump((r, f, t), fd)
    fd.close()

In [35]:
for pattern_train, pattern_test, gated, label, color in [
        ("antikt-kt", "antikt-kt", False, "$k_t$ on $k_t$", "b"),
        ("antikt-kt", "antikt-kt-trimmed", False, "$k_t$ on $k_t$-trimmed", "c"),
        ("antikt-kt-trimmed", "antikt-kt-trimmed", False, "$k_t$-trimmed on $k_t$-trimmed", "r"),
        ("antikt-kt-trimmed", "antikt-kt", False, "$k_t$-trimmed on $k_t$", "orange"),
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s-%s.pickle" % 
              ("s" if not gated else "g", pattern_train, pattern_test), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


                  $k_t$ on $k_t$	ROC AUC=0.9185+-0.00	1/FPR@TPR=0.5=68.32+-1.80
          $k_t$ on $k_t$-trimmed	ROC AUC=0.8952+-0.00	1/FPR@TPR=0.5=28.70+-1.30
  $k_t$-trimmed on $k_t$-trimmed	ROC AUC=0.9034+-0.00	1/FPR@TPR=0.5=33.05+-1.85
          $k_t$-trimmed on $k_t$	ROC AUC=0.8893+-0.01	1/FPR@TPR=0.5=36.78+-5.88

Colinear splits


In [ ]:
from functools import partial
from recnn.preprocessing import sequentialize_by_pt

preprocess_seqpt = partial(sequentialize_by_pt, reverse=False)
preprocess_seqpt_rev = partial(sequentialize_by_pt, reverse=True)

for pattern_train, pattern_test, gated, preprocess in [
        # kt
        ("antikt-kt", "antikt-kt-colinear1", False, None),
        ("antikt-kt", "antikt-kt-colinear10", False, None),
        ("antikt-kt", "antikt-kt-colinear1-max", False, None),
        ("antikt-kt", "antikt-kt-colinear10-max", False, None),
    
        # asc-pt
        ("antikt-seqpt", "antikt-kt-colinear1", False, preprocess_seqpt),
        ("antikt-seqpt", "antikt-kt-colinear10", False, preprocess_seqpt),
        ("antikt-seqpt", "antikt-kt-colinear1-max", False, preprocess_seqpt),
        ("antikt-seqpt", "antikt-kt-colinear10-max", False, preprocess_seqpt),
    
        # desc-pt
        ("antikt-seqpt-reversed", "antikt-kt-colinear1", False, preprocess_seqpt_rev),
        ("antikt-seqpt-reversed", "antikt-kt-colinear10", False, preprocess_seqpt_rev),
        ("antikt-seqpt-reversed", "antikt-kt-colinear1-max", False, preprocess_seqpt_rev),
        ("antikt-seqpt-reversed", "antikt-kt-colinear10-max", False, preprocess_seqpt_rev),
    ]:
    
    r, f, t = build_rocs(pattern_train, pattern_test, pattern_train, gated=gated, preprocess=preprocess)
    
    # Save
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s-%s.pickle" % 
              ("s" if not gated else "g", pattern_train, pattern_test), "wb")
    pickle.dump((r, f, t), fd)
    fd.close()

In [37]:
for pattern_train, pattern_test, gated, label in [
        # kt
        ("antikt-kt", "antikt-kt-colinear1", False, "$k_t$ colinear1"),
        ("antikt-kt", "antikt-kt-colinear10", False, "$k_t$ colinear10"),
        ("antikt-kt", "antikt-kt-colinear1-max", False, "$k_t$ colinear1-max"),
        ("antikt-kt", "antikt-kt-colinear10-max", False, "$k_t$ colinear10-max"),
    
        # asc-pt
        ("antikt-seqpt", "antikt-kt-colinear1", False, "asc-$p_T$ colinear1"),
        ("antikt-seqpt", "antikt-kt-colinear10", False, "asc-$p_T$ colinear10"),
        ("antikt-seqpt", "antikt-kt-colinear1-max", False, "asc-$p_T$ colinear1-max"),
        ("antikt-seqpt", "antikt-kt-colinear10-max", False, "asc-$p_T$ colinear10-max"),
    
        # desc-pt
        ("antikt-seqpt-reversed", "antikt-kt-colinear1", False, "desc-$p_T$ colinear1"),
        ("antikt-seqpt-reversed", "antikt-kt-colinear10", False, "desc-$p_T$ colinear10"),
        ("antikt-seqpt-reversed", "antikt-kt-colinear1-max", False, "desc-$p_T$ colinear1-max"),
        ("antikt-seqpt-reversed", "antikt-kt-colinear10-max", False, "desc-$p_T$ colinear10-max"),
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s-%s.pickle" % 
              ("s" if not gated else "g", pattern_train, pattern_test), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    report_score(r, f, t, label=label,
                 latex=True, short=True)


               $k_t$ colinear1 	& 0.9183 $\pm$ 0.0006 	& 68.7 $\pm$ 2.0 \\
              $k_t$ colinear10 	& 0.9174 $\pm$ 0.0006 	& 67.5 $\pm$ 2.6 \\
           $k_t$ colinear1-max 	& 0.9184 $\pm$ 0.0006 	& 68.5 $\pm$ 2.8 \\
          $k_t$ colinear10-max 	& 0.9159 $\pm$ 0.0009 	& 65.7 $\pm$ 2.7 \\
           asc-$p_T$ colinear1 	& 0.9129 $\pm$ 0.0031 	& 53.0 $\pm$ 7.5 \\
          asc-$p_T$ colinear10 	& 0.9126 $\pm$ 0.0033 	& 53.6 $\pm$ 7.2 \\
       asc-$p_T$ colinear1-max 	& 0.9129 $\pm$ 0.0032 	& 54.2 $\pm$ 7.9 \\
      asc-$p_T$ colinear10-max 	& 0.9090 $\pm$ 0.0039 	& 50.2 $\pm$ 8.2 \\
          desc-$p_T$ colinear1 	& 0.9188 $\pm$ 0.0010 	& 70.7 $\pm$ 4.0 \\
         desc-$p_T$ colinear10 	& 0.9178 $\pm$ 0.0011 	& 67.9 $\pm$ 4.3 \\
      desc-$p_T$ colinear1-max 	& 0.9191 $\pm$ 0.0010 	& 72.4 $\pm$ 4.3 \\
     desc-$p_T$ colinear10-max 	& 0.9140 $\pm$ 0.0016 	& 63.5 $\pm$ 5.2 \\

Soft particles


In [ ]:
from functools import partial
from recnn.preprocessing import sequentialize_by_pt

preprocess_seqpt = partial(sequentialize_by_pt, reverse=False)
preprocess_seqpt_rev = partial(sequentialize_by_pt, reverse=True)

for pattern_train, pattern_test, gated, preprocess in [
        ("antikt-kt", "antikt-kt-soft", False, None),
        ("antikt-seqpt", "antikt-kt-soft", False, preprocess_seqpt),
        ("antikt-seqpt-reversed", "antikt-kt-soft", False, preprocess_seqpt_rev),
    ]:
    
    r, f, t = build_rocs(pattern_train, pattern_test, pattern_train, gated=gated, preprocess=preprocess)
    
    # Save
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s-%s.pickle" % 
              ("s" if not gated else "g", pattern_train, pattern_test), "wb")
    pickle.dump((r, f, t), fd)
    fd.close()

In [39]:
for pattern_train, pattern_test, gated, label in [
        ("antikt-kt", "antikt-kt-soft", False, "$k_t$ soft"),
        ("antikt-seqpt", "antikt-kt-soft", False, "asc-$p_T$ soft"),
        ("antikt-seqpt-reversed", "antikt-kt-soft", False, "desc-$p_T$ soft"),
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s-%s.pickle" % 
              ("s" if not gated else "g", pattern_train, pattern_test), "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    
    report_score(r, f, t, label=label, latex=True, short=True)


                    $k_t$ soft 	& 0.9179 $\pm$ 0.0006 	& 68.2 $\pm$ 2.3 \\
                asc-$p_T$ soft 	& 0.9121 $\pm$ 0.0032 	& 51.3 $\pm$ 6.0 \\
               desc-$p_T$ soft 	& 0.9188 $\pm$ 0.0009 	& 70.2 $\pm$ 3.7 \\

Learning curve


In [ ]:
for pattern, gated, n_events in [
#         ("antikt-kt", False, 6000),
#         ("antikt-seqpt-reversed", False, 6000),
        ("antikt-kt", True, 6000),
        ("antikt-seqpt-reversed", True, 6000),
#         ("antikt-kt", False, 15000),
#         ("antikt-seqpt-reversed", False, 15000),
        ("antikt-kt", True, 15000),
        ("antikt-seqpt-reversed", True, 15000),
    ]:
    
    tf = load_tf("../data/w-vs-qcd/final/%s-train.pickle" % pattern, n_events_train=n_events)
    X, y, w = load_test(tf, "../data/w-vs-qcd/final/%s-test.pickle" % pattern) 
    
    if not gated:
        rocs, fprs, tprs = evaluate_models(X, y, w, 
                                           "../models/jet-study-2/model-w-s-%s-%d-[0-9]*.pickle" % (pattern, n_events))
    else:
        rocs, fprs, tprs = evaluate_models(X, y, w, 
                                           "../models/jet-study-2/model-w-g-%s-%d-[0-9]*.pickle" % (pattern, n_events), func=grnn_predict_gated)
        
    # Save
    fd = open("../models/jet-study-2/rocs/rocs-%s-%s-%d.pickle" % ("s" if not gated else "g", pattern, n_events), "wb")
    pickle.dump((rocs, fprs, tprs), fd)
    fd.close()

In [17]:
for pattern, label, color in [
        ("s-antikt-kt", "$k_t$ 100k", "r"),
        ("s-antikt-kt-15000", "$k_t$ 10k", "g"),
        ("s-antikt-kt-6000", "$k_t$ 1k", "b"),
        ("s-antikt-seqpt-reversed", "desc-$p_T$ 100k", "r--"),
        ("s-antikt-seqpt-reversed-15000", "desc-$p_T$ 10k", "g--"),
        ("s-antikt-seqpt-reversed-6000", "desc-$p_T$ 1k", "b--"),
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s.pickle" % pattern, "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


                      $k_t$ 100k	ROC AUC=0.9185+-0.00	1/FPR@TPR=0.5=68.32+-1.80
                       $k_t$ 10k	ROC AUC=0.9038+-0.00	1/FPR@TPR=0.5=40.66+-4.58
                        $k_t$ 1k	ROC AUC=0.8758+-0.01	1/FPR@TPR=0.5=23.74+-1.95
                 desc-$p_T$ 100k	ROC AUC=0.9189+-0.00	1/FPR@TPR=0.5=70.38+-3.65
                  desc-$p_T$ 10k	ROC AUC=0.9030+-0.00	1/FPR@TPR=0.5=36.71+-4.00
                   desc-$p_T$ 1k	ROC AUC=0.8768+-0.00	1/FPR@TPR=0.5=22.04+-2.38

In [16]:
for pattern, label, color in [
        ("g-antikt-kt", "$k_t$ 100k", "r"),
        ("g-antikt-kt-15000", "$k_t$ 10k", "g"),
        ("g-antikt-kt-6000", "$k_t$ 1k", "b"),
        ("g-antikt-seqpt-reversed", "desc-$p_T$ 100k", "r--"),
        ("g-antikt-seqpt-reversed-15000", "desc-$p_T$ 10k", "g--"),
        ("g-antikt-seqpt-reversed-6000", "desc-$p_T$ 1k", "b--"),
    ]:
    
    fd = open("../models/jet-study-2/rocs/rocs-%s.pickle" % pattern, "rb")
    r, f, t = pickle.load(fd)
    fd.close()
    
    r, f, t = remove_outliers(r, f, t)
    plot_rocs(r, f, t, label=label, color=color)
    report_score(r, f, t, label=label)
    
plot_show()


                      $k_t$ 100k	ROC AUC=0.9195+-0.00	1/FPR@TPR=0.5=74.34+-2.40
                       $k_t$ 10k	ROC AUC=0.9042+-0.00	1/FPR@TPR=0.5=46.42+-5.09
                        $k_t$ 1k	ROC AUC=0.8726+-0.01	1/FPR@TPR=0.5=22.63+-2.97
                 desc-$p_T$ 100k	ROC AUC=0.9212+-0.00	1/FPR@TPR=0.5=83.27+-3.08
                  desc-$p_T$ 10k	ROC AUC=0.9123+-0.00	1/FPR@TPR=0.5=56.12+-4.75
                   desc-$p_T$ 1k	ROC AUC=0.8860+-0.00	1/FPR@TPR=0.5=27.35+-2.51

Tau21


In [119]:
import h5py

In [120]:
f = h5py.File("../data/w-vs-qcd/h5/w_100000_j1p0_sj0p30_delphes_jets_images.h5", "r")["auxvars"]
tau1 = f["tau_1"]
tau2 = f["tau_2"]
tau21 = np.true_divide(tau2, tau1)
pt = f["pt_trimmed"]
mass = f["mass_trimmed"]
mask = (f["mass_trimmed"] < 110) & (f["mass_trimmed"] > 50) & (f["pt_trimmed"] < 300) & (f["pt_trimmed"] > 250)
#mask = mask & np.isfinite(tau21) & (tau21 != 0.)
signal_tau21 = tau21[mask]
signal_pt = pt[mask]
signal_mass = mass[mask]

f = h5py.File("../data/w-vs-qcd/h5/qcd_100000_j1p0_sj0p30_delphes_jets_images.h5", "r")["auxvars"]
tau1 = f["tau_1"]
tau2 = f["tau_2"]
tau21 = np.true_divide(tau2, tau1)
pt = f["pt_trimmed"]
mass = f["mass_trimmed"]
mask = (f["mass_trimmed"] < 110) & (f["mass_trimmed"] > 50) & (f["pt_trimmed"] < 300) & (f["pt_trimmed"] > 250)
#mask = mask & np.isfinite(tau21) & (tau21 != 0.)
bkg_tau21 = tau21[mask]
bkg_pt = pt[mask]
bkg_mass = mass[mask]


/home/gilles/anaconda3/envs/hep/lib/python2.7/site-packages/ipykernel/__main__.py:4: RuntimeWarning: invalid value encountered in true_divide
/home/gilles/anaconda3/envs/hep/lib/python2.7/site-packages/ipykernel/__main__.py:16: RuntimeWarning: invalid value encountered in true_divide

In [121]:
plt.hist(bkg_mass, histtype="step", bins=40, normed=1)
plt.hist(signal_mass, histtype="step", bins=40, normed=1)


Out[121]:
(array([ 0.00362252,  0.00371848,  0.00484602,  0.00592558,  0.00585361,
         0.00760489,  0.0080607 ,  0.01105948,  0.01158726,  0.014658  ,
         0.01727294,  0.01816057,  0.02310255,  0.02593339,  0.02948394,
         0.03339434,  0.03728075,  0.03884011,  0.03855223,  0.04109519,
         0.03778455,  0.03711282,  0.03253069,  0.03039557,  0.02655714,
         0.02245482,  0.01770476,  0.0149219 ,  0.01221101,  0.00966805,
         0.00803671,  0.00789277,  0.0058776 ,  0.00441419,  0.00398237,
         0.00379045,  0.00371848,  0.00278286,  0.00247099,  0.00247099]),
 array([  50.00960289,   51.50923265,   53.0088624 ,   54.50849216,
          56.00812192,   57.50775168,   59.00738143,   60.50701119,
          62.00664095,   63.50627071,   65.00590046,   66.50553022,
          68.00515998,   69.50478974,   71.00441949,   72.50404925,
          74.00367901,   75.50330877,   77.00293852,   78.50256828,
          80.00219804,   81.5018278 ,   83.00145756,   84.50108731,
          86.00071707,   87.50034683,   88.99997659,   90.49960634,
          91.9992361 ,   93.49886586,   94.99849562,   96.49812537,
          97.99775513,   99.49738489,  100.99701465,  102.4966444 ,
         103.99627416,  105.49590392,  106.99553368,  108.49516343,
         109.99479319]),
 <a list of 1 Patch objects>)

In [122]:
tau21 = np.concatenate((signal_tau21, bkg_tau21))
pts = np.concatenate((signal_pt, bkg_pt))
masss = np.concatenate((signal_mass, bkg_mass))

X = np.hstack([tau21.reshape(-1,1), masss.reshape(-1,1)])
y = np.concatenate((np.ones(len(signal_tau21)), np.zeros(len(bkg_tau21))))

w = np.zeros(len(y)) 

pdf, edges = np.histogram(pts[y == 0], density=True, range=[250, 300], bins=50)
indices = np.searchsorted(edges, pts[y == 0]) - 1
inv_w = 1. / pdf[indices]
inv_w /= inv_w.sum()
w[y==0] = inv_w

pdf, edges = np.histogram(pts[y == 1], density=True, range=[250, 300], bins=50)
indices = np.searchsorted(edges, pts[y == 1]) - 1
inv_w = 1. / pdf[indices]
inv_w /= inv_w.sum()
w[y==1] = inv_w

In [127]:
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, train_size=0.5)

In [128]:
def evaluate_models(X, y, w):
    rocs = []
    fprs = []
    tprs = []
     
    y_pred = X

    # Roc
    rocs.append(roc_auc_score(y, y_pred, sample_weight=w))
    fpr, tpr, _ = roc_curve(y, y_pred, sample_weight=w)

    fprs.append(fpr)
    tprs.append(tpr)
        
    return rocs, fprs, tprs

r, f, t = evaluate_models(-tau21, y, w)
plot_rocs(r, f, t, label="tau21")
report_score(r, f, t, label="tau21")

r, f, t = evaluate_models(masss, y, w)
plot_rocs(r, f, t, label="mass")
report_score(r, f, t, label="mass")

plot_show()


                           tau21	ROC AUC=0.7644+-0.00	1/FPR@TPR=0.5=6.79+-0.00
                            mass	ROC AUC=0.6194+-0.00	1/FPR@TPR=0.5=2.80+-0.00

In [129]:
clf = ExtraTreesClassifier(n_estimators=1000, min_samples_leaf=100, max_features=1)
clf.fit(X_train, y_train)


Out[129]:
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=1, max_leaf_nodes=None,
           min_samples_leaf=100, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [130]:
r, f, t = evaluate_models(-clf.predict_proba(X_test)[:, 0], y_test, w_test)
plot_rocs(r, f, t, label="tau21+mass")
report_score(r, f, t, label="tau21+mass")

plot_show()


                      tau21+mass	ROC AUC=0.8207+-0.00	1/FPR@TPR=0.5=11.06+-0.00