In [1]:
# import MNIST data, TensorFlow, and other helpers
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
import tensorflow as tf
import numpy as np
import sys
import os
import pickle
from load_cifar10 import load_data10
import sklearn.metrics as sk

# training parameters
training_epochs = 30
batch_size = 128

# architecture parameters
n_labels = 10
image_pixels = 28 * 28


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz

In [3]:
graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder(dtype=tf.float32, shape=[None, image_pixels])
    y = tf.placeholder(dtype=tf.int64, shape=[None])
    risk_labels = tf.placeholder(dtype=tf.float32, shape=[None])  # defined but not used below

    def gelu_fast(x):
        return 0.5 * x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
    rho = gelu_fast

    W = {}
    b = {}

    # scale the l2-normalized columns by 1/sqrt(var_in + var_out), using 1 for inputs/logits
    # and 0.425 ~ E[gelu(z)^2] for z ~ N(0, 1) for the GELU hidden layers
    W['1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([image_pixels, 256]), 0)/tf.sqrt(1 + 0.425))
    W['2'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([256, 256]), 0)/tf.sqrt(0.425 + 0.425))
    W['3'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([256, 256]), 0)/tf.sqrt(0.425 + 0.425))
    W['logits'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([256, n_labels]), 0)/tf.sqrt(0.425 + 1))

    b['1'] = tf.Variable(tf.zeros([256]))
    b['2'] = tf.Variable(tf.zeros([256]))
    b['3'] = tf.Variable(tf.zeros([256]))
    b['logits'] = tf.Variable(tf.zeros([n_labels]))

    def model(x):
        h1 = rho(tf.matmul(x, W['1']) + b['1'])
        h2 = rho(tf.matmul(h1, W['2']) + b['2'])
        h3 = rho(tf.matmul(h2, W['3']) + b['3'])
        return tf.matmul(h3, W['logits']) + b['logits']

    logits = model(x)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
    loss += 1e-5 * (tf.nn.l2_loss(W['1']) + tf.nn.l2_loss(W['2']) + tf.nn.l2_loss(W['3']))

    lr = tf.constant(0.001)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)
    compute_error = tf.reduce_mean(tf.to_float(tf.not_equal(tf.argmax(logits, 1), y)))
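gelu_fast above is the tanh approximation of the GELU, x·Φ(x), with Φ the standard normal CDF. As a quick sanity check (illustrative; not part of the original notebook), the approximation can be compared against the exact erf form:

import numpy as np
from scipy.special import erf

def gelu_exact(z):
    # exact GELU: z * Phi(z), with Phi the standard normal CDF
    return 0.5 * z * (1 + erf(z / np.sqrt(2)))

def gelu_fast_np(z):
    # the same tanh approximation used in the graph above, in NumPy
    return 0.5 * z * (1 + np.tanh(np.sqrt(2 / np.pi) * (z + 0.044715 * z**3)))

z = np.linspace(-5, 5, 1001)
print(np.max(np.abs(gelu_exact(z) - gelu_fast_np(z))))  # small; on the order of 1e-3 or less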

In [4]:
sess = tf.InteractiveSession(graph=graph)
print('Beginning training')

sess.run(tf.global_variables_initializer())

num_batches = int(mnist.train.num_examples / batch_size)
ce_ema = 2.3  # -log(0.1), the loss of an initial uniform prediction
err_ema = 0.9
risk_loss_ema = 0.69  # -log(0.5); defined but not used below
learning_rate = 0.001
for epoch in range(training_epochs):
    if epoch >= 20:
        learning_rate = 0.0001
    for i in range(num_batches):
        bx, by = mnist.train.next_batch(batch_size)
        _, err, l = sess.run([optimizer, compute_error, loss], feed_dict={x: bx, y: by, lr: learning_rate})
        ce_ema = ce_ema * 0.95 + 0.05 * l
        err_ema = err_ema * 0.95 + 0.05 * err

    # we train on all available data, so the validation set is folded into training rather than held out
    for i in range(mnist.validation.num_examples//batch_size):
        bx, by = mnist.validation.next_batch(batch_size)
        _, err, l = sess.run([optimizer, compute_error, loss], feed_dict={x: bx, y: by, lr: learning_rate})
        ce_ema = ce_ema * 0.95 + 0.05 * l
        err_ema = err_ema * 0.95 + 0.05 * err

    print('Epoch:', epoch, '|', 'ce ema of loss for epoch:', ce_ema, 'error (%):', 100*err_ema)

print('MNIST classification loss and error:', sess.run([loss, 100*compute_error],
                                                       feed_dict={x: mnist.test.images, y: mnist.test.labels}))


Beginning training
Epoch: 0 | ce ema of loss for epoch: 0.128168097743 error (%): 3.7103690808
Epoch: 1 | ce ema of loss for epoch: 0.0878284856398 error (%): 2.46795125842
Epoch: 2 | ce ema of loss for epoch: 0.0557872351687 error (%): 1.33225200717
Epoch: 3 | ce ema of loss for epoch: 0.044525056651 error (%): 1.0687527966
Epoch: 4 | ce ema of loss for epoch: 0.0400799943013 error (%): 1.09333086375
Epoch: 5 | ce ema of loss for epoch: 0.0334733837831 error (%): 0.996192755486
Epoch: 6 | ce ema of loss for epoch: 0.0266693892385 error (%): 0.671791556202
Epoch: 7 | ce ema of loss for epoch: 0.0259292802148 error (%): 0.554253685523
Epoch: 8 | ce ema of loss for epoch: 0.0258896145523 error (%): 0.686721110837
Epoch: 9 | ce ema of loss for epoch: 0.0213904327838 error (%): 0.398648341879
Epoch: 10 | ce ema of loss for epoch: 0.0180771592907 error (%): 0.336621098846
Epoch: 11 | ce ema of loss for epoch: 0.0189750431679 error (%): 0.266963033885
Epoch: 12 | ce ema of loss for epoch: 0.023876104777 error (%): 0.537854956562
Epoch: 13 | ce ema of loss for epoch: 0.0187525864723 error (%): 0.252999327926
Epoch: 14 | ce ema of loss for epoch: 0.0154464462101 error (%): 0.197290761084
Epoch: 15 | ce ema of loss for epoch: 0.015962206791 error (%): 0.277141026723
Epoch: 16 | ce ema of loss for epoch: 0.0197467425291 error (%): 0.331265762182
Epoch: 17 | ce ema of loss for epoch: 0.024291269015 error (%): 0.495741653373
Epoch: 18 | ce ema of loss for epoch: 0.0174592302354 error (%): 0.335498327916
Epoch: 19 | ce ema of loss for epoch: 0.0222376752082 error (%): 0.360551282264
Epoch: 20 | ce ema of loss for epoch: 0.0109632858578 error (%): 0.00274929360806
Epoch: 21 | ce ema of loss for epoch: 0.0107647904147 error (%): 0.000479989089891
Epoch: 22 | ce ema of loss for epoch: 0.0106740857367 error (%): 4.71576687701e-05
Epoch: 23 | ce ema of loss for epoch: 0.0105518535606 error (%): 3.4471183752e-06
Epoch: 24 | ce ema of loss for epoch: 0.0103936941233 error (%): 1.29450422631e-16
Epoch: 25 | ce ema of loss for epoch: 0.0102447433476 error (%): 4.86128124869e-27
Epoch: 26 | ce ema of loss for epoch: 0.01010410161 error (%): 1.8255680359e-37
Epoch: 27 | ce ema of loss for epoch: 0.00990675524067 error (%): 6.8555972864e-48
Epoch: 28 | ce ema of loss for epoch: 0.00969000956002 error (%): 2.57449808657e-58
Epoch: 29 | ce ema of loss for epoch: 0.00941752819686 error (%): 9.66807138877e-69
MNIST classification loss and error: [0.082697153, 1.48]

In [5]:
s = tf.nn.softmax(logits)
s_prob = tf.reduce_max(s, reduction_indices=[1], keep_dims=True)
kl_all = tf.log(10.) + tf.reduce_sum(s * tf.log(tf.abs(s) + 1e-11), reduction_indices=[1], keep_dims=True)
m_all, v_all = tf.nn.moments(kl_all, axes=[0])

logits_right = tf.boolean_mask(logits, tf.equal(tf.argmax(logits, 1), y))
s_right = tf.nn.softmax(logits_right)
s_right_prob = tf.reduce_max(s_right, reduction_indices=[1], keep_dims=True)
kl_right = tf.log(10.) + tf.reduce_sum(s_right * tf.log(tf.abs(s_right) + 1e-11), reduction_indices=[1], keep_dims=True)
m_right, v_right = tf.nn.moments(kl_right, axes=[0])

logits_wrong = tf.boolean_mask(logits, tf.not_equal(tf.argmax(logits, 1), y))
s_wrong = tf.nn.softmax(logits_wrong)
s_wrong_prob = tf.reduce_max(s_wrong, reduction_indices=[1], keep_dims=True)
kl_wrong = tf.log(10.) + tf.reduce_sum(s_wrong * tf.log(tf.abs(s_wrong) + 1e-11), reduction_indices=[1], keep_dims=True)
m_wrong, v_wrong = tf.nn.moments(kl_wrong, axes=[0])

acc = 100*tf.reduce_mean(tf.to_float(tf.equal(tf.argmax(logits, 1), y)))
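kl_all above is the KL divergence of the softmax distribution from the uniform distribution over the 10 classes: KL[p||u] = sum_i p_i log(p_i / (1/10)) = log(10) + sum_i p_i log p_i. It is 0 for a uniform prediction and log(10) ≈ 2.30 for a fully confident one. A quick NumPy check (illustrative; not part of the original notebook):

import numpy as np

def kl_to_uniform(p):
    # KL[p||u] = log(K) + sum_i p_i log p_i for K classes; the 1e-11 matches the graph above
    K = p.shape[-1]
    return np.log(K) + np.sum(p * np.log(p + 1e-11), axis=-1)

print(kl_to_uniform(np.full(10, 0.1)))  # ~0: a uniform prediction
print(kl_to_uniform(np.eye(10)[0]))     # ~log(10) = 2.30: a maximally confident prediction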

Right/Wrong Distinction


In [14]:
err, kl_a, kl_r, kl_w, s_p, s_rp, s_wp = sess.run(
    [100 - acc, kl_all, kl_right, kl_wrong, s_prob, s_right_prob, s_wrong_prob],
    feed_dict={x: mnist.test.images, y: mnist.test.labels})

print('MNIST Error (%)| Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):')
print(err, '|', np.mean(s_p), np.std(s_p), '|', np.mean(s_rp), np.std(s_rp), '|', np.mean(s_wp), np.std(s_wp))

print('\nSuccess Detection')
print('Success base rate (%):', round(100-err,2))
print('KL[p||u]: Right/Wrong classification distinction')
safe, risky = kl_r, kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = s_rp, s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


print('\nError Detection')
print('Error base rate (%):', round(err,2))
safe, risky = -kl_r, -kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('KL[p||u]: Right/Wrong classification distinction')
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = -s_rp, -s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


MNIST Error (%)| Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):
1.48 | 0.995331 0.0379625 | 0.99755 0.0255258 | 0.847625 0.178486

Success Detection
Success base rate (%): 98.52
KL[p||u]: Right/Wrong classification distinction
AUPR (%): 99.97
AUROC (%): 97.82
Prediction Prob: Right/Wrong classification distinction
AUPR (%): 99.97
AUROC (%): 97.83

Error Detection
Error base rate (%): 1.48
KL[p||u]: Right/Wrong classification distinction
AUPR (%): 45.36
AUROC (%): 97.82
Prediction Prob: Right/Wrong classification distinction
AUPR (%): 43.88
AUROC (%): 97.83
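Note that AUROC is identical for success and error detection (swapping which class is "positive" and negating the scores leaves it unchanged), while AUPR falls from ~100% to ~45% because errors make up only 1.48% of the test set. A toy illustration of AUPR's base-rate sensitivity (hypothetical data; not from the notebook):

import numpy as np
import sklearn.metrics as sk

rng = np.random.RandomState(0)
scores = np.concatenate([rng.normal(1.0, 1.0, 100),      # 100 positives
                         rng.normal(0.0, 1.0, 10000)])   # 10,000 negatives
labels = np.concatenate([np.ones(100), np.zeros(10000)])

print('AUROC:', sk.roc_auc_score(labels, scores))            # insensitive to the 1:100 imbalance
print('AUPR: ', sk.average_precision_score(labels, scores))  # far below AUROC at this base rate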

In [32]:
def show_ood_detection_results(error_rate_for_in, in_examples, out_examples):
    # relies on kl_a, kl_r, s_p, s_rp computed above on the in-distribution test set
    kl_oos, s_p_oos = sess.run([kl_all, s_prob], feed_dict={x: out_examples})

    print('OOD Example Prediction Probability (mean, std):')
    print(np.mean(s_p_oos), np.std(s_p_oos))

    print('\nNormality Detection')
    print('Normality base rate (%):', round(100*in_examples.shape[0]/(
                out_examples.shape[0] + in_examples.shape[0]),2))
    print('KL[p||u]: Normality Detection')
    safe, risky = kl_a, kl_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Normality Detection')
    safe, risky = s_p, s_p_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Normality base rate (%):', round(100*(1 - error_rate_for_in/100)*in_examples.shape[0]/
          (out_examples.shape[0] + (1 - error_rate_for_in/100)*in_examples.shape[0]),2))
    print('KL[p||u]: Normality Detection (relative to correct examples)')
    safe, risky = kl_r, kl_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Normality Detection (relative to correct examples)')
    safe, risky = s_rp, s_p_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


    print('\n\nAbnormality Detection')
    print('Abnormality base rate (%):', round(100*out_examples.shape[0]/(
                out_examples.shape[0] + in_examples.shape[0]),2))
    print('KL[p||u]: Abnormality Detection')
    safe, risky = -kl_a, -kl_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Abnormality Detection')
    safe, risky = -s_p, -s_p_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Abnormality base rate (%):', round(100*out_examples.shape[0]/
          (out_examples.shape[0] + (1 - error_rate_for_in/100)*in_examples.shape[0]),2))
    print('KL[p||u]: Abnormality Detection (relative to correct examples)')
    safe, risky = -kl_r, -kl_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Abnormality Detection (relative to correct examples)')
    safe, risky = -s_rp, -s_p_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

Omniglot OOD Detection


In [12]:
import scipy.io as sio
import scipy.misc as scimisc

In [13]:
safe_list = [0,2,5,6,8,12,13,14,15,16,17,18,19,21,26]  # the omitted alphabets contain characters that look like digits
m = sio.loadmat("./data/data_background.mat")

squished_set = []
for safe_number in safe_list:
    for alphabet in m['images'][safe_number]:
        for letters in alphabet:
            for letter in letters:
                for example in letter:
                    squished_set.append(scimisc.imresize(1 - example[0], (28,28)).reshape(1, 28*28))

safe_images = np.concatenate(squished_set, axis=0)
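scipy.misc.imresize was deprecated and later removed from SciPy (it is gone as of SciPy 1.3). On newer environments an equivalent resize can be written with Pillow; a minimal sketch, assuming the Omniglot arrays take values in [0, 1]:

import numpy as np
from PIL import Image

def imresize28(img):
    # bilinear resize to 28x28, approximating scipy.misc.imresize's default behavior
    return np.asarray(Image.fromarray(np.uint8(255 * img)).resize((28, 28), Image.BILINEAR))

# drop-in replacement for the loop body above:
# squished_set.append(imresize28(1 - example[0]).reshape(1, 28*28))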

In [33]:
print('Omniglot\n')
show_ood_detection_results(err, mnist.test.images, safe_images)


Omniglot

OOD Example Prediction Probability (mean, std):
0.866063 0.178644

Normality Detection
Normality base rate (%): 52.08
KL[p||u]: Normality Detection
AUPR (%): 96.28
AUROC (%): 95.61
Prediction Prob: Normality Detection
AUPR (%): 96.05
AUROC (%): 95.25
Normality base rate (%): 51.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 96.68
AUROC (%): 96.33
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 96.46
AUROC (%): 96.0


Abnormality Detection
Abnormality base rate (%): 47.92
KL[p||u]: Abnormality Detection
AUPR (%): 94.52
AUROC (%): 95.61
Prediction Prob: Abnormality Detection
AUPR (%): 94.02
AUROC (%): 95.25
Abnormality base rate (%): 48.29
KL[p||u]: Abnormality Detection (relative to correct examples)
AUPR (%): 95.91
AUROC (%): 96.33
Prediction Prob: Abnormality Detection (relative to correct examples)
AUPR (%): 95.61
AUROC (%): 96.0

notMNIST OOD Detection


In [34]:
pickle_file = './data/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
    save = pickle.load(f, encoding='latin1')
    test_dataset = save['test_dataset'].reshape((-1, 28 * 28))
    del save  # hint to help gc free up memory

print('notMNIST\n')

show_ood_detection_results(err, mnist.test.images, test_dataset)


notMNIST

OOD Example Prediction Probability (mean, std):
0.938411 0.127357

Normality Detection
Normality base rate (%): 50.0
KL[p||u]: Normality Detection
AUPR (%): 82.81
AUROC (%): 82.94
Prediction Prob: Normality Detection
AUPR (%): 84.47
AUROC (%): 82.87
Normality base rate (%): 49.63
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 83.18
AUROC (%): 83.78
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 84.86
AUROC (%): 83.71


Abnormality Detection
Abnormality base rate (%): 50.0
KL[p||u]: Abnormality Detection
AUPR (%): 84.48
AUROC (%): 82.94
Prediction Prob: Abnormality Detection
AUPR (%): 84.67
AUROC (%): 82.87
Abnormality base rate (%): 50.37
KL[p||u]: Abnormality Detection (relative to correct examples)
AUPR (%): 86.58
AUROC (%): 83.78
Prediction Prob: Abnormality Detection (relative to correct examples)
AUPR (%): 86.82
AUROC (%): 83.71

CIFAR-10 OOD Detection


In [35]:
_, _, X_test, _ = load_data10()
cifar_batch = sess.run(tf.image.resize_images(tf.image.rgb_to_grayscale(X_test), [28, 28]))

print('CIFAR-10bw\n')

show_ood_detection_results(err, mnist.test.images, cifar_batch.reshape(-1, 28*28))


CIFAR-10bw

OOD Example Prediction Probability (mean, std):
0.88425 0.168702

Normality Detection
Normality base rate (%): 50.0
KL[p||u]: Normality Detection
AUPR (%): 94.21
AUROC (%): 94.15
Prediction Prob: Normality Detection
AUPR (%): 94.07
AUROC (%): 93.7
Normality base rate (%): 49.63
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 94.64
AUROC (%): 94.9
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 94.51
AUROC (%): 94.47


Abnormality Detection
Abnormality base rate (%): 50.0
KL[p||u]: Abnormality Detection
AUPR (%): 93.77
AUROC (%): 94.15
Prediction Prob: Abnormality Detection
AUPR (%): 93.23
AUROC (%): 93.7
Abnormality base rate (%): 50.37
KL[p||u]: Abnormality Detection (relative to correct examples)
AUPR (%): 95.16
AUROC (%): 94.9
Prediction Prob: Abnormality Detection (relative to correct examples)
AUPR (%): 94.83
AUROC (%): 94.47

Sheer White Noise


In [36]:
print('Sheer White Gaussian Noise\n')

show_ood_detection_results(err, mnist.test.images, np.random.normal(size=(10000, 28*28)))

# caveat: with scale = 100 this test fails, since such noise puts far too much
# energy into the network; inputs at that scale are unrealistic, however
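# a fairer variant (not run here): match the noise to the MNIST pixel range [0, 1]
# noise = np.clip(np.random.normal(loc=0.5, scale=0.25, size=(10000, 28*28)), 0., 1.)
# show_ood_detection_results(err, mnist.test.images, noise)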


Sheer White Gaussian Noise

OOD Example Prediction Probability (mean, std):
0.917362 0.144497

Normality Detection
Normality base rate (%): 50.0
KL[p||u]: Normality Detection
AUPR (%): 88.19
AUROC (%): 88.59
Prediction Prob: Normality Detection
AUPR (%): 88.93
AUROC (%): 88.29
Normality base rate (%): 49.63
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 88.59
AUROC (%): 89.42
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 89.34
AUROC (%): 89.12


Abnormality Detection
Abnormality base rate (%): 50.0
KL[p||u]: Abnormality Detection
AUPR (%): 89.01
AUROC (%): 88.59
Prediction Prob: Abnormality Detection
AUPR (%): 88.9
AUROC (%): 88.29
Abnormality base rate (%): 50.37
KL[p||u]: Abnormality Detection (relative to correct examples)
AUPR (%): 90.86
AUROC (%): 89.42
Prediction Prob: Abnormality Detection (relative to correct examples)
AUPR (%): 90.83
AUROC (%): 89.12

Sheer Uniform Noise (in the positive orthant)


In [37]:
print('Sheer Uniform Noise\n')

show_ood_detection_results(err, mnist.test.images, np.random.uniform(size=(10000, 28*28)))


Sheer Uniform Noise

OOD Example Prediction Probability (mean, std):
0.881227 0.15026

Normality Detection
Normality base rate (%): 50.0
KL[p||u]: Normality Detection
AUPR (%): 98.68
AUROC (%): 98.0
Prediction Prob: Normality Detection
AUPR (%): 98.63
AUROC (%): 97.89
Normality base rate (%): 49.63
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 99.1
AUROC (%): 98.7
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 99.06
AUROC (%): 98.63


Abnormality Detection
Abnormality base rate (%): 50.0
KL[p||u]: Abnormality Detection
AUPR (%): 96.5
AUROC (%): 98.0
Prediction Prob: Abnormality Detection
AUPR (%): 96.06
AUROC (%): 97.89
Abnormality base rate (%): 50.37
KL[p||u]: Abnormality Detection (relative to correct examples)
AUPR (%): 98.02
AUROC (%): 98.7
Prediction Prob: Abnormality Detection (relative to correct examples)
AUPR (%): 97.73
AUROC (%): 98.63