In [1]:
import tensorflow as tf
import numpy as np
import re
import collections
import sklearn.metrics as sk

In [2]:
def load_data(filename='./data/20ng-train.txt'):
    '''
    :param filename: the system location of the data to load
    :return: the text (x) and its label (y)
             the text is a list of words and is not processed
    '''

    # stop words taken from nltk
    stop_words = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours',
                  'yourself','yourselves','he','him','his','himself','she','her','hers','herself',
                  'it','its','itself','they','them','their','theirs','themselves','what','which',
                  'who','whom','this','that','these','those','am','is','are','was','were','be',
                  'been','being','have','has','had','having','do','does','did','doing','a','an',
                  'the','and','but','if','or','because','as','until','while','of','at','by','for',
                  'with','about','against','between','into','through','during','before','after',
                  'above','below','to','from','up','down','in','out','on','off','over','under',
                  'again','further','then','once','here','there','when','where','why','how','all',
                  'any','both','each','few','more','most','other','some','such','no','nor','not',
                  'only','own','same','so','than','too','very','s','t','can','will','just','don',
                  'should','now','d','ll','m','o','re','ve','y','ain','aren','couldn','didn',
                  'doesn','hadn','hasn','haven','isn','ma','mightn','mustn','needn','shan',
                  'shouldn','wasn','weren','won','wouldn']

    x, y = [], []
    # each line is expected to look like '<label> <text...>'; the label is the
    # first whitespace-delimited token and may be more than one digit
    with open(filename, "r") as f:
        for line in f:
            line = re.sub(r'\W+', ' ', line).strip()
            label, _, text = line.partition(' ')
            x.append(' '.join(word for word in text.split() if word not in stop_words))
            y.append(label)
    return x, np.array(y, dtype=int)
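
# illustrative usage on a hypothetical toy file (not part of the pipeline):
#   with open('/tmp/toy.txt', 'w') as f:
#       f.write('3 The quick brown fox\n12 Dogs are running fast\n')
#   tx, ty = load_data('/tmp/toy.txt')
#   # tx == ['The quick brown fox', 'Dogs running fast']   <- stop-word matching is case-sensitive
#   # ty == array([ 3, 12])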

def get_vocab(dataset):
    '''
    :param dataset: the text from load_data

    :return: an _ordered_ dictionary from words to counts
    '''
    # count every word across the corpus
    vocab = collections.Counter()
    for example in dataset:
        vocab.update(example.split())

    # sort from greatest to least by count
    return collections.OrderedDict(sorted(vocab.items(), key=lambda x: x[1], reverse=True))

def text_to_rank(dataset, _vocab, desired_vocab_size=15000):
    '''
    :param dataset: the text from load_data
    :param _vocab: an _ordered_ dictionary of vocab words and counts from get_vocab
    :param desired_vocab_size: the desired vocabulary size
    :return: the text corpus with words mapped to their vocab rank;
    all sufficiently infrequent words map to UUUNNNKKK, which has rank desired_vocab_size
    (the infrequent-word cutoff is determined by desired_vocab_size)
    '''
    _dataset = dataset[:]     # aliasing safeguard
    vocab_ordered = list(_vocab)
    count_cutoff = _vocab[vocab_ordered[desired_vocab_size-1]]  # the count of the word at the cutoff rank

    word_to_rank = {}
    for i in range(len(vocab_ordered)):
        # add one to make room for a padding symbol with value 0
        word_to_rank[vocab_ordered[i]] = i + 1

    for i in range(len(_dataset)):
        example_as_list = _dataset[i].split()
        for j in range(len(example_as_list)):
            try:
                if _vocab[example_as_list[j]] >= count_cutoff and word_to_rank[example_as_list[j]] < desired_vocab_size:
                    # words tied at the count cutoff but ranked at or beyond desired_vocab_size
                    # must still map to UUUNNNKKK, hence the rank check
                    example_as_list[j] = word_to_rank[example_as_list[j]]
                else:
                    example_as_list[j] = desired_vocab_size  # UUUNNNKKK
            except KeyError:  # word absent from the training vocabulary
                example_as_list[j] = desired_vocab_size  # UUUNNNKKK
        _dataset[i] = example_as_list

    return _dataset
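
# quick sanity check (illustrative, not part of the original pipeline), with
# stop words already removed as load_data would produce:
#   toy = ['cat sat mat', 'cat ran', 'dog ran fast']
#   toy_vocab = get_vocab(toy)                          # cat:2, ran:2, then the count-1 words
#   text_to_rank(toy, toy_vocab, desired_vocab_size=3)
#   # the two most frequent words keep ranks 1 and 2;
#   # every other word maps to 3, the UUUNNNKKK id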

In [3]:
# taken from keras
def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
    '''Pads each sequence to the same length:
    the length of the longest sequence.
    If maxlen is provided, any sequence longer
    than maxlen is truncated to maxlen.
    Truncation happens off either the beginning (default) or
    the end of the sequence.
    Supports post-padding and pre-padding (default).
    # Arguments
        sequences: list of lists where each element is a sequence
        maxlen: int, maximum length
        dtype: type to cast the resulting sequence.
        padding: 'pre' or 'post', pad either before or after each sequence.
        truncating: 'pre' or 'post', remove values from sequences larger than
            maxlen either in the beginning or in the end of the sequence
        value: float, the value to pad the sequences with.
    # Returns
        x: numpy array with dimensions (number_of_sequences, maxlen)
    '''
    lengths = [len(s) for s in sequences]

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non-empty sequence;
    # consistency is checked in the main loop below
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x
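
# illustrative check of the default pre-padding / pre-truncation behavior:
#   pad_sequences([[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]], maxlen=4)
#   # array([[ 0,  1,  2,  3],
#   #        [ 0,  0,  4,  5],
#   #        [ 7,  8,  9, 10]], dtype=int32)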

In [4]:
def partion_data_in_two(dataset, dataset_labels, in_sample_labels, oos_labels):
    '''
    :param dataset: the text from text_to_rank
    :param dataset_labels: dataset labels
    :param in_sample_labels: a list of newsgroups which the network will/did train on
    :param oos_labels: the complement of in_sample_labels; these newsgroups the network has never seen
    :return: the dataset partitioned into in_sample_examples, in_sample_labels,
    oos_examples, and oos_labels in that order
    '''
    _dataset = dataset[:]     # aliasing safeguard (a view for numpy arrays, but nothing below mutates it)
    _dataset_labels = dataset_labels
    
    in_sample_idxs = np.zeros(np.shape(_dataset_labels), dtype=bool)
    ones_vec = np.ones(np.shape(_dataset_labels), dtype=int)
    for label in in_sample_labels:
        in_sample_idxs = np.logical_or(in_sample_idxs, _dataset_labels == label * ones_vec)

    
    return _dataset[in_sample_idxs], _dataset_labels[in_sample_idxs],\
        _dataset[np.logical_not(in_sample_idxs)], _dataset_labels[np.logical_not(in_sample_idxs)]
    
# the network trains on only a subset of the classes, say 6 of them, but a class
# numbered 7 might still be an in-sample label, so we squish the labels into {0,...,5}
def relabel_in_sample_labels(labels):
    labels_as_list = labels.tolist()

    # map each original label to its index among the sorted unique labels
    labels_ordered = sorted(set(labels_as_list))

    relabeled = np.zeros(labels.shape, dtype=int)
    for i in range(len(labels_as_list)):
        relabeled[i] = labels_ordered.index(labels_as_list[i])

    return relabeled
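
# for instance (illustrative):
#   relabel_in_sample_labels(np.array([3, 7, 3, 12]))
#   # returns array([0, 1, 0, 2]), since the sorted unique labels are [3, 7, 12]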

In [5]:
max_example_len = 1000
batch_size = 32
embedding_dims = 30   # TODO: change to 50 and see what happens
vocab_size = 15000
num_epochs = 20
hidden_dim = 128
nclasses_to_exclude = 5  # anywhere from 0 to 18 (at least two in-sample classes are needed)

In [6]:
random_classes = np.arange(20)
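# note: the split below is random; call np.random.seed(...) first for a reproducible partition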
np.random.shuffle(random_classes)
to_include = list(random_classes[:20-nclasses_to_exclude])
to_exclude = list(random_classes[20-nclasses_to_exclude:])

In [7]:
print('Loading Data')
X_train, Y_train = load_data()
X_test, Y_test = load_data('./data/20ng-test.txt')

vocab = get_vocab(X_train)
X_train = text_to_rank(X_train, vocab, vocab_size)
X_train = pad_sequences(X_train, maxlen=max_example_len)
X_test = text_to_rank(X_test, vocab, vocab_size)
X_test = pad_sequences(X_test, maxlen=max_example_len)

# shuffle
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
Y_train = Y_train[indices]

indices = np.arange(X_test.shape[0])
np.random.shuffle(indices)
X_test = X_test[indices]
Y_test = Y_test[indices]

# split into train/dev
X_dev = X_train[-1500:]
Y_dev = Y_train[-1500:]
X_train = X_train[:-1500]
Y_train = Y_train[:-1500]

in_sample_examples, in_sample_labels, oos_examples, oos_labels =\
partion_data_in_two(X_train, Y_train, to_include, to_exclude)
dev_in_sample_examples, dev_in_sample_labels, dev_oos_examples, dev_oos_labels =\
partion_data_in_two(X_dev, Y_dev, to_include, to_exclude)
test_in_sample_examples, test_in_sample_labels, test_oos_examples, test_oos_labels =\
partion_data_in_two(X_test, Y_test, to_include, to_exclude)

# safely assumes there is an example for each in-sample class in the training, dev, and test sets
in_sample_labels = relabel_in_sample_labels(in_sample_labels)
dev_in_sample_labels = relabel_in_sample_labels(dev_in_sample_labels)
test_in_sample_labels = relabel_in_sample_labels(test_in_sample_labels)

num_examples = in_sample_labels.shape[0]
num_batches = num_examples//batch_size

print('Data loaded')


Loading Data
Data loaded

In [10]:
graph = tf.Graph()

with graph.as_default():
    x = tf.placeholder(dtype=tf.int32, shape=[None, max_example_len])
    y = tf.placeholder(dtype=tf.int64, shape=[None])
#     is_training = tf.placeholder(tf.bool)
    
    # add one to vocab size for the padding symbol
    W_embedding = tf.Variable(
        tf.random_uniform([vocab_size+1, embedding_dims])/tf.sqrt((vocab_size+1+embedding_dims)/6.),
        trainable=True)
    
    w_vecs = tf.nn.embedding_lookup(W_embedding, x)
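    # mean-pool the word embeddings over time: a neural bag-of-words representation
    # (note: the padding symbol's embedding, index 0, is averaged in as well)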
    pooled = tf.reduce_mean(w_vecs, reduction_indices=[1])
    
    W_out = tf.Variable(tf.nn.l2_normalize(tf.random_normal([embedding_dims, 20-nclasses_to_exclude]), 0))
    b_out = tf.Variable(tf.zeros([20-nclasses_to_exclude]))
    
    logits = tf.matmul(pooled, W_out) + b_out
    
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y))

    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(1e-2, global_step, 15*num_batches, 0.1, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)

    acc = 100*tf.reduce_mean(tf.to_float(tf.equal(tf.argmax(logits, 1), y)))

In [11]:
# initialize
sess = tf.InteractiveSession(graph=graph)
tf.initialize_all_variables().run()
# create saver to train model
saver = tf.train.Saver(max_to_keep=1)

print('Initialized')


Initialized

In [12]:
best_acc = 0

for epoch in range(num_epochs):
    # shuffle data every epoch
    indices = np.arange(num_examples)
    np.random.shuffle(indices)
    in_sample_examples = in_sample_examples[indices]
    in_sample_labels = in_sample_labels[indices]

    for i in range(num_batches):
        offset = i * batch_size

        x_batch = in_sample_examples[offset:offset + batch_size]
        y_batch = in_sample_labels[offset:offset + batch_size]

        _, l, batch_acc = sess.run([optimizer, loss, acc], feed_dict={x: x_batch, y: y_batch})

    curr_dev_acc = sess.run(
        acc, feed_dict={x: dev_in_sample_examples, y: dev_in_sample_labels})
    if best_acc < curr_dev_acc:
        best_acc = curr_dev_acc
        saver.save(sess, './data/best_newsgroup_model.ckpt')

    print('Epoch %d | Minibatch loss %.3f | Minibatch accuracy %.3f | Dev accuracy %.3f' %
          (epoch+1, l, batch_acc, curr_dev_acc))


Epoch 1 | Minibatch loss 1.465 | Minibatch accuracy 56.250 | Dev accuracy 62.717
Epoch 2 | Minibatch loss 1.242 | Minibatch accuracy 68.750 | Dev accuracy 62.796
Epoch 3 | Minibatch loss 0.663 | Minibatch accuracy 75.000 | Dev accuracy 63.823
Epoch 4 | Minibatch loss 0.866 | Minibatch accuracy 84.375 | Dev accuracy 65.877
Epoch 5 | Minibatch loss 0.746 | Minibatch accuracy 78.125 | Dev accuracy 73.065
Epoch 6 | Minibatch loss 0.647 | Minibatch accuracy 78.125 | Dev accuracy 77.962
Epoch 7 | Minibatch loss 0.327 | Minibatch accuracy 93.750 | Dev accuracy 82.385
Epoch 8 | Minibatch loss 0.238 | Minibatch accuracy 87.500 | Dev accuracy 72.828
Epoch 9 | Minibatch loss 0.459 | Minibatch accuracy 84.375 | Dev accuracy 80.332
Epoch 10 | Minibatch loss 0.167 | Minibatch accuracy 96.875 | Dev accuracy 84.202
Epoch 11 | Minibatch loss 0.047 | Minibatch accuracy 100.000 | Dev accuracy 86.414
Epoch 12 | Minibatch loss 0.249 | Minibatch accuracy 100.000 | Dev accuracy 86.098
Epoch 13 | Minibatch loss 0.191 | Minibatch accuracy 93.750 | Dev accuracy 91.074
Epoch 14 | Minibatch loss 0.408 | Minibatch accuracy 78.125 | Dev accuracy 89.573
Epoch 15 | Minibatch loss 0.246 | Minibatch accuracy 90.625 | Dev accuracy 89.731
Epoch 16 | Minibatch loss 0.087 | Minibatch accuracy 100.000 | Dev accuracy 93.523
Epoch 17 | Minibatch loss 0.159 | Minibatch accuracy 93.750 | Dev accuracy 95.024
Epoch 18 | Minibatch loss 0.226 | Minibatch accuracy 96.875 | Dev accuracy 93.207
Epoch 19 | Minibatch loss 0.107 | Minibatch accuracy 100.000 | Dev accuracy 92.338
Epoch 20 | Minibatch loss 0.115 | Minibatch accuracy 96.875 | Dev accuracy 94.392

In [13]:
# restore variables from disk
saver.restore(sess, "./data/best_newsgroup_model.ckpt")
print("Best model restored!")

print('Dev accuracy:', sess.run(acc, feed_dict={x: dev_in_sample_examples, y: dev_in_sample_labels}))


Best model restored!
Dev accuracy: 95.0237

In [15]:
s = tf.nn.softmax(logits)
s_prob = tf.reduce_max(s, reduction_indices=[1], keep_dims=True)
kl_all = tf.log(20. - nclasses_to_exclude)\
        + tf.reduce_sum(s * tf.log(tf.abs(s) + 1e-10), reduction_indices=[1], keep_dims=True)
m_all, v_all = tf.nn.moments(kl_all, axes=[0])

logits_right = tf.boolean_mask(logits, tf.equal(tf.argmax(logits, 1), y))
s_right = tf.nn.softmax(logits_right)
s_right_prob = tf.reduce_max(s_right, reduction_indices=[1], keep_dims=True)
kl_right = tf.log(20. - nclasses_to_exclude)\
         + tf.reduce_sum(s_right * tf.log(tf.abs(s_right) + 1e-10), reduction_indices=[1], keep_dims=True)
m_right, v_right = tf.nn.moments(kl_right, axes=[0])

logits_wrong = tf.boolean_mask(logits, tf.not_equal(tf.argmax(logits, 1), y))
s_wrong = tf.nn.softmax(logits_wrong)
s_wrong_prob = tf.reduce_max(s_wrong, reduction_indices=[1], keep_dims=True)
kl_wrong = tf.log(20. - nclasses_to_exclude)\
           + tf.reduce_sum(s_wrong * tf.log(tf.abs(s_wrong) + 1e-10), reduction_indices=[1], keep_dims=True)
m_wrong, v_wrong = tf.nn.moments(kl_wrong, axes=[0])
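
# sanity check of the identity used above (illustrative, plain numpy):
# KL(p || u) = sum_i p_i * log(p_i / (1/K)) = log(K) + sum_i p_i * log(p_i)
#   p = np.array([0.7, 0.2, 0.1]); K = p.shape[0]
#   np.allclose(np.sum(p * np.log(p * K)), np.log(K) + np.sum(p * np.log(p)))  # True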

In [16]:
err, kl_a, kl_r, kl_w, s_p, s_rp, s_wp = sess.run(
    [100 - acc, kl_all, kl_right, kl_wrong, s_prob, s_right_prob, s_wrong_prob],
    feed_dict={x: test_in_sample_examples, y: test_in_sample_labels})

print('20 NG (w/class subset) Error (%) | Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):')
print(err, '|', np.mean(s_p), np.std(s_p), '|', np.mean(s_rp), np.std(s_rp), '|', np.mean(s_wp), np.std(s_wp))

print('\nSuccess Detection')
print('Success base rate (%):', round(100-err,2))
print('KL[p||u]: Right/Wrong classification distinction')
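# label 1 marks the correctly classified ("safe") examples; a more peaked softmax
# gives a larger KL[p||u], which should rank correct predictions first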
safe, risky = kl_r, kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = s_rp, s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


print('\nError Detection')
print('Error base rate (%):', round(err,2))
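# the scores are negated below so that larger values indicate likely errors (label 1)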
safe, risky = -kl_r, -kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('KL[p||u]: Right/Wrong classification distinction')
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = -s_rp, -s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


20 NG (w/class subset) Error (%) | Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):
7.3129 | 0.861871 0.205038 | 0.888052 0.178718 | 0.530052 0.226419

Success Detection
Success base rate (%): 92.69
KL[p||u]: Right/Wrong classification distinction
AUPR (%): 98.79
AUROC (%): 87.43
Prediction Prob: Right/Wrong classification distinction
AUPR (%): 98.9
AUROC (%): 88.68

Error Detection
Error base rate (%): 7.31
KL[p||u]: Right/Wrong classification distinction
AUPR (%): 37.91
AUROC (%): 87.43
Prediction Prob: Right/Wrong classification distinction
AUPR (%): 41.66
AUROC (%): 88.68

In [17]:
def show_ood_detection_results(error_rate_for_in, in_examples, out_examples):
    # relies on kl_a, kl_r, s_p, and s_rp computed for the in-sample test set in the cells above
    kl_oos, s_p_oos = sess.run([kl_all, s_prob], feed_dict={x: out_examples})

    print('OOD Example Prediction Probability (mean, std):')
    print(np.mean(s_p_oos), np.std(s_p_oos))

    print('\nNormality Detection')
    print('Normality base rate (%):', round(100*in_examples.shape[0]/(
                out_examples.shape[0] + in_examples.shape[0]),2))
    print('KL[p||u]: Normality Detection')
    safe, risky = kl_a, kl_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Normality Detection')
    safe, risky = s_p, s_p_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Normality base rate (%):', round(100*(1 - error_rate_for_in/100)*in_examples.shape[0]/
          (out_examples.shape[0] + (1 - error_rate_for_in/100)*in_examples.shape[0]),2))
    print('KL[p||u]: Normality Detection (relative to correct examples)')
    safe, risky = kl_r, kl_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Normality Detection (relative to correct examples)')
    safe, risky = s_rp, s_p_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


    print('\n\nAbnormality Detection')
    print('Abnormality base rate (%):', round(100*out_examples.shape[0]/(
                out_examples.shape[0] + in_examples.shape[0]),2))
    print('KL[p||u]: Abnormality Detection')
    safe, risky = -kl_a, -kl_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Abnormality Detection')
    safe, risky = -s_p, -s_p_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Abnormality base rate (%):', round(100*out_examples.shape[0]/
          (out_examples.shape[0] + (1 - error_rate_for_in/100)*in_examples.shape[0]),2))
    print('KL[p||u]: Abnormality Detection (relative to correct examples)')
    safe, risky = -kl_r, -kl_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Abnormality Detection (relative to correct examples)')
    safe, risky = -s_rp, -s_p_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

In [18]:
print('Held-out subjects\n')
show_ood_detection_results(err, test_in_sample_examples, test_oos_examples)


Held-out subjects

OOD Example Prediction Probability (mean, std):
0.645483 0.272967

Normality Detection
Normality base rate (%): 85.37
KL[p||u]: Normality Detection
AUPR (%): 92.21
AUROC (%): 72.1
Prediction Prob: Normality Detection
AUPR (%): 92.27
AUROC (%): 72.24
Normality base rate (%): 84.4
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 92.31
AUROC (%): 74.68
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 92.4
AUROC (%): 74.97


Abnormality Detection
Abnormality base rate (%): 14.63
KL[p||u]: Abnormality Detection
AUPR (%): 33.48
AUROC (%): 72.1
Prediction Prob: Abnormality Detection
AUPR (%): 33.99
AUROC (%): 72.24
Abnormality base rate (%): 15.6
KL[p||u]: Abnormality Detection (relative to correct examples)
AUPR (%): 43.28
AUROC (%): 74.68
Prediction Prob: Abnormality Detection (relative to correct examples)
AUPR (%): 44.74
AUROC (%): 74.97