In [1]:
import tensorflow as tf
import numpy as np
import re
import collections
import sklearn.metrics as sk
In [2]:
def load_data(filename='./data/r8-train.txt'):
    '''
    :param filename: the system location of the data to load
    :return: the text (x) as a list of strings with stop words removed,
    and the integer labels (y)
    '''
# stop words taken from nltk
stop_words = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours',
'yourself','yourselves','he','him','his','himself','she','her','hers','herself',
'it','its','itself','they','them','their','theirs','themselves','what','which',
'who','whom','this','that','these','those','am','is','are','was','were','be',
'been','being','have','has','had','having','do','does','did','doing','a','an',
'the','and','but','if','or','because','as','until','while','of','at','by','for',
'with','about','against','between','into','through','during','before','after',
'above','below','to','from','up','down','in','out','on','off','over','under',
'again','further','then','once','here','there','when','where','why','how','all',
'any','both','each','few','more','most','other','some','such','no','nor','not',
'only','own','same','so','than','too','very','s','t','can','will','just','don',
'should','now','d','ll','m','o','re','ve','y','ain','aren','couldn','didn',
'doesn','hadn','hasn','haven','isn','ma','mightn','mustn','needn','shan',
'shouldn','wasn','weren','won','wouldn']
x, y = [], []
with open(filename, "r") as f:
for line in f:
line = re.sub(r'\W+', ' ', line).strip()
x.append(line[1:])
x[-1] = ' '.join(word for word in x[-1].split() if word not in stop_words)
y.append(line[0])
return x, np.array(y, dtype=int)
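# A minimal sketch of the assumed input format (an assumption inferred from the
# parsing above, not from dataset documentation): each line starts with a single
# digit class label followed by the document text.
_example_line = '3\tchampion products approves two one stock split\n'  # hypothetical line
_cleaned = re.sub(r'\W+', ' ', _example_line).strip()
assert _cleaned[0] == '3' and _cleaned[1:].split()[0] == 'champion'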
def get_vocab(dataset):
    '''
    :param dataset: the text from load_data
    :return: an _ordered_ dictionary from words to counts
    '''
    # count every word across the corpus
    vocab = collections.Counter()
    for example in dataset:
        vocab.update(example.split())
    # sort from greatest to least by count
    return collections.OrderedDict(sorted(vocab.items(), key=lambda x: x[1], reverse=True))
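# Quick illustration on a toy corpus (an illustrative sketch; ties in count keep
# first-seen order on CPython >= 3.6, but nothing below relies on tie order):
_toy = ['the cat sat', 'the cat ran', 'dogs ran']
# get_vocab(_toy) -> OrderedDict([('the', 2), ('cat', 2), ('ran', 2), ('sat', 1), ('dogs', 1)])
assert list(get_vocab(_toy).items())[0] == ('the', 2)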
def text_to_rank(dataset, _vocab, desired_vocab_size=1000):
'''
:param dataset: the text from load_data
    :param _vocab: an _ordered_ dictionary of vocab words and counts from get_vocab
    :param desired_vocab_size: the desired vocabulary size;
    words outside the kept vocabulary become UUUNNNKKK
    :return: the text corpus with words mapped to their vocab rank,
    with all sufficiently infrequent words mapped to UUUNNNKKK; UUUNNNKKK has id desired_vocab_size-1
    (the infrequent-word cutoff is determined by desired_vocab_size)
'''
_dataset = dataset[:] # aliasing safeguard
vocab_ordered = list(_vocab)
    count_cutoff = _vocab[vocab_ordered[desired_vocab_size-2]]  # count of the least frequent word kept (the last id is reserved for UUUNNNKKK)
word_to_rank = {}
    for i in range(len(vocab_ordered)):
        word_to_rank[vocab_ordered[i]] = i  # rank 0 is the most frequent word
for i in range(len(_dataset)):
example = _dataset[i]
example_as_list = example.split()
for j in range(len(example_as_list)):
try:
                if _vocab[example_as_list[j]] >= count_cutoff and word_to_rank[example_as_list[j]] < desired_vocab_size:
                    # both conditions matter: ties at the count cutoff could otherwise
                    # let words ranked beyond the desired vocab size slip through
example_as_list[j] = word_to_rank[example_as_list[j]]
else:
example_as_list[j] = desired_vocab_size-1 # UUUNNNKKK
            except KeyError:  # word not present in the training vocabulary
example_as_list[j] = desired_vocab_size-1 # UUUNNNKKK
_dataset[i] = example_as_list
return _dataset
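# Toy illustration (hypothetical corpus): with desired_vocab_size=4, the three
# most frequent words keep ranks 0-2 and everything else collapses to the
# UUUNNNKKK id 3.
_toy = ['the cat sat', 'the cat ran', 'dogs ran']
assert text_to_rank(_toy, get_vocab(_toy), desired_vocab_size=4) == [[0, 1, 3], [0, 1, 2], [3, 2]]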
def text_to_matrix(dataset, _vocab, desired_vocab_size=1000):
    '''
    :return: a multi-hot bag-of-words matrix; row i has a 1 in column j iff a
    word of rank j (or UUUNNNKKK, the last column) occurs in example i
    '''
    sequences = text_to_rank(dataset, _vocab, desired_vocab_size)
    mat = np.zeros((len(sequences), desired_vocab_size), dtype=int)
    for i, seq in enumerate(sequences):
        for token in seq:
            mat[i, token] = 1
    return mat
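# Continuing the toy example: each document becomes a fixed-length multi-hot
# vector over the 4-id vocabulary (3 kept words + UUUNNNKKK).
_toy = ['the cat sat', 'the cat ran', 'dogs ran']
_mat = text_to_matrix(_toy, get_vocab(_toy), desired_vocab_size=4)
assert _mat.tolist() == [[1, 1, 0, 1], [1, 1, 1, 0], [0, 0, 1, 1]]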
In [3]:
def partition_data_in_two(dataset, dataset_labels, in_sample_labels, oos_labels):
    '''
    :param dataset: the text matrix from text_to_matrix
    :param dataset_labels: the dataset labels
    :param in_sample_labels: a list of classes which the network will/did train on
    :param oos_labels: the complement of in_sample_labels; classes the network has never seen
    :return: the dataset partitioned into in_sample_examples, in_sample_labels,
    oos_examples, and oos_labels in that order
    '''
_dataset = dataset[:] # aliasing safeguard
_dataset_labels = dataset_labels
in_sample_idxs = np.zeros(np.shape(_dataset_labels), dtype=bool)
ones_vec = np.ones(np.shape(_dataset_labels), dtype=int)
for label in in_sample_labels:
in_sample_idxs = np.logical_or(in_sample_idxs, _dataset_labels == label * ones_vec)
return _dataset[in_sample_idxs], _dataset_labels[in_sample_idxs],\
_dataset[np.logical_not(in_sample_idxs)], _dataset_labels[np.logical_not(in_sample_idxs)]
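# A quick sanity check on toy arrays (hypothetical labels; the boolean-mask
# indexing assumes numpy inputs, which is what the pipeline passes):
_data = np.arange(10).reshape(5, 2)
_lbls = np.array([0, 1, 2, 1, 0])
_in_x, _in_y, _out_x, _out_y = partition_data_in_two(_data, _lbls, [0, 2], [1])
assert _in_y.tolist() == [0, 2, 0] and _out_y.tolist() == [1, 1]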
In [4]:
# our network trains only on a subset of classes, say 6, but class number 7 might still
# be an in-sample label: we need to squish the labels to be in {0,...,5}
def relabel_in_sample_labels(labels):
    labels_as_list = labels.tolist()
    labels_ordered = sorted(set(labels_as_list))
relabeled = np.zeros(labels.shape, dtype=int)
for i in range(len(labels_as_list)):
relabeled[i] = labels_ordered.index(labels_as_list[i])
return relabeled
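# e.g., if the surviving labels are {1, 4, 7}, they get squished to {0, 1, 2}:
assert relabel_in_sample_labels(np.array([4, 1, 7, 4])).tolist() == [1, 0, 2, 1]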
In [5]:
batch_size = 32
vocab_size = 1000
num_epochs = 5
n_hidden = 512
nclasses_to_exclude = 2  # number of held-out classes (0-3)
In [6]:
random_classes = np.arange(8)
np.random.shuffle(random_classes)
to_include = list(random_classes[:8-nclasses_to_exclude])
to_exclude = list(random_classes[8-nclasses_to_exclude:])
In [7]:
print('Loading Data')
X_train, Y_train = load_data()
X_test, Y_test = load_data('./data/r8-test.txt')
vocab = get_vocab(X_train)
X_train = text_to_matrix(X_train, vocab, vocab_size)
X_test = text_to_matrix(X_test, vocab, vocab_size)
# shuffle
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
Y_train = Y_train[indices]
indices = np.arange(X_test.shape[0])
np.random.shuffle(indices)
X_test = X_test[indices]
Y_test = Y_test[indices]
# split into train/dev
X_dev = X_train[-500:]
Y_dev = Y_train[-500:]
X_train = X_train[:-500]
Y_train = Y_train[:-500]
in_sample_examples, in_sample_labels, oos_examples, oos_labels =\
    partition_data_in_two(X_train, Y_train, to_include, to_exclude)
dev_in_sample_examples, dev_in_sample_labels, dev_oos_examples, dev_oos_labels =\
    partition_data_in_two(X_dev, Y_dev, to_include, to_exclude)
test_in_sample_examples, test_in_sample_labels, test_oos_examples, test_oos_labels =\
    partition_data_in_two(X_test, Y_test, to_include, to_exclude)
# safely assumes every in-sample class has at least one example in both the train and dev splits
in_sample_labels = relabel_in_sample_labels(in_sample_labels)
dev_in_sample_labels = relabel_in_sample_labels(dev_in_sample_labels)
test_in_sample_labels = relabel_in_sample_labels(test_in_sample_labels)
num_examples = in_sample_labels.shape[0]
num_batches = num_examples//batch_size
print('Data loaded')
In [8]:
graph = tf.Graph()
with graph.as_default():
x = tf.placeholder(dtype=tf.float32, shape=[None, vocab_size])
y = tf.placeholder(dtype=tf.int64, shape=[None])
is_training = tf.placeholder(tf.bool)
    # hidden-layer weights: columns are l2-normalized and variance-scaled at initialization
W_h = tf.Variable(tf.nn.l2_normalize(tf.random_normal([vocab_size, n_hidden]), 0)/tf.sqrt(1 + 0.45))
b_h = tf.Variable(tf.zeros([n_hidden]))
def gelu_fast(_x):
return 0.5 * _x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (_x + 0.044715 * tf.pow(_x, 3))))
h = tf.cond(is_training,
lambda: tf.nn.dropout(gelu_fast(tf.matmul(x, W_h) + b_h), 0.5),
lambda: gelu_fast(tf.matmul(x, W_h) + b_h))
W_out = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, 8-nclasses_to_exclude]), 0)/tf.sqrt(0.45 + 1))
b_out = tf.Variable(tf.zeros([8-nclasses_to_exclude]))
logits = tf.matmul(h, W_out) + b_out
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(1e-3, global_step, 4*num_batches, 0.1, staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)
acc = 100*tf.reduce_mean(tf.to_float(tf.equal(tf.argmax(logits, 1), y)))
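# For reference, gelu_fast above is the tanh approximation to the Gaussian Error
# Linear Unit, GELU(x) = x * Phi(x). A plain-Python sanity check of the
# approximation (illustrative only, not part of the training graph):
import math
def gelu_exact(v):
    # exact GELU via the standard normal CDF
    return 0.5 * v * (1.0 + math.erf(v / math.sqrt(2.0)))
assert abs(gelu_exact(1.0) - 0.8413) < 1e-3  # the tanh approximation gives ~0.8412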
In [9]:
# initialize
sess = tf.InteractiveSession(graph=graph)
tf.initialize_all_variables().run()
# create saver to train model
saver = tf.train.Saver(max_to_keep=1)
print('Initialized')
In [10]:
best_acc = 0
for epoch in range(num_epochs):
# shuffle data every epoch
indices = np.arange(num_examples)
np.random.shuffle(indices)
in_sample_examples = in_sample_examples[indices]
in_sample_labels = in_sample_labels[indices]
for i in range(num_batches):
offset = i * batch_size
x_batch = in_sample_examples[offset:offset + batch_size]
y_batch = in_sample_labels[offset:offset + batch_size]
_, l, batch_acc = sess.run([optimizer, loss, acc], feed_dict={x: x_batch, y: y_batch, is_training: True})
curr_dev_acc = sess.run(
acc, feed_dict={x: dev_in_sample_examples, y: dev_in_sample_labels, is_training: False})
if best_acc < curr_dev_acc:
best_acc = curr_dev_acc
saver.save(sess, './data/best_r8_model.ckpt')
print('Epoch %d | Minibatch loss %.3f | Minibatch accuracy %.3f | Dev accuracy %.3f' %
(epoch+1, l, batch_acc, curr_dev_acc))
In [11]:
# restore variables from disk
saver.restore(sess, "./data/best_r8_model.ckpt")
print("Best model restored!")
print('Dev accuracy:', sess.run(acc, feed_dict={x: dev_in_sample_examples, y: dev_in_sample_labels, is_training:False}))
In [13]:
s = tf.nn.softmax(logits)
s_prob = tf.reduce_max(s, reduction_indices=[1], keep_dims=True)
kl_all = tf.log(8. - nclasses_to_exclude)\
+ tf.reduce_sum(s * tf.log(tf.abs(s) + 1e-10), reduction_indices=[1], keep_dims=True)
m_all, v_all = tf.nn.moments(kl_all, axes=[0])
logits_right = tf.boolean_mask(logits, tf.equal(tf.argmax(logits, 1), y))
s_right = tf.nn.softmax(logits_right)
s_right_prob = tf.reduce_max(s_right, reduction_indices=[1], keep_dims=True)
kl_right = tf.log(8. - nclasses_to_exclude)\
+ tf.reduce_sum(s_right * tf.log(tf.abs(s_right) + 1e-10), reduction_indices=[1], keep_dims=True)
m_right, v_right = tf.nn.moments(kl_right, axes=[0])
logits_wrong = tf.boolean_mask(logits, tf.not_equal(tf.argmax(logits, 1), y))
s_wrong = tf.nn.softmax(logits_wrong)
s_wrong_prob = tf.reduce_max(s_wrong, reduction_indices=[1], keep_dims=True)
kl_wrong = tf.log(8. - nclasses_to_exclude)\
+ tf.reduce_sum(s_wrong * tf.log(tf.abs(s_wrong) + 1e-10), reduction_indices=[1], keep_dims=True)
m_wrong, v_wrong = tf.nn.moments(kl_wrong, axes=[0])
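# kl_all above is KL[p||u]: the divergence of the softmax distribution p from the
# uniform distribution u over the K = 8 - nclasses_to_exclude in-sample classes,
# i.e. KL[p||u] = log K + sum_i p_i log p_i; larger values mean a more peaked
# (confident) prediction. A NumPy check on a hypothetical softmax output:
_p = np.array([0.7, 0.1, 0.1, 0.05, 0.03, 0.02])    # K = 6
_kl_pu = np.log(len(_p)) + np.sum(_p * np.log(_p))
assert abs(_kl_pu - 0.748) < 1e-2                   # ~0.75 nats from uniform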
In [14]:
err, kl_a, kl_r, kl_w, s_p, s_rp, s_wp = sess.run(
[100 - acc, kl_all, kl_right, kl_wrong, s_prob, s_right_prob, s_wrong_prob],
feed_dict={x: test_in_sample_examples, y: test_in_sample_labels, is_training: False})
print('Reuters8 (w/class subset) Error (%)| Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):')
print(err, '|', np.mean(s_p), np.std(s_p), '|', np.mean(s_rp), np.std(s_rp), '|', np.mean(s_wp), np.std(s_wp))
print('\nSuccess Detection')
print('Success base rate (%):', round(100-err,2))
print('KL[p||u]: Right/Wrong classification distinction')
safe, risky = kl_r, kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = s_rp, s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
print('\nError Detection')
print('Error base rate (%):', round(err,2))
safe, risky = -kl_r, -kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('KL[p||u]: Right/Wrong classification distinction')
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = -s_rp, -s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
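# The pattern in each block above: stack the detector's scores for the 'safe'
# and 'risky' groups, mark which rows are positives, and let sklearn rank by
# score. A condensed sketch with hypothetical scores:
_safe = np.array([2.1, 1.8, 2.5])    # e.g. KL[p||u] on correctly classified examples
_risky = np.array([0.4, 0.9])        # e.g. KL[p||u] on misclassified examples
_scores = np.concatenate([_safe, _risky])
_lbl = np.concatenate([np.ones_like(_safe), np.zeros_like(_risky)])
assert sk.roc_auc_score(_lbl, _scores) == 1.0  # the score separates the groups perfectly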
In [17]:
def show_ood_detection_results(error_rate_for_in, in_examples, out_examples):
    # note: kl_a, kl_r, s_p, and s_rp are reused from the in-sample test run above
    kl_oos, s_p_oos = sess.run([kl_all, s_prob], feed_dict={x: out_examples, is_training: False})
print('OOD Example Prediction Probability (mean, std):')
print(np.mean(s_p_oos), np.std(s_p_oos))
print('\nNormality Detection')
print('Normality base rate (%):', round(100*in_examples.shape[0]/(
out_examples.shape[0] + in_examples.shape[0]),2))
print('KL[p||u]: Normality Detection')
safe, risky = kl_a, kl_oos
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
print('Prediction Prob: Normality Detection')
safe, risky = s_p, s_p_oos
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
    print('Normality base rate (%):', round(100*(1 - error_rate_for_in/100)*in_examples.shape[0]/
        (out_examples.shape[0] + (1 - error_rate_for_in/100)*in_examples.shape[0]),2))
print('KL[p||u]: Normality Detection (relative to correct examples)')
safe, risky = kl_r, kl_oos
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
print('Prediction Prob: Normality Detection (relative to correct examples)')
safe, risky = s_rp, s_p_oos
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
print('\n\nAbnormality Detection')
print('Abnormality base rate (%):', round(100*out_examples.shape[0]/(
out_examples.shape[0] + in_examples.shape[0]),2))
print('KL[p||u]: Abnormality Detection')
safe, risky = -kl_a, -kl_oos
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
    print('Prediction Prob: Abnormality Detection')
safe, risky = -s_p, -s_p_oos
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
    print('Abnormality base rate (%):', round(100*out_examples.shape[0]/
        (out_examples.shape[0] + (1 - error_rate_for_in/100)*in_examples.shape[0]),2))
    print('KL[p||u]: Abnormality Detection (relative to correct examples)')
safe, risky = -kl_r, -kl_oos
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
    print('Prediction Prob: Abnormality Detection (relative to correct examples)')
safe, risky = -s_rp, -s_p_oos
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))
In [18]:
print('Held-out subjects\n')
show_ood_detection_results(err, test_in_sample_examples, test_oos_examples)