The code sets up the model, evaluates the effectiveness of softmax information alone, and after shows the improvement gained from an abnormality module.


In [1]:
import tensorflow as tf
import numpy as np
import h5py as h5
import sklearn.metrics as sk

In [2]:
# training parameters
learning_rate = 0.001
training_epochs = 20
batch_size = 32

# architecture parameters
n_hidden = 1024
n_labels = 39   # 39 phones
n_coeffs = 26
n_context_frames = 11   # 5 + 1 + 5
p = 0.75        # keep rate

In [3]:
def enumerate_context(i, sentence, num_frames):
    r = range(i-num_frames, i+num_frames+1)
    r = [x if x>=0 else 0 for x in r]
    r = [x if x<len(sentence) else len(sentence)-1 for x in r]
    return sentence[r]

def add_context(sentence, num_frames=11):
    # [sentence_length, coefficients] -> [sentence_length, num_frames, coefficients]

    assert num_frames % 2 == 1, "Number of frames must be odd (since left + 1 + right, left = right)"

    if num_frames == 1:
        return sentence

    context_sent = []

    for i in range(0, len(sentence)):
        context_sent.append([context for context in enumerate_context(i, sentence, (num_frames-1)//2)])

    return np.array(context_sent).reshape((-1, num_frames*n_coeffs))

In [4]:
graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder(dtype=tf.float32, shape=[None, n_coeffs*n_context_frames])
    y = tf.placeholder(dtype=tf.int64, shape=[None])
    risk_labels = tf.placeholder(dtype=tf.float32, shape=[None])
    is_training = tf.placeholder(tf.bool)

    # nonlinearity
    def gelu_fast(_x):
        return 0.5 * _x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (_x + 0.044715 * tf.pow(_x, 3))))
    f = gelu_fast

    W = {}
    b = {}

    with tf.variable_scope("in_sample"):
        W['1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_context_frames*n_coeffs, n_hidden]), 0)/tf.sqrt(1 + p*0.425))
        W['2'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, n_hidden]), 0)/tf.sqrt(0.425/p + p*0.425))
        W['3'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, n_hidden]), 0)/tf.sqrt(0.425/p + p*0.425))
        W['logits'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, n_labels]), 0)/tf.sqrt(0.425/p + 1))
        b['1'] = tf.Variable(tf.zeros([n_hidden]))
        b['2'] = tf.Variable(tf.zeros([n_hidden]))
        b['3'] = tf.Variable(tf.zeros([n_hidden]))
        b['logits'] = tf.Variable(tf.zeros([n_labels]))

        W['bottleneck'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, n_hidden//2]), 0)/tf.sqrt(0.425/p + 0.425))
        W['decode1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden//2, n_hidden]), 0)/tf.sqrt(0.425 + p*0.425))
        W['decode2'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, n_hidden]), 0)/tf.sqrt(0.425/p + 0.425*p))
        W['reconstruction'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, n_context_frames*n_coeffs]), 0)/tf.sqrt(0.425/p + 1))
        b['bottleneck'] = tf.Variable(tf.zeros([n_hidden//2]))
        b['decode1'] = tf.Variable(tf.zeros([n_hidden]))
        b['decode2'] = tf.Variable(tf.zeros([n_hidden]))
        b['reconstruction'] = tf.Variable(tf.zeros([n_context_frames*n_coeffs]))

    with tf.variable_scope("out_of_sample"):
        W['residual_to_risk1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_context_frames*n_coeffs, n_hidden//2]), 0)/tf.sqrt(1 + 0.425))
        W['hidden_to_risk1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden, n_hidden//2]), 0)/tf.sqrt(0.425/p + 0.425))
        W['logits_to_risk1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_labels, n_hidden//2]), 0)/tf.sqrt(1 + 0.425))
        W['risk2'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_hidden//2, 128]), 0)/tf.sqrt(0.425 + 0.425))
        W['risk'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([128, 1]), 0)/tf.sqrt(0.425 + 1))

        b['risk1'] = tf.Variable(tf.zeros([n_hidden//2]))
        b['risk2'] = tf.Variable(tf.zeros([128]))
        b['risk'] = tf.Variable(tf.zeros([1]))

    def feedforward(x):
        h1 = f(tf.matmul(x, W['1']) + b['1'])
        h1 = tf.cond(is_training, lambda: tf.nn.dropout(h1, p), lambda: h1)
        h2 = f(tf.matmul(h1, W['2']) + b['2'])
        h2 = tf.cond(is_training, lambda: tf.nn.dropout(h2, p), lambda: h2)
        h3 = f(tf.matmul(h2, W['3']) + b['3'])
        h3 = tf.cond(is_training, lambda: tf.nn.dropout(h3, p), lambda: h3)
        out = tf.matmul(h3, W['logits']) + b['logits']

        hidden_to_bottleneck = f(tf.matmul(h2, W['bottleneck']) + b['bottleneck'])
        d1 = f(tf.matmul(hidden_to_bottleneck, W['decode1']) + b['decode1'])
        d1 = tf.cond(is_training, lambda: tf.nn.dropout(d1, p), lambda: d1)
        d2 = f(tf.matmul(d1, W['decode2']) + b['decode2'])
        d2 = tf.cond(is_training, lambda: tf.nn.dropout(d2, p), lambda: d2)
        recreation = tf.matmul(d2, W['reconstruction']) + b['reconstruction']

        risk1 = f(tf.matmul(out, W['logits_to_risk1']) +
                  tf.matmul(tf.square(x - recreation), W['residual_to_risk1']) +
                  tf.matmul(h2, W['hidden_to_risk1']) + b['risk1'])
        risk2 = f(tf.matmul(risk1, W['risk2']) + b['risk2'])
        risk_out = tf.matmul(risk2, W['risk'])

        return out, recreation, tf.squeeze(risk_out)

    logits, reconstruction, risk = feedforward(x)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y)) +\
           0.1 * tf.reduce_mean(tf.square(x - reconstruction)) +\
           1e-4*(tf.nn.l2_loss(W['1']) + tf.nn.l2_loss(W['2']) + tf.nn.l2_loss(W['3']) +
                 tf.nn.l2_loss(W['bottleneck']) + tf.nn.l2_loss(W['decode1']) + tf.nn.l2_loss(W['decode2']))

    lr = tf.constant(learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

    compute_error = tf.reduce_mean(tf.to_float(tf.not_equal(tf.argmax(logits, 1), y)))

In [6]:
print('Loading Data')
data = h5.File("train.h5")
X_train = data['X'][()]
Y_train = data['y'][()]
train_idxs = data['start_idx'][()]

# get validation set
X_val = X_train[-500:]
Y_val = Y_train[-500:]
val_idxs = train_idxs[-500:]
X_train = X_train[:-500]
Y_train = Y_train[:-500]
train_idxs = train_idxs[:-500]

train_mean = np.mean(X_train, axis=(0,1))
train_std = np.std(X_train, axis=(0,1))
X_train -= train_mean
X_train /= (train_std + 1e-11)

# NOTE: the test set is not the core test set but the entire, so the it's easier
data = h5.File("test.h5")
X_test = data['X'][()] - train_mean
Y_test = data['y'][()]
test_idxs = data['start_idx'][()]
X_test -= train_mean
X_test /= (train_std + 1e-11)
del data
print('Number of training examples', X_train.shape[0])
print('Number of validation examples', X_val.shape[0])
print('Number of testing examples', X_test.shape[0])


Loading Data
Number of training examples 4767
Number of validation examples 500
Number of testing examples 1907

In [7]:
sess = tf.InteractiveSession(graph=graph)

In [8]:
in_sample_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "in_sample")
out_of_sample_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "out_of_sample")
sess.run(tf.initialize_variables(set(tf.all_variables()) - set(out_of_sample_vars)))

risk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(risk, risk_labels))
phase2_vars = list(set(tf.all_variables()) - set(in_sample_vars))
risk_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(risk_loss, var_list=phase2_vars)
sess.run(tf.initialize_variables(set(tf.all_variables()) - set(in_sample_vars)))

compute_risk_error = tf.reduce_mean(tf.to_float(tf.not_equal(tf.to_int64(tf.round(tf.sigmoid(risk))),
                                                             tf.to_int64(tf.round(risk_labels)))))

# could collapse this into an "initialize all" statement but that might have less fecundity

In [16]:
sess.close()

In [9]:
saver = tf.train.Saver(max_to_keep=1)
saver.restore(sess, "./fcn.ckpt")

print('Model restored')


Model restored

Softmax Information


In [10]:
s = tf.nn.softmax(logits)
s_prob = tf.reduce_max(s, reduction_indices=[1], keep_dims=True)
kl_all = tf.log(39.) + tf.reduce_sum(s * tf.log(tf.abs(s) + 1e-11), reduction_indices=[1], keep_dims=True)
m_all, v_all = tf.nn.moments(kl_all, axes=[0])

logits_right = tf.boolean_mask(logits, tf.equal(tf.argmax(logits, 1), y))
s_right = tf.nn.softmax(logits_right)
s_right_prob = tf.reduce_max(s_right, reduction_indices=[1], keep_dims=True)
kl_right = tf.log(39.) + tf.reduce_sum(s_right * tf.log(tf.abs(s_right) + 1e-11), reduction_indices=[1], keep_dims=True)
m_right, v_right = tf.nn.moments(kl_right, axes=[0])

logits_wrong = tf.boolean_mask(logits, tf.not_equal(tf.argmax(logits, 1), y))
s_wrong = tf.nn.softmax(logits_wrong)
s_wrong_prob = tf.reduce_max(s_wrong, reduction_indices=[1], keep_dims=True)
kl_wrong = tf.log(39.) + tf.reduce_sum(s_wrong * tf.log(tf.abs(s_wrong) + 1e-11), reduction_indices=[1], keep_dims=True)
m_wrong, v_wrong = tf.nn.moments(kl_wrong, axes=[0])

In [11]:
kl_a, kl_r, kl_w, s_p, s_rp, s_wp = [], [], [], [], [], []
err_total = 0

for i in range(X_test.shape[0]//batch_size):
    offset = i * batch_size

    _bx, mask_x, _by = X_test[offset:offset+batch_size], test_idxs[offset:offset+batch_size], Y_test[offset:offset+batch_size]

    bx, by = [], []
    for i in range(_bx.shape[0]):
        sentence_frames = add_context(_bx[i][mask_x[i]:])
        bx.append(sentence_frames)
        by.append(_by[i][mask_x[i]:])

    bx, by = np.concatenate(bx), np.concatenate(by)

    err, kl_a_curr, kl_r_curr, kl_w_curr, s_p_curr, s_rp_curr, s_wp_curr = sess.run(
        [100*compute_error, kl_all, kl_right, kl_wrong, s_prob, s_right_prob, s_wrong_prob],
        feed_dict={x: bx, y: by, is_training: False})

    kl_a.append(kl_a_curr)
    kl_r.append(kl_r_curr)
    kl_w.append(kl_w_curr)
    s_p.append(s_p_curr)
    s_rp.append(s_rp_curr)
    s_wp.append(s_wp_curr)
    err_total += err

err_total /= X_test.shape[0]//batch_size
kl_a = np.concatenate(kl_a)
kl_r = np.concatenate(kl_r)
kl_w = np.concatenate(kl_w)
s_p = np.concatenate(s_p)
s_rp = np.concatenate(s_rp)
s_wp = np.concatenate(s_wp)

In [13]:
print('Frame Error (%)| Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):')
print(err_total, '|', np.mean(s_p), np.std(s_p), '|', np.mean(s_rp), np.std(s_rp), '|', np.mean(s_wp), np.std(s_wp))

print('\nSuccess Detection')
print('Success base rate (%):', round(100-err_total,2))
print('KL[p||u]: Right/Wrong classification distinction')
safe, risky = kl_r, kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = s_rp, s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


print('\nError Detection')
print('Error base rate (%):', round(err_total,2))
safe, risky = -kl_r, -kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('KL[p||u]: Right/Wrong classification distinction')
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = -s_rp, -s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


Frame Error (%)| Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):
29.6853021201 | 0.759521 0.230741 | 0.82219 0.194918 | 0.611037 0.240867

Success Detection
Success base rate (%): 70.31
KL[p||u]: Right/Wrong classification distinction
AUPR (%): 87.8
AUROC (%): 75.54
Prediction Prob: Right/Wrong classification distinction
AUPR (%): 87.99
AUROC (%): 76.14

Error Detection
Error base rate (%): 29.69
KL[p||u]: Right/Wrong classification distinction
AUPR (%): 54.25
AUROC (%): 75.54
Prediction Prob: Right/Wrong classification distinction
AUPR (%): 56.42
AUROC (%): 76.14

The base rates are incorrectly printed.


In [24]:
for oos_name in ['airport', 'babble', 'car', 'exhibition', 'restaurant', 'subway', 'street', 'train']:
    
    data = h5.File("test_" + oos_name + ".h5")     # real noise at a volume of 30%
    oos_x = data['X'][()]
    oos_y = data['y'][()]
    oos_idxs = data['start_idx'][()]
    oos_x -= train_mean
    oos_x /= (train_std + 1e-11)
    
    kl_oos = []
    s_p_oos = []
    
    for i in range(oos_x.shape[0]//batch_size):
        offset = i * batch_size
        
        _bx, mask_x, _by = oos_x[offset:offset+batch_size], oos_idxs[offset:offset+batch_size], oos_y[offset:offset+batch_size]

        bx, by = [], []
        for i in range(_bx.shape[0]):
            sentence_frames = add_context(_bx[i][mask_x[i]:])
            bx.append(sentence_frames)
            by.append(_by[i][mask_x[i]:])

        bx, by = np.concatenate(bx), np.concatenate(by)
        
        kl_oos_curr, s_p_oos_curr = sess.run([kl_all, s_prob], feed_dict={x: bx, is_training: False})

        kl_oos.append(kl_oos_curr)
        s_p_oos.append(s_p_oos_curr)

    print('\n\n' + oos_name, 'Example Prediction Probability (mean, std):')
    print(np.mean(np.concatenate(s_p_oos)), np.std(np.concatenate(s_p_oos)))

    print('\nNormality Detection')
    print('Normality base rate (%):', round(50,2))
    print('KL[p||u]: Normality Detection')
    safe, risky = kl_a, np.concatenate(kl_oos)
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Normality Detection')
    safe, risky = s_p, np.concatenate(s_p_oos)
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Normality base rate (%):', round(100*1./(1 + 1 - err_total/100),2))
    print('KL[p||u]: Normality Detection (relative to correct examples)')
    safe, risky = kl_r, np.concatenate(kl_oos)
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Normality Detection (relative to correct examples)')
    safe, risky = s_rp, np.concatenate(s_p_oos)
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


    print('\nAbnormality Detection')
    print('Abnormality base rate (%):', round(50,2))
    print('KL[p||u]: Abnormality Detection')
    safe, risky = -kl_a, -np.concatenate(kl_oos)
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Normality Detection')
    safe, risky = -s_p, -np.concatenate(s_p_oos)
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Abnormality base rate (%):', round(100*1./(1 + 1 - err_total/100),2))
    print('KL[p||u]: Normality Detection (relative to correct examples)')
    safe, risky = -kl_r, -np.concatenate(kl_oos)
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Prediction Prob: Normality Detection (relative to correct examples)')
    safe, risky = -s_rp, -np.concatenate(s_p_oos)
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))



airport Example Prediction Probability (mean, std):
0.678342 0.164252

Normality Detection
Normality base rate (%): 50
KL[p||u]: Normality Detection
AUPR (%): 74.67
AUROC (%): 66.61
Prediction Prob: Normality Detection
AUPR (%): 74.16
AUROC (%): 65.34
Normality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 77.21
AUROC (%): 75.64
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 76.9
AUROC (%): 74.99

Abnormality Detection
Abnormality base rate (%): 50
KL[p||u]: Abnormality Detection
AUPR (%): 57.49
AUROC (%): 66.61
Prediction Prob: Normality Detection
AUPR (%): 55.35
AUROC (%): 65.34
Abnormality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 74.33
AUROC (%): 75.64
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 72.63
AUROC (%): 74.99


babble Example Prediction Probability (mean, std):
0.400827 0.0942026

Normality Detection
Normality base rate (%): 50
KL[p||u]: Normality Detection
AUPR (%): 93.95
AUROC (%): 91.39
Prediction Prob: Normality Detection
AUPR (%): 92.6
AUROC (%): 88.93
Normality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 95.66
AUROC (%): 95.33
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 95.02
AUROC (%): 94.34

Abnormality Detection
Abnormality base rate (%): 50
KL[p||u]: Abnormality Detection
AUPR (%): 83.26
AUROC (%): 91.39
Prediction Prob: Normality Detection
AUPR (%): 78.29
AUROC (%): 88.93
Abnormality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 93.34
AUROC (%): 95.33
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 91.35
AUROC (%): 94.34


car Example Prediction Probability (mean, std):
0.710342 0.184567

Normality Detection
Normality base rate (%): 50
KL[p||u]: Normality Detection
AUPR (%): 68.1
AUROC (%): 61.48
Prediction Prob: Normality Detection
AUPR (%): 67.78
AUROC (%): 60.65
Normality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 69.41
AUROC (%): 70.42
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 69.24
AUROC (%): 70.06

Abnormality Detection
Abnormality base rate (%): 50
KL[p||u]: Abnormality Detection
AUPR (%): 54.62
AUROC (%): 61.48
Prediction Prob: Normality Detection
AUPR (%): 53.1
AUROC (%): 60.65
Abnormality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 71.32
AUROC (%): 70.42
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 70.31
AUROC (%): 70.06


exhibition Example Prediction Probability (mean, std):
0.445379 0.158559

Normality Detection
Normality base rate (%): 50
KL[p||u]: Normality Detection
AUPR (%): 89.95
AUROC (%): 87.01
Prediction Prob: Normality Detection
AUPR (%): 88.9
AUROC (%): 85.05
Normality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 92.06
AUROC (%): 92.11
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 91.55
AUROC (%): 91.29

Abnormality Detection
Abnormality base rate (%): 50
KL[p||u]: Abnormality Detection
AUPR (%): 81.98
AUROC (%): 87.01
Prediction Prob: Normality Detection
AUPR (%): 78.14
AUROC (%): 85.05
Abnormality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 92.18
AUROC (%): 92.11
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 90.93
AUROC (%): 91.29


restaurant Example Prediction Probability (mean, std):
0.736358 0.169534

Normality Detection
Normality base rate (%): 50
KL[p||u]: Normality Detection
AUPR (%): 67.92
AUROC (%): 59.03
Prediction Prob: Normality Detection
AUPR (%): 67.64
AUROC (%): 58.27
Normality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 69.7
AUROC (%): 68.36
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 69.55
AUROC (%): 68.01

Abnormality Detection
Abnormality base rate (%): 50
KL[p||u]: Abnormality Detection
AUPR (%): 51.82
AUROC (%): 59.03
Prediction Prob: Normality Detection
AUPR (%): 50.71
AUROC (%): 58.27
Abnormality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 68.28
AUROC (%): 68.36
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 67.35
AUROC (%): 68.01


subway Example Prediction Probability (mean, std):
0.665456 0.169276

Normality Detection
Normality base rate (%): 50
KL[p||u]: Normality Detection
AUPR (%): 74.69
AUROC (%): 67.6
Prediction Prob: Normality Detection
AUPR (%): 74.26
AUROC (%): 66.43
Normality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 76.99
AUROC (%): 76.4
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 76.79
AUROC (%): 75.87

Abnormality Detection
Abnormality base rate (%): 50
KL[p||u]: Abnormality Detection
AUPR (%): 58.64
AUROC (%): 67.6
Prediction Prob: Normality Detection
AUPR (%): 56.48
AUROC (%): 66.43
Abnormality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 75.4
AUROC (%): 76.4
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 73.82
AUROC (%): 75.87


street Example Prediction Probability (mean, std):
0.508353 0.120797

Normality Detection
Normality base rate (%): 50
KL[p||u]: Normality Detection
AUPR (%): 88.83
AUROC (%): 84.26
Prediction Prob: Normality Detection
AUPR (%): 87.48
AUROC (%): 81.45
Normality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 91.36
AUROC (%): 90.44
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 90.63
AUROC (%): 89.09

Abnormality Detection
Abnormality base rate (%): 50
KL[p||u]: Abnormality Detection
AUPR (%): 75.39
AUROC (%): 84.26
Prediction Prob: Normality Detection
AUPR (%): 69.17
AUROC (%): 81.45
Abnormality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 88.7
AUROC (%): 90.44
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 85.49
AUROC (%): 89.09


train Example Prediction Probability (mean, std):
0.632496 0.15643

Normality Detection
Normality base rate (%): 50
KL[p||u]: Normality Detection
AUPR (%): 79.3
AUROC (%): 71.83
Prediction Prob: Normality Detection
AUPR (%): 78.66
AUROC (%): 70.32
Normality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 82.19
AUROC (%): 80.42
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 81.81
AUROC (%): 79.66

Abnormality Detection
Abnormality base rate (%): 50
KL[p||u]: Abnormality Detection
AUPR (%): 61.79
AUROC (%): 71.83
Prediction Prob: Normality Detection
AUPR (%): 59.12
AUROC (%): 70.32
Abnormality base rate (%): 58.71
KL[p||u]: Normality Detection (relative to correct examples)
AUPR (%): 78.44
AUROC (%): 80.42
Prediction Prob: Normality Detection (relative to correct examples)
AUPR (%): 76.64
AUROC (%): 79.66

Auxiliary Decoder and the Abnormality Module


In [28]:
print('Loading OOD Data')
data = h5.File("train_p_0.02.h5")
p_02 = data['X'][()]
p_02_idxs = data['start_idx'][()]
p_02 -= train_mean
p_02 /= (train_std + 1e-11)

data = h5.File("train_w_0.005.h5")
w_005 = data['X'][()]
w_005_idxs = data['start_idx'][()]
w_005 -= train_mean
w_005 /= (train_std + 1e-11)

data = h5.File("train_b_0.05.h5")
b_05 = data['X'][()]
b_05_idxs = data['start_idx'][()]
b_05 -= train_mean
b_05 /= (train_std + 1e-11)
del data


Loading OOD Data

In [29]:
print('Training the risk neuron')

num_batches = X_train.shape[0] // batch_size
err_ema = 1./n_labels
risk_loss_ema = 0.3  # -log(0.5)

for epoch in range(2):  # 3 epoch
    # shuffle data
    indices = np.arange(X_train.shape[0])
    np.random.shuffle(indices)
    X_train = X_train[indices]
    Y_train = Y_train[indices]
    train_idxs = train_idxs[indices]
    
    p_02 = p_02[indices]
    p_02_idxs = p_02_idxs[indices]
    w_005 = w_005[indices]
    w_005_idxs = w_005_idxs[indices]
    b_05 = b_05[indices]
    b_05_idxs = b_05_idxs[indices]
    
    for i in range(num_batches):
        offset = i * batch_size
        
        # get in-sample data
        _bx1, mask_x1 = X_train[offset:offset+batch_size//4], train_idxs[offset:offset+batch_size//4]
        bx1 = []
        for i in range(_bx1.shape[0]):
            sentence_frames = add_context(_bx1[i][mask_x1[i]:])
            bx1.append(sentence_frames)
        bx1 = np.concatenate(bx1)

        val_indices = np.arange(X_val.shape[0])
        np.random.shuffle(val_indices)
        _bx2, mask_x2 = X_val[val_indices[0:batch_size//4]], val_idxs[val_indices[0:batch_size//4]]

        bx2 = []
        for i in range(_bx2.shape[0]):
            sentence_frames = add_context(_bx2[i][mask_x2[i]:])
            bx2.append(sentence_frames)
        bx2 = np.concatenate(bx2)
        
        # get oos data
        
        _bx3, mask_x3 = p_02[offset:offset+batch_size//6], p_02_idxs[offset:offset+batch_size//6]
        bx3 = []
        for i in range(_bx3.shape[0]):
            sentence_frames = add_context(_bx3[i][mask_x3[i]:])
            bx3.append(sentence_frames)
        bx3 = np.concatenate(bx3)
        
        _bx4, mask_x4 = w_005[offset:offset+batch_size//6], w_005_idxs[offset:offset+batch_size//6]
        bx4 = []
        for i in range(_bx4.shape[0]):
            sentence_frames = add_context(_bx4[i][mask_x4[i]:])
            bx4.append(sentence_frames)
        bx4 = np.concatenate(bx4)
        
        _bx5, mask_x5 = b_05[offset:offset+batch_size//6], b_05_idxs[offset:offset+batch_size//6]
        bx5 = []
        for i in range(_bx5.shape[0]):
            sentence_frames = add_context(_bx5[i][mask_x5[i]:])
            bx5.append(sentence_frames)
        bx5 = np.concatenate(bx5)

        risks = np.zeros(bx1.shape[0] + bx2.shape[0] + bx3.shape[0] + bx4.shape[0] + bx5.shape[0])
        risks[:bx1.shape[0] + bx2.shape[0]] = 1
        bx = np.concatenate((bx1, bx2, bx3, bx4, bx5), axis=0)

        _, rl, err = sess.run([risk_optimizer, risk_loss, compute_risk_error],
                              feed_dict={x: bx, risk_labels: risks, is_training: False})
        risk_loss_ema = risk_loss_ema * 0.95 + 0.05 * rl
        err_ema = err_ema * 0.95 + 0.05 * err

    print('Epoch:', epoch, '|', 'ema of risk for epoch:', risk_loss_ema, 'error (%):', 100*err_ema)


Training the risk neuron
Epoch: 0 | ema of risk for epoch: 5.25729855974 error (%): 2.31989826724
Epoch: 1 | ema of risk for epoch: 0.0377295030432 error (%): 1.15339131546

In [30]:
err_total = 0
risk_err_total = 0
risk_total = []
risk_right_total = []
risk_wrong_total = []
conf_total = []

for i in range(X_test.shape[0]//batch_size):
    offset = i * batch_size
    _bx, mask_x, _by = X_test[offset:offset+batch_size], test_idxs[offset:offset+batch_size], Y_test[offset:offset+batch_size]

    bx, by = [], []
    for i in range(_bx.shape[0]):
        sentence_frames = add_context(_bx[i][mask_x[i]:])
        bx.append(sentence_frames)
        by.append(_by[i][mask_x[i]:])

    bx, by = np.concatenate(bx), np.concatenate(by)

    err, r_err, r, conf = sess.run([100*compute_error, 100*compute_risk_error,
                                    tf.sigmoid(risk), tf.nn.softmax(logits)],
                                   feed_dict={x: bx, y: by, risk_labels: np.ones(by.shape[0]), is_training: False})

    r_right = r[np.argmax(conf, axis=1).astype(np.int32) == by]
    r_wrong = r[np.argmax(conf, axis=1).astype(np.int32) != by]

    err_total += err
    risk_err_total += r_err
    risk_total.append(r)
    conf_total.append(conf)
    risk_right_total.append(r_right)
    risk_wrong_total.append(r_wrong)

risk_err_total /= X_test.shape[0]//batch_size
err_total /= X_test.shape[0]//batch_size

In [31]:
print('TIMIT Clean Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):')
print(err_total, '|', risk_err_total, '|', np.mean(np.max(np.concatenate(conf_total), axis=1)),
      np.std(np.max(np.concatenate(conf_total), axis=1)))


TIMIT Clean Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
29.6853021201 | 2.56435128383 | 0.759521 0.230741

In [32]:
safe, risky = np.concatenate(risk_right_total).reshape(-1,1), np.concatenate(risk_wrong_total).reshape(-1,1)
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))

print('Risk Neuron: Clean Right/Wrong classification distinction')
print('AUPR', sk.average_precision_score(labels, examples))
print('AUROC', sk.roc_auc_score(labels, examples))


Risk Neuron: Clean Right/Wrong classification distinction
AUPR 0.841913118528
AUROC 0.635562324801

The implication is that the logistic regression neuron is as not great for detecting whether the example is misclassified. Perhaps is incorrect examples were treated as negative examples we would do better.

Now let's try OOD examples.

Update: Base rates should be as above. err_total is updated while it should refer to an older value. They're incorrectly printed but this doesn't affect anything else.


In [33]:
for oos_name in ['airport', 'babble', 'car', 'exhibition', 'restaurant', 'subway', 'street', 'train']:
    
    data = h5.File("test_" + oos_name + ".h5")     # real noise at a volume of 30%
    oos_x = data['X'][()]
    oos_y = data['y'][()]
    oos_idxs = data['start_idx'][()]
    oos_x -= train_mean
    oos_x /= (train_std + 1e-11)
    
    err_total = 0
    risk_err_total = 0
    risk_total_oos = []
    risk_right_total_oos = []
    risk_wrong_total_oos = []
    conf_total_oos = []
    
    for i in range(oos_x.shape[0]//batch_size):
        offset = i * batch_size
        
        _bx, mask_x, _by = oos_x[offset:offset+batch_size], oos_idxs[offset:offset+batch_size], oos_y[offset:offset+batch_size]

        bx, by = [], []
        for i in range(_bx.shape[0]):
            sentence_frames = add_context(_bx[i][mask_x[i]:])
            bx.append(sentence_frames)
            by.append(_by[i][mask_x[i]:])

        bx, by = np.concatenate(bx), np.concatenate(by)

        
        err, r_err, r, conf = sess.run([100*compute_error, 100*compute_risk_error,
                                        tf.sigmoid(risk), tf.nn.softmax(logits)],
                                       feed_dict={x: bx, y: by, risk_labels: np.zeros(by.shape[0]), is_training: False})

        r_right = r[np.argmax(conf, axis=1).astype(np.int32) == by]
        r_wrong = r[np.argmax(conf, axis=1).astype(np.int32) != by]

        err_total += err
        risk_err_total += r_err
        risk_total_oos.append(r)
        conf_total_oos.append(conf)
        risk_right_total_oos.append(r_right)
        risk_wrong_total_oos.append(r_wrong)


    risk_err_total /= oos_x.shape[0]//batch_size
    err_total /= oos_x.shape[0]//batch_size
    
    print('\nTIMIT', oos_name, 'Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):')
    print(err_total, '|', risk_err_total, '|', np.mean(np.max(np.concatenate(conf_total_oos), axis=1)),
      np.std(np.max(np.concatenate(conf_total_oos), axis=1)))
    
    risk_total = np.concatenate(risk_total).reshape(-1,1)
    risk_right_total = np.concatenate(risk_right_total).reshape(-1,1)
    risk_total_oos = np.concatenate(risk_total_oos).reshape(-1,1)

    print('\nNormality Detection')
    print('Normality base rate (%):', round(50,2))
    print('Normality Detection')
    safe, risky = risk_total, risk_total_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Normality base rate (%):', round(100*1./(1 + 1 - err_total/100),2))
    print('Normality Detection (relative to correct examples)')
    safe, risky = risk_right_total, risk_total_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[:safe.shape[0]] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('\n\nAbnormality Detection')
    print('Abnormality base rate (%):', round(50,2))
    print('Abnormality Detection')
    safe, risky = 1 - risk_total, 1 - risk_total_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

    print('Abnormality base rate (%):', round(100*1./(1 + 1 - err_total/100),2))
    print('Abnormality Detection (relative to correct examples)')
    safe, risky = 1 - risk_right_total, 1 - risk_total_oos
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
    print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


TIMIT airport Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
30.3071249946 | 2.62155895213 | 0.678342 0.164252

Normality Detection
Normality base rate (%): 50
Normality Detection
AUPR (%): 99.66
AUROC (%): 99.57
Normality base rate (%): 58.93
Normality Detection (relative to correct examples)
AUPR (%): 99.62
AUROC (%): 99.65


Abnormality Detection
Abnormality base rate (%): 50
Abnormality Detection
AUPR (%): 99.45
AUROC (%): 99.57
Abnormality base rate (%): 58.93
Normality Detection (relative to correct examples)
AUPR (%): 99.68
AUROC (%): 99.65

TIMIT babble Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
30.308092085 | 0.00997679636388 | 0.400827 0.0942026

Normality Detection
Normality base rate (%): 50
Normality Detection
AUPR (%): 99.94
AUROC (%): 99.91
Normality base rate (%): 58.93
Normality Detection (relative to correct examples)
AUPR (%): 99.93
AUROC (%): 99.92


Abnormality Detection
Abnormality base rate (%): 50
Abnormality Detection
AUPR (%): 99.81
AUROC (%): 99.91
Abnormality base rate (%): 58.93
Normality Detection (relative to correct examples)
AUPR (%): 99.89
AUROC (%): 99.92

TIMIT car Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
30.3905919285 | 23.7249792552 | 0.710342 0.184567

Normality Detection
Normality base rate (%): 50
Normality Detection
AUPR (%): 98.49
AUROC (%): 98.01
Normality base rate (%): 58.96
Normality Detection (relative to correct examples)
AUPR (%): 98.44
AUROC (%): 98.39


Abnormality Detection
Abnormality base rate (%): 50
Abnormality Detection
AUPR (%): 97.13
AUROC (%): 98.01
Abnormality base rate (%): 58.96
Normality Detection (relative to correct examples)
AUPR (%): 98.31
AUROC (%): 98.39

TIMIT exhibition Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
31.1150914855 | 32.5871666407 | 0.445379 0.158559

Normality Detection
Normality base rate (%): 50
Normality Detection
AUPR (%): 97.71
AUROC (%): 97.13
Normality base rate (%): 59.21
Normality Detection (relative to correct examples)
AUPR (%): 97.65
AUROC (%): 97.7


Abnormality Detection
Abnormality base rate (%): 50
Abnormality Detection
AUPR (%): 96.37
AUROC (%): 97.13
Abnormality base rate (%): 59.21
Normality Detection (relative to correct examples)
AUPR (%): 97.87
AUROC (%): 97.7

TIMIT restaurant Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
30.3446945902 | 66.1732135708 | 0.736358 0.169534

Normality Detection
Normality base rate (%): 50
Normality Detection
AUPR (%): 95.63
AUROC (%): 94.07
Normality base rate (%): 58.94
Normality Detection (relative to correct examples)
AUPR (%): 95.63
AUROC (%): 95.27


Abnormality Detection
Abnormality base rate (%): 50
Abnormality Detection
AUPR (%): 91.2
AUROC (%): 94.07
Abnormality base rate (%): 58.94
Normality Detection (relative to correct examples)
AUPR (%): 94.79
AUROC (%): 95.27

TIMIT subway Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
30.3169423443 | 52.0356083886 | 0.665456 0.169276

Normality Detection
Normality base rate (%): 50
Normality Detection
AUPR (%): 96.05
AUROC (%): 95.01
Normality base rate (%): 58.93
Normality Detection (relative to correct examples)
AUPR (%): 95.96
AUROC (%): 95.99


Abnormality Detection
Abnormality base rate (%): 50
Abnormality Detection
AUPR (%): 93.37
AUROC (%): 95.01
Abnormality base rate (%): 58.93
Normality Detection (relative to correct examples)
AUPR (%): 96.07
AUROC (%): 95.99

TIMIT street Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
30.3095885131 | 30.8250142437 | 0.508353 0.120797

Normality Detection
Normality base rate (%): 50
Normality Detection
AUPR (%): 98.57
AUROC (%): 97.97
Normality base rate (%): 58.93
Normality Detection (relative to correct examples)
AUPR (%): 98.54
AUROC (%): 98.36


Abnormality Detection
Abnormality base rate (%): 50
Abnormality Detection
AUPR (%): 96.89
AUROC (%): 97.97
Abnormality base rate (%): 58.93
Normality Detection (relative to correct examples)
AUPR (%): 98.18
AUROC (%): 98.36

TIMIT train Frame Error (%) | TIMIT Frame Riskiness Error (0.5 cutoff) (%) | Frame Confidence (mean, std):
30.3597302518 | 0.0548225726648 | 0.632496 0.15643

Normality Detection
Normality base rate (%): 50
Normality Detection
AUPR (%): 99.96
AUROC (%): 99.96
Normality base rate (%): 58.95
Normality Detection (relative to correct examples)
AUPR (%): 99.96
AUROC (%): 99.96


Abnormality Detection
Abnormality base rate (%): 50
Abnormality Detection
AUPR (%): 99.96
AUROC (%): 99.96
Abnormality base rate (%): 58.95
Normality Detection (relative to correct examples)
AUPR (%): 99.97
AUROC (%): 99.96