In [1]:
import numpy as np
import tensorflow as tf
# %matplotlib inline
# import matplotlib.pylab as plt
from helper_functions_twitter import *
import sklearn.metrics as sk

%load_ext autoreload
%autoreload 2

In [4]:
window_size = 1

# note that we encode the tags with numbers for later convenience
tag_to_number = {
    u'N': 0, u'O': 1, u'S': 2, u'^': 3, u'Z': 4, u'L': 5, u'M': 6,
    u'V': 7, u'A': 8, u'R': 9, u'!': 10, u'D': 11, u'P': 12, u'&': 13, u'T': 14,
    u'X': 15, u'Y': 16, u'#': 17, u'@': 18, u'~': 19, u'U': 20, u'E': 21, u'$': 22,
    u',': 23, u'G': 24
}

embeddings = embeddings_to_dict('./data/Tweets/embeddings-twitter.txt')
vocab = embeddings.keys()

# <s> has no pretrained embedding, so we use </s> as the start symbol; its embedding is a better stand-in than UNK's
xt, yt = data_to_mat('./data/Tweets/tweets-train.txt', vocab, tag_to_number, window_size=window_size,
                     start_symbol=u'</s>')
xdev, ydev = data_to_mat('./data/Tweets/tweets-dev.txt', vocab, tag_to_number, window_size=window_size,
                         start_symbol=u'</s>')
xdtest, ydtest = data_to_mat('./data/Tweets/tweets-devtest.txt', vocab, tag_to_number, window_size=window_size,
                             start_symbol=u'</s>')

data = {
    'x_train': xt, 'y_train': yt,
    'x_dev': xdev, 'y_dev': ydev,
    'x_devtest': xdtest, 'y_devtest': ydtest
}
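
For context (an addition, not part of the original notebook): each row of x_train holds a window of 2*window_size + 1 tokens, and word_list_to_embedding (from helper_functions_twitter) concatenates their 50-dimensional vectors into one example of length (2*window_size + 1)*50. A minimal sketch of that layout, with a toy embedding table standing in for the real one:

# hypothetical stand-in for the pretrained embedding dictionary
toy_embeddings = {u'i': np.full(50, 0.1), u'love': np.full(50, 0.2), u'nlp': np.full(50, 0.3)}

def concat_window(window, emb):
    # map each token in the window to its vector and concatenate them
    return np.concatenate([emb[w] for w in window])

example = concat_window([u'i', u'love', u'nlp'], toy_embeddings)
print(example.shape)  # (150,), matching example_size when window_size == 1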

In [5]:
# hyperparameters and TensorFlow graph construction
num_epochs = 30
num_tags = 25
hidden_size = 256
batch_size = 64
embedding_dimension = 50
example_size = (2*window_size + 1)*embedding_dimension
init_lr = 0.001
num_examples = data['y_train'].shape[0]
num_batches = num_examples//batch_size

graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder(tf.float32, [None, example_size])
    y = tf.placeholder(tf.int64, [None])

    # weights: column-wise l2-normalized random init, rescaled by the fan-in/fan-out activation variances
    w1 = tf.Variable(tf.nn.l2_normalize(tf.random_normal([example_size, hidden_size]), 0)/tf.sqrt(1 + 0.425))
    b1 = tf.Variable(tf.zeros([hidden_size]))
    w2 = tf.Variable(tf.nn.l2_normalize(tf.random_normal([hidden_size, hidden_size]), 0)/tf.sqrt(0.425 + 0.425))
    b2 = tf.Variable(tf.zeros([hidden_size]))
    w_out = tf.Variable(tf.nn.l2_normalize(tf.random_normal([hidden_size, num_tags]), 0)/tf.sqrt(0.425 + 1))
    b_out = tf.Variable(tf.zeros([num_tags]))

    def gelu_fast(_x):
        # fast tanh-based approximation to the GELU nonlinearity
        return 0.5 * _x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (_x + 0.044715 * tf.pow(_x, 3))))

    def model(data_feed):
        # two GELU hidden layers followed by a linear output over the 25 tags
        h1 = gelu_fast(tf.matmul(data_feed, w1) + b1)
        h2 = gelu_fast(tf.matmul(h1, w2) + b2)
        return tf.matmul(h2, w_out) + b_out

    logits = model(x)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
    loss += 5e-5*(tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))    # l2 weight decay on the hidden layers

    # learning rate annealing
    global_step = tf.Variable(0, trainable=False)
    # drop the learning rate by 10x after 15 epochs
    lr = tf.train.exponential_decay(init_lr, global_step, 15*num_batches, 0.1, staircase=True)
    # pick optimizer
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)

    acc = 100*tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits, 1), y), "float"))
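
As a quick aside (an addition, assuming scipy is available): gelu_fast above is the tanh approximation of GELU(x) = x * Phi(x), where Phi is the standard normal CDF. A small numpy check of how closely the approximation tracks the exact form:

from scipy.special import erf

def gelu_exact(x):
    # exact GELU: x times the standard normal CDF evaluated at x
    return 0.5 * x * (1 + erf(x / np.sqrt(2)))

def gelu_fast_np(x):
    # the same tanh approximation used in the graph above, in numpy
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))

xs = np.linspace(-4, 4, 101)
print(np.max(np.abs(gelu_exact(xs) - gelu_fast_np(xs))))  # small; the two curves nearly coincide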

In [6]:
# initialize
sess = tf.InteractiveSession(graph=graph)
tf.initialize_all_variables().run()
print('Initialized')

# saver to checkpoint the best model seen during training
saver = tf.train.Saver(max_to_keep=1)


Initialized

In [7]:
best_acc = 0

# train
for epoch in range(num_epochs):
    # shuffle data every epoch
    indices = np.arange(num_examples)
    np.random.shuffle(indices)
    data['x_train'] = data['x_train'][indices]
    data['y_train'] = data['y_train'][indices]

    for i in range(num_batches):
        offset = i * batch_size

        x_batch = word_list_to_embedding(data['x_train'][offset:offset + batch_size, :],
                                         embeddings, embedding_dimension)
        y_batch = data['y_train'][offset:offset + batch_size]

        _, l, batch_acc = sess.run([optimizer, loss, acc],
                                   feed_dict={x: x_batch, y: y_batch})

        if i % 100 == 0:
            curr_dev_acc = sess.run(
                acc, feed_dict={x: word_list_to_embedding(data['x_dev'], embeddings, embedding_dimension),
                                y: data['y_dev']})
            if best_acc < curr_dev_acc:
                best_acc = curr_dev_acc
                saver.save(sess, './data/best_tweet_model.ckpt')

    print('Epoch %d | Minibatch loss %.3f | Minibatch accuracy %.3f | Dev accuracy %.3f' %
          (epoch, l, batch_acc, curr_dev_acc))


Epoch 0 | Minibatch loss 0.644 | Minibatch accuracy 82.812 | Dev accuracy 81.775
Epoch 1 | Minibatch loss 0.572 | Minibatch accuracy 85.938 | Dev accuracy 84.325
Epoch 2 | Minibatch loss 0.601 | Minibatch accuracy 89.062 | Dev accuracy 85.383
Epoch 3 | Minibatch loss 0.639 | Minibatch accuracy 78.125 | Dev accuracy 85.403
Epoch 4 | Minibatch loss 0.249 | Minibatch accuracy 92.188 | Dev accuracy 86.419
Epoch 5 | Minibatch loss 0.318 | Minibatch accuracy 90.625 | Dev accuracy 86.502
Epoch 6 | Minibatch loss 0.231 | Minibatch accuracy 95.312 | Dev accuracy 86.668
Epoch 7 | Minibatch loss 0.495 | Minibatch accuracy 82.812 | Dev accuracy 86.958
Epoch 8 | Minibatch loss 0.485 | Minibatch accuracy 87.500 | Dev accuracy 87.186
Epoch 9 | Minibatch loss 0.399 | Minibatch accuracy 90.625 | Dev accuracy 86.917
Epoch 10 | Minibatch loss 0.403 | Minibatch accuracy 90.625 | Dev accuracy 87.290
Epoch 11 | Minibatch loss 0.206 | Minibatch accuracy 92.188 | Dev accuracy 86.896
Epoch 12 | Minibatch loss 0.244 | Minibatch accuracy 93.750 | Dev accuracy 86.958
Epoch 13 | Minibatch loss 0.239 | Minibatch accuracy 95.312 | Dev accuracy 86.585
Epoch 14 | Minibatch loss 0.155 | Minibatch accuracy 95.312 | Dev accuracy 86.668
Epoch 15 | Minibatch loss 0.141 | Minibatch accuracy 95.312 | Dev accuracy 87.207
Epoch 16 | Minibatch loss 0.366 | Minibatch accuracy 90.625 | Dev accuracy 87.311
Epoch 17 | Minibatch loss 0.180 | Minibatch accuracy 93.750 | Dev accuracy 87.414
Epoch 18 | Minibatch loss 0.186 | Minibatch accuracy 95.312 | Dev accuracy 87.269
Epoch 19 | Minibatch loss 0.132 | Minibatch accuracy 96.875 | Dev accuracy 87.103
Epoch 20 | Minibatch loss 0.092 | Minibatch accuracy 100.000 | Dev accuracy 87.124
Epoch 21 | Minibatch loss 0.158 | Minibatch accuracy 96.875 | Dev accuracy 87.021
Epoch 22 | Minibatch loss 0.073 | Minibatch accuracy 100.000 | Dev accuracy 87.166
Epoch 23 | Minibatch loss 0.104 | Minibatch accuracy 96.875 | Dev accuracy 87.103
Epoch 24 | Minibatch loss 0.146 | Minibatch accuracy 95.312 | Dev accuracy 87.103
Epoch 25 | Minibatch loss 0.244 | Minibatch accuracy 95.312 | Dev accuracy 87.062
Epoch 26 | Minibatch loss 0.227 | Minibatch accuracy 92.188 | Dev accuracy 87.145
Epoch 27 | Minibatch loss 0.169 | Minibatch accuracy 96.875 | Dev accuracy 86.772
Epoch 28 | Minibatch loss 0.147 | Minibatch accuracy 95.312 | Dev accuracy 86.917
Epoch 29 | Minibatch loss 0.098 | Minibatch accuracy 96.875 | Dev accuracy 86.896

In [8]:
# restore variables from disk
saver.restore(sess, "./data/best_tweet_model.ckpt")
print("Best model restored!")

print('DevTest accuracy:', sess.run(
        acc, feed_dict={x: word_list_to_embedding(data['x_devtest'], embeddings, embedding_dimension),
                        y: data['y_devtest']}))


Best model restored!
DevTest accuracy: 87.3462

In [9]:
# confidence measures: maximum softmax probability and KL divergence from the uniform distribution
s = tf.nn.softmax(logits)
s_prob = tf.reduce_max(s, reduction_indices=[1], keep_dims=True)
kl_all = tf.log(25.) + tf.reduce_sum(s * tf.log(tf.abs(s) + 1e-11), reduction_indices=[1], keep_dims=True)
m_all, v_all = tf.nn.moments(kl_all, axes=[0])

logits_right = tf.boolean_mask(logits, tf.equal(tf.argmax(logits, 1), y))
s_right = tf.nn.softmax(logits_right)
s_right_prob = tf.reduce_max(s_right, reduction_indices=[1], keep_dims=True)
kl_right = tf.log(25.) + tf.reduce_sum(s_right * tf.log(tf.abs(s_right) + 1e-11), reduction_indices=[1], keep_dims=True)
m_right, v_right = tf.nn.moments(kl_right, axes=[0])

logits_wrong = tf.boolean_mask(logits, tf.not_equal(tf.argmax(logits, 1), y))
s_wrong = tf.nn.softmax(logits_wrong)
s_wrong_prob = tf.reduce_max(s_wrong, reduction_indices=[1], keep_dims=True)
kl_wrong = tf.log(25.) + tf.reduce_sum(s_wrong * tf.log(tf.abs(s_wrong) + 1e-11), reduction_indices=[1], keep_dims=True)
m_wrong, v_wrong = tf.nn.moments(kl_wrong, axes=[0])
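
For reference (an added note, not original output): the kl_* quantities are the KL divergence from the predicted distribution p to the uniform distribution over the 25 tags, KL(p || u) = log 25 + sum_i p_i log p_i, which is exactly the expression built in the graph. A tiny numpy illustration:

p_uniform = np.full(25, 1./25)
p_peaked = np.array([0.9] + [0.1/24]*24)

def kl_to_uniform(p):
    # KL(p || uniform) = log(K) + sum_i p_i * log(p_i), with the same 1e-11 guard as above
    return np.log(25.) + np.sum(p * np.log(p + 1e-11))

print(kl_to_uniform(p_uniform))  # ~0: no confidence beyond chance
print(kl_to_uniform(p_peaked))   # large: a confident, peaked prediction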

In [10]:
err, kl_a, kl_r, kl_w, s_p, s_rp, s_wp = sess.run(
    [100 - acc, kl_all, kl_right, kl_wrong, s_prob, s_right_prob, s_wrong_prob],
    feed_dict={x: word_list_to_embedding(data['x_dev'],embeddings, embedding_dimension),
               y: data['y_dev']})

print('Twitter Error (%)| Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):')
print(err, '|', np.mean(s_p), np.std(s_p), '|', np.mean(s_rp), np.std(s_rp), '|', np.mean(s_wp), np.std(s_wp))

print('\nSuccess Detection')
print('Success base rate (%):', round(100-err,2))
print('KL[p||u]: Right/Wrong classification distinction')
safe, risky = kl_r, kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = s_rp, s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[:safe.shape[0]] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


print('\nError Detection')
print('Error base rate (%):', round(err,2))
safe, risky = -kl_r, -kl_w
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('KL[p||u]: Right/Wrong classification distinction')
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))

print('Prediction Prob: Right/Wrong classification distinction')
safe, risky = -s_rp, -s_wp
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1
examples = np.squeeze(np.vstack((safe, risky)))
print('AUPR (%):', round(100*sk.average_precision_score(labels, examples), 2))
print('AUROC (%):', round(100*sk.roc_auc_score(labels, examples), 2))


Twitter Error (%)| Prediction Prob (mean, std) | PProb Right (mean, std) | PProb Wrong (mean, std):
12.5855 | 0.91604 0.162581 | 0.949234 0.119917 | 0.685485 0.222114

Success Detection
Success base rate (%): 87.41
KL[p||u]: Right/Wrong classification distinction
AUPR (%): 98.23
AUROC (%): 89.23
Prediction Prob: Right/Wrong classification distinction
AUPR (%): 98.24
AUROC (%): 89.28

Error Detection
Error base rate (%): 12.59
KL[p||u]: Right/Wrong classification distinction
AUPR (%): 53.51
AUROC (%): 89.23
Prediction Prob: Right/Wrong classification distinction
AUPR (%): 53.27
AUROC (%): 89.28
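
To make the evaluation above easier to reread (this note and sketch are additions, with synthetic scores): for success detection, correctly classified examples form the positive class and higher confidence scores (KL or maximum probability) should rank them first; for error detection the scores are negated and the labels flipped, so misclassified examples become the positives. The AUROC is unchanged by this flip, which is why the success and error blocks above report identical AUROC values. A minimal sklearn example:

# purely illustrative confidence scores
scores_right = np.array([2.5, 2.0, 1.8])   # e.g. KL(p||u) on correct predictions
scores_wrong = np.array([1.1, 0.4])        # lower confidence on mistakes

# success detection: correct predictions are the positive class
labels = np.concatenate([np.ones_like(scores_right), np.zeros_like(scores_wrong)])
scores = np.concatenate([scores_right, scores_wrong])
print(sk.roc_auc_score(labels, scores))        # 1.0: the scores separate perfectly

# error detection: negate the scores and flip the labels
print(sk.roc_auc_score(1 - labels, -scores))   # identical AUROC, as in the output above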