In [1]:
import numpy as np
import tensorflow as tf
# %matplotlib inline
# import matplotlib.pylab as plt
from helper_functions_twitter import *
%load_ext autoreload
%autoreload 2
In [10]:
window_size = 1
# note that we encode the tags with numbers for later convenience
tag_to_number = {
    u'N': 0, u'O': 1, u'S': 2, u'^': 3, u'Z': 4, u'L': 5, u'M': 6,
    u'V': 7, u'A': 8, u'R': 9, u'!': 10, u'D': 11, u'P': 12, u'&': 13, u'T': 14,
    u'X': 15, u'Y': 16, u'#': 17, u'@': 18, u'~': 19, u'U': 20, u'E': 21, u'$': 22,
    u',': 23, u'G': 24
}
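# note: since the loaders below use one_hot=True, each tag index i becomes
# row i of a 25x25 identity matrix (e.g. u'V' -> np.eye(25)[7])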
embeddings = embeddings_to_dict('./data/Tweets/embeddings-twitter.txt')
vocab = embeddings.keys()
# the start symbol <s> has no embedding, so we substitute </s>, whose embedding is a closer fit than UNK's
xt, yt = data_to_mat('./data/Tweets/tweets-train.txt', vocab, tag_to_number,
                     window_size=window_size, start_symbol=u'</s>', one_hot=True)
xdev, ydev = data_to_mat('./data/Tweets/tweets-dev.txt', vocab, tag_to_number,
                         window_size=window_size, start_symbol=u'</s>', one_hot=True)
xdtest, ydtest = data_to_mat('./data/Tweets/tweets-devtest.txt', vocab, tag_to_number,
                             window_size=window_size, start_symbol=u'</s>', one_hot=True)
data = {
    'x_train': xt, 'y_train': yt,
    'x_dev': xdev, 'y_dev': ydev,
    'x_devtest': xdtest, 'y_devtest': ydtest
}
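# quick sanity check (optional sketch, assuming the 50-dimensional Twitter
# embeddings above and window_size = 1): each example is a 3-token window,
# so its embedded feature vector has 3*50 = 150 entries
# print(data['x_train'].shape)                                                 # (num_train_examples, 3)
# print(word_list_to_embedding(data['x_train'][:2, :], embeddings, 50).shape)  # (2, 150)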
In [80]:
def train_and_test(mode="c_is_softmax_prob", seed=100, learning_rate=0.001):
    '''
    modes: c_is_softmax_prob, c_is_trained_softmax_prob, c_is_cotrained_sigmoid, c_is_auxiliary_sigmoid
    '''
    training_epochs = 20
    n_labels = 25
    batch_size = 64
    embedding_dimension = 50
    example_size = (2*window_size + 1)*embedding_dimension
    num_examples = data['y_train'].shape[0]
    num_batches = num_examples//batch_size
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(seed)  # graph-level seed, set at graph construction; in practice runs still vary
        x = tf.placeholder(dtype=tf.float32, shape=[None, example_size])
        y = tf.placeholder(dtype=tf.float32, shape=[None, n_labels])
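        # GELU nonlinearity via its tanh approximation:
        # gelu(x) = x * Phi(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
        # where Phi is the standard Gaussian CDF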
        def gelu(x):
            return 0.5 * x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
        f = gelu
        W = {}
        b = {}
        with tf.variable_scope("classifier"):
            W['1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([example_size, 256]), 0))
            W['2'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([256, 256]), 0))
            W['3'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([256, 256]), 0))
            W['logits'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([256, n_labels]), 0))
            b['1'] = tf.Variable(tf.zeros([256]))
            b['2'] = tf.Variable(tf.zeros([256]))
            b['3'] = tf.Variable(tf.zeros([256]))
            b['logits'] = tf.Variable(tf.zeros([n_labels]))
        with tf.variable_scope("confidence_scorer"):
            W['hidden_to_conf1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([256, 512]), 0))
            W['logits_to_conf1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([n_labels, 512]), 0))
            W['conf2'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([512, 128]), 0))
            W['conf'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([128, 1]), 0))
            b['conf1'] = tf.Variable(tf.zeros([512]))
            b['conf2'] = tf.Variable(tf.zeros([128]))
            b['conf'] = tf.Variable(tf.zeros([1]))
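        # cautious_fcn runs a 3-hidden-layer classifier MLP and, branching off the
        # second hidden layer and the logits, a confidence scorer that emits a
        # single logit for "this prediction is correct"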
        def cautious_fcn(x):
            h1 = f(tf.matmul(x, W['1']) + b['1'])
            h2 = f(tf.matmul(h1, W['2']) + b['2'])
            h3 = f(tf.matmul(h2, W['3']) + b['3'])
            logits_out = tf.matmul(h3, W['logits']) + b['logits']
            conf1 = f(tf.matmul(logits_out, W['logits_to_conf1']) +
                      tf.matmul(h2, W['hidden_to_conf1']) + b['conf1'])
            conf2 = f(tf.matmul(conf1, W['conf2']) + b['conf2'])
            conf_out = tf.matmul(conf2, W['conf']) + b['conf']
            return logits_out, tf.squeeze(conf_out)
        logits, confidence_logit = cautious_fcn(x)
        right_answer = tf.stop_gradient(tf.to_float(tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))))
        compute_error = 100*tf.reduce_mean(1 - right_answer)
        classification_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))

        if "softmax" in mode:
            # use the maximum softmax probability as the confidence
            confidence_logit = tf.reduce_max(tf.nn.softmax(logits), axis=1)
            caution_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=confidence_logit, labels=right_answer))
            # cc_loss is the cautious classification loss
            if mode == "c_is_trained_softmax_prob":
                cc_loss = classification_loss + caution_loss
            else:
                cc_loss = classification_loss
        elif mode == "c_is_cotrained_sigmoid":
            caution_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=confidence_logit, labels=right_answer))
            cc_loss = classification_loss + caution_loss
            confidence = tf.sigmoid(confidence_logit)
        elif mode == "c_is_auxiliary_sigmoid":
            caution_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=confidence_logit, labels=right_answer))
            cc_loss = classification_loss  # caution_loss is optimized later, after the normal classifier is trained
        else:
            assert False, "Invalid mode specified"

        cc_calibration_score = tf.reduce_mean((2 * right_answer - 1) * (2 * tf.sigmoid(confidence_logit) - 1))
        cc_model_score = tf.reduce_mean(right_answer * ((2 * right_answer - 1) * (2 * tf.sigmoid(confidence_logit) - 1) + 1) / 2)
        # cautious classification perplexities
        cc_calibration_perplexity = tf.exp(caution_loss)
        cc_model_perplexity = tf.exp(caution_loss + classification_loss)

        lr = tf.constant(learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(cc_loss)
    sess = tf.InteractiveSession(graph=graph)
    if "softmax" in mode or mode == "c_is_cotrained_sigmoid":
        sess.run(tf.global_variables_initializer())
    elif mode == "c_is_auxiliary_sigmoid":
        thawed_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "classifier")
        frozen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "confidence_scorer")
        # initialize everything except the confidence scorer, which is trained in a second phase
        sess.run(tf.variables_initializer(list(set(tf.global_variables()) - set(frozen_vars))))

    # exponential moving averages for logging
    err_ema = 90
    cc_calibration_perp_ema = 10
    cc_model_perp_ema = 10
    cc_calibration_score_ema = -1
    cc_model_score_ema = -1
    for epoch in range(1, training_epochs+1):
        # shuffle data
        indices = np.arange(num_examples)
        np.random.shuffle(indices)
        data['x_train'] = data['x_train'][indices]
        data['y_train'] = data['y_train'][indices]

        for i in range(num_batches):
            offset = i * batch_size
            bx = word_list_to_embedding(data['x_train'][offset:offset + batch_size, :],
                                        embeddings, embedding_dimension)
            by = data['y_train'][offset:offset + batch_size]
            if mode != "c_is_auxiliary_sigmoid":
                _, err, cc_model_score_curr, cc_calibration_score_curr,\
                    cc_model_perp_curr, cc_calibration_perp_curr = sess.run([
                        optimizer, compute_error, cc_model_score, cc_calibration_score,
                        cc_model_perplexity, cc_calibration_perplexity],
                        feed_dict={x: bx, y: by, lr: learning_rate})
                err_ema = err_ema * 0.95 + 0.05 * err
                cc_calibration_perp_ema = cc_calibration_perp_ema * 0.95 + 0.05 * cc_calibration_perp_curr
                cc_model_perp_ema = cc_model_perp_ema * 0.95 + 0.05 * cc_model_perp_curr
                cc_calibration_score_ema = cc_calibration_score_ema * 0.95 + 0.05 * cc_calibration_score_curr
                cc_model_score_ema = cc_model_score_ema * 0.95 + 0.05 * cc_model_score_curr
            else:
                _, err = sess.run([optimizer, compute_error],
                                  feed_dict={x: bx, y: by, lr: learning_rate})
                err_ema = err_ema * 0.95 + 0.05 * err

        if epoch % 10 == 0:
            print('Epoch', epoch, ' | ', 'Current Classification Error (%)', err_ema)
            if mode != "c_is_auxiliary_sigmoid":
                print('Epoch', epoch, ' | ', 'Cautious Classification Calibration Perp', cc_calibration_perp_ema)
                print('Epoch', epoch, ' | ', 'Cautious Classification Model Perp', cc_model_perp_ema)
                print('Epoch', epoch, ' | ', 'Cautious Classification Calibration Score', cc_calibration_score_ema)
                print('Epoch', epoch, ' | ', 'Cautious Classification Model Score', cc_model_score_ema)
if mode == "c_is_auxiliary_sigmoid":
# train sigmoid separately from the classifier
phase2_vars = list(set(tf.all_variables()) - set(thawed_vars))
optimizer2 = tf.train.AdamOptimizer(learning_rate=0.001).minimize(caution_loss, var_list=phase2_vars)
sess.run(tf.initialize_variables(set(tf.all_variables()) - set(thawed_vars)))
for epoch in range(3):
for i in range(num_batches):
offset = i * batch_size
bx = word_list_to_embedding(data['x_train'][offset:offset + batch_size, :],
embeddings, embedding_dimension)
by = data['y_train'][offset:offset + batch_size]
sess.run([optimizer2], feed_dict={x: bx, y: by})
    err, cc_model_score_test, cc_calibration_score_test,\
        cc_model_perp_test, cc_calibration_perp_test = sess.run([
            compute_error, cc_model_score, cc_calibration_score,
            cc_model_perplexity, cc_calibration_perplexity],
            feed_dict={x: word_list_to_embedding(data['x_devtest'], embeddings, embedding_dimension),
                       y: data['y_devtest']})

    print('Test Classification Error (%)', err)
    print('Test Cautious Classification Calibration Perp', cc_calibration_perp_test)
    print('Test Cautious Classification Model Perp', cc_model_perp_test)
    print('Test Cautious Classification Calibration Score', cc_calibration_score_test)
    print('Test Cautious Classification Model Score', cc_model_score_test)
    sess.close()
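To make the two scores concrete: the calibration score averages (2*correct - 1)*(2*confidence - 1), rewarding confident correct predictions and diffident mistakes, while the model score additionally zeroes out incorrect predictions. A small NumPy sketch with made-up values (illustrative only, not model output):

correct = np.array([1., 1., 0.])     # whether each prediction was right
confidence = np.array([.9, .6, .2])  # sigmoid(confidence_logit) per example
calibration_score = np.mean((2*correct - 1) * (2*confidence - 1))                # ~0.533
model_score = np.mean(correct * ((2*correct - 1) * (2*confidence - 1) + 1) / 2)  # 0.5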
In [85]:
train_and_test()
train_and_test()
train_and_test()
In [86]:
train_and_test("c_is_cotrained_sigmoid")
train_and_test("c_is_cotrained_sigmoid")
train_and_test("c_is_cotrained_sigmoid")
In [83]:
train_and_test("c_is_auxiliary_sigmoid")