In [1]:
import tensorflow as tf
import numpy as np
import re
import collections
import sklearn.metrics as sk

In [2]:
def load_data(filename='./data/imdb.train'):
    '''
    :param filename: the system location of the data to load
    :return: the text (x) and its label (y)
             the text is a list of words and is not processed
    '''

    # stop words taken from nltk
    stop_words = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours',
                  'yourself','yourselves','he','him','his','himself','she','her','hers','herself',
                  'it','its','itself','they','them','their','theirs','themselves','what','which',
                  'who','whom','this','that','these','those','am','is','are','was','were','be',
                  'been','being','have','has','had','having','do','does','did','doing','a','an',
                  'the','and','but','if','or','because','as','until','while','of','at','by','for',
                  'with','about','against','between','into','through','during','before','after',
                  'above','below','to','from','up','down','in','out','on','off','over','under',
                  'again','further','then','once','here','there','when','where','why','how','all',
                  'any','both','each','few','more','most','other','some','such','no','nor','not',
                  'only','own','same','so','than','too','very','s','t','can','will','just','don',
                  'should','now','d','ll','m','o','re','ve','y','ain','aren','couldn','didn',
                  'doesn','hadn','hasn','haven','isn','ma','mightn','mustn','needn','shan',
                  'shouldn','wasn','weren','won','wouldn']

    x, y = [], []
    with open(filename, "r") as f:
        for line in f:
            line = re.sub(r'\W+', ' ', line).strip().lower()  # perhaps don't make words lowercase?
            x.append(line[:-1])
            x[-1] = ' '.join(word for word in x[-1].split() if word not in stop_words)
            y.append(line[-1])
    return x, np.array(y, dtype=int)

In [3]:
def get_vocab(dataset):
    '''
    :param dataset: the text from load_data

    :return: a _ordered_ dictionary from words to counts
    '''
    vocab = {}

    # create a counter for each word
    for example in dataset:
        example_as_list = example.split()
        for word in example_as_list:
            vocab[word] = 0

    for example in dataset:
        example_as_list = example.split()
        for word in example_as_list:
            vocab[word] += 1
    
    # sort from greatest to least by count
    return collections.OrderedDict(sorted(vocab.items(), key=lambda x: x[1], reverse=True))

In [4]:
def text_to_rank(dataset, _vocab, desired_vocab_size=5000):
    '''
    :param dataset: the text from load_data
    :vocab: a _ordered_ dictionary of vocab words and counts from get_vocab
    :param desired_vocab_size: the desired vocabulary size
    words no longer in vocab become UUUNNNKKK
    :return: the text corpus with words mapped to their vocab rank,
    with all sufficiently infrequent words mapped to UUUNNNKKK; UUUNNNKKK has rank desired_vocab_size
    (the infrequent word cutoff is determined by desired_vocab size)
    '''
    _dataset = dataset[:]     # aliasing safeguard
    vocab_ordered = list(_vocab)
    count_cutoff = _vocab[vocab_ordered[desired_vocab_size-1]] # get word by its rank and map to its count
    
    word_to_rank = {}
    for i in range(len(vocab_ordered)):
        # we add one to make room for any future padding symbol with value 0
        word_to_rank[vocab_ordered[i]] = i + 1
    
    # we need to ensure that other words below the word on the edge of our desired_vocab size
    # are not also on the count cutoff, so we subtract a bit
    # this is likely quicker than adding another preventative if case
    for i in range(50):
        _vocab[vocab_ordered[desired_vocab_size+i]] -= 0.1
    
    for i in range(len(_dataset)):
        example = _dataset[i]
        example_as_list = example.split()
        for j in range(len(example_as_list)):
            try:
                if _vocab[example_as_list[j]] >= count_cutoff:
                    example_as_list[j] = word_to_rank[example_as_list[j]] 
                else:
                    example_as_list[j] = desired_vocab_size  # UUUNNNKKK
            except:
                example_as_list[j] = desired_vocab_size  # UUUNNNKKK
        _dataset[i] = example_as_list

    return _dataset

In [5]:
# taken from keras
def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
    '''Pads each sequence to the same length:
    the length of the longest sequence.
    If maxlen is provided, any sequence longer
    than maxlen is truncated to maxlen.
    Truncation happens off either the beginning (default) or
    the end of the sequence.
    Supports post-padding and pre-padding (default).
    # Arguments
        sequences: list of lists where each element is a sequence
        maxlen: int, maximum length
        dtype: type to cast the resulting sequence.
        padding: 'pre' or 'post', pad either before or after each sequence.
        truncating: 'pre' or 'post', remove values from sequences larger than
            maxlen either in the beginning or in the end of the sequence
        value: float, value to pad the sequences to the desired value.
    # Returns
        x: numpy array with dimensions (number_of_sequences, maxlen)
    '''
    lengths = [len(s) for s in sequences]

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x

In [31]:
max_example_len = 400
batch_size = 32
embedding_dims = 50
vocab_size = 5000
training_epochs = 20

print('Loading Data')
X_train, Y_train = load_data()
X_dev, Y_dev = load_data('./data/imdb.dev')
X_test, Y_test = load_data('./data/imdb.test')

vocab = get_vocab(X_train)
X_train = text_to_rank(X_train, vocab, 5000)
X_dev = text_to_rank(X_dev, vocab, 5000)
X_test = text_to_rank(X_test, vocab, 5000)

X_train = pad_sequences(X_train, maxlen=max_example_len)
X_dev = pad_sequences(X_dev, maxlen=max_example_len)
X_test = pad_sequences(X_test, maxlen=max_example_len)
print('Data loaded')

num_examples = Y_train.shape[0]
num_batches = num_examples//batch_size


Loading Data
Data loaded

In [113]:
def train_and_test(mode="c_is_softmax_prob", seed=100, learning_rate=0.001,
                   X_train=X_train, Y_train=Y_train, X_test=X_test, Y_test=Y_test):
    '''
    modes: c_is_softmax_prob, c_is_trained_softmax_prob, c_is_cotrained_sigmoid, c_is_auxiliary_sigmoid
    '''

    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(seed)  # seed set upon graph construction; does not work
        
        x = tf.placeholder(dtype=tf.int32, shape=[None, max_example_len])
        y = tf.placeholder(dtype=tf.int64, shape=[None])

        def gelu(x):
            return 0.5 * x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
        f = gelu

        W = {}
        b = {}

        with tf.variable_scope("classifier"):
            W['embedding'] = tf.Variable(tf.nn.l2_normalize(
                tf.random_normal([vocab_size+1, embedding_dims]), 0), trainable=True)
            W['logits'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([embedding_dims, 2]), 0))
            
            b['logits'] = tf.Variable(tf.zeros([2]))

        with tf.variable_scope("confidence_scorer"):
            W['hidden_to_conf1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal(
                        [embedding_dims, embedding_dims//2]), 0))
            W['logits_to_conf1'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([2, embedding_dims//2]), 0))
            W['conf'] = tf.Variable(tf.nn.l2_normalize(tf.random_normal([embedding_dims//2, 1]), 0))

            b['conf1'] = tf.Variable(tf.zeros([embedding_dims//2]))
            b['conf'] = tf.Variable(tf.zeros([1]))

        def cautious_fcn(x):
            w_vecs = tf.nn.embedding_lookup(W['embedding'], x)
            pooled = tf.reduce_mean(w_vecs, reduction_indices=[1])

            logits_out = tf.matmul(pooled, W['logits']) + b['logits']

            conf1 = f(tf.matmul(logits_out, W['logits_to_conf1']) +
                        tf.matmul(pooled, W['hidden_to_conf1']) + b['conf1'])
            conf_out = tf.matmul(conf1, W['conf']) + b['conf']

            return logits_out, tf.squeeze(conf_out)

        logits, confidence_logit = cautious_fcn(x)

        right_answer = tf.stop_gradient(tf.to_float(tf.equal(tf.argmax(logits, 1), y)))
        compute_error = 100*tf.reduce_mean(1 - right_answer)

        classification_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y))
        if "softmax" in mode:
            confidence_logit = tf.reduce_max(tf.nn.softmax(logits), reduction_indices=[1])
            caution_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(confidence_logit, right_answer))
            
            # cc_loss is cautious classification loss
            if mode == "c_is_trained_softmax_prob":
                cc_loss = classification_loss + caution_loss
            else:
                cc_loss = classification_loss
        
        elif mode == "c_is_cotrained_sigmoid":
            caution_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(confidence_logit, right_answer))
            cc_loss = classification_loss + caution_loss
            confidence = tf.sigmoid(confidence_logit)
        elif mode == "c_is_auxiliary_sigmoid":
            caution_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(confidence_logit, right_answer))
            cc_loss = classification_loss  # we use caution_loss after training normal classifier
        else:
            assert False, "Invalid mode specified"
        
        cc_calibration_score = tf.reduce_mean((2 * right_answer - 1) * (2 * tf.sigmoid(confidence_logit) - 1))
        cc_model_score = tf.reduce_mean(right_answer * ((2 * right_answer - 1) * (2 * tf.sigmoid(confidence_logit) - 1)+ 1)/2)
        
        # cautious classification perplexity
        cc_calibration_perplexity = tf.exp(caution_loss)
        cc_model_perplexity = tf.exp(caution_loss + classification_loss)
        
        lr = tf.constant(learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(cc_loss)

    sess = tf.InteractiveSession(graph=graph)
    
    if "softmax" in mode:
        sess.run(tf.initialize_all_variables())
    
    elif mode == "c_is_cotrained_sigmoid":
        sess.run(tf.initialize_all_variables())
    
    elif mode == "c_is_auxiliary_sigmoid":
        thawed_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "classifier")
        frozen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "confidence_scorer")
        sess.run(tf.initialize_variables(set(tf.all_variables()) - set(frozen_vars)))
    
    err_ema = 90
    cc_calibration_perp_ema = 10
    cc_model_perp_ema = 10
    cc_calibration_score_ema = -1
    cc_model_score_ema = -1
    
    for epoch in range(1,training_epochs+1):
        # shuffle data every epoch
        indices = np.arange(num_examples)
        np.random.shuffle(indices)
        X_train = X_train[indices]
        Y_train = Y_train[indices]

        for i in range(num_batches):
            offset = i * batch_size
            bx = X_train[offset:offset + batch_size]
            by = Y_train[offset:offset + batch_size]

            if mode != "c_is_auxiliary_sigmoid":

                _, err, cc_model_score_curr, cc_calibration_score_curr,\
                cc_model_perp_curr, cc_calibration_perp_curr = sess.run([
                        optimizer, compute_error, cc_model_score, cc_calibration_score,
                        cc_model_perplexity, cc_calibration_perplexity],
                     feed_dict={x: bx, y: by, lr: learning_rate})

                err_ema = err_ema * 0.95 + 0.05 * err
                cc_calibration_perp_ema = cc_calibration_perp_ema * 0.95 + 0.05 * cc_calibration_perp_curr
                cc_model_perp_ema = cc_model_perp_ema * 0.95 + 0.05 * cc_model_perp_curr
                cc_calibration_score_ema = cc_calibration_score_ema * 0.95 + 0.05 * cc_calibration_score_curr
                cc_model_score_ema = cc_model_score_ema * 0.95 + 0.05 * cc_model_score_curr
            else:
                _, err, l = sess.run([optimizer, compute_error, cc_loss],
                                     feed_dict={x: bx, y: by, lr: learning_rate})
                err_ema = err_ema * 0.95 + 0.05 * err

        if epoch % 10 == 0:
            print('Epoch', epoch, ' | ', 'Current Classification Error (%)', err_ema)
            if mode != "c_is_auxiliary_sigmoid":
                print('Epoch', epoch, ' | ', 'Cautious Classification Calibration Perp', cc_calibration_perp_ema)
                print('Epoch', epoch, ' | ', 'Cautious Classification Model Perp', cc_model_perp_ema)
                print('Epoch', epoch, ' | ', 'Cautious Classification Calibration Score', cc_calibration_score_ema)
                print('Epoch', epoch, ' | ', 'Cautious Classification Model Score', cc_model_score_ema)

    if mode == "c_is_auxiliary_sigmoid":
        # train sigmoid separately from the classifier
        phase2_vars = list(set(tf.all_variables()) - set(thawed_vars))
        optimizer2 = tf.train.AdamOptimizer(learning_rate=0.001).minimize(caution_loss, var_list=phase2_vars)
        sess.run(tf.initialize_variables(set(tf.all_variables()) - set(thawed_vars)))
        
        for epoch in range(2):
            # shuffle data every epoch
            indices = np.arange(num_examples)
            np.random.shuffle(indices)
            X_train = X_train[indices]
            Y_train = Y_train[indices]

            for i in range(num_batches):
                bx = X_train[offset:offset + batch_size]
                by = Y_train[offset:offset + batch_size]

                sess.run([optimizer2], feed_dict={x: bx, y: by})

    err, cc_model_score_test, cc_calibration_score_test,\
    cc_model_perp_test, cc_calibration_perp_test = sess.run([
                    compute_error, cc_model_score, cc_calibration_score,
                    cc_model_perplexity, cc_calibration_perplexity],
                                  feed_dict={x: X_test, y: Y_test})

    print('Test Classification Error (%)', err)
    print('Test Cautious Classification Calibration Perp', cc_calibration_perp_test)
    print('Test Cautious Classification Model Perp', cc_model_perp_test)
    print('Test Cautious Classification Calibration Score', cc_calibration_score_test)
    print('Test Cautious Classification Model Score', cc_model_score_test)

    sess.close()

In [118]:
train_and_test()
train_and_test()
train_and_test()


Epoch 10  |  Current Classification Error (%) 8.04213016869
Epoch 10  |  Cautious Classification Calibration Perp 1.4913142399
Epoch 10  |  Cautious Classification Model Perp 1.84001342238
Epoch 10  |  Cautious Classification Calibration Score 0.366954381777
Epoch 10  |  Cautious Classification Model Score 0.656664406814
Test Classification Error (%) 12.3
Test Cautious Classification Calibration Perp 1.54706
Test Cautious Classification Model Perp 2.1239
Test Cautious Classification Calibration Score 0.331525
Test Cautious Classification Model Score 0.626295
Epoch 10  |  Current Classification Error (%) 9.01776217802
Epoch 10  |  Cautious Classification Calibration Perp 1.5067733706
Epoch 10  |  Cautious Classification Model Perp 1.93401051969
Epoch 10  |  Cautious Classification Calibration Score 0.35768268921
Epoch 10  |  Cautious Classification Model Score 0.649408129246
Test Classification Error (%) 12.968
Test Cautious Classification Calibration Perp 1.55451
Test Cautious Classification Model Perp 2.16295
Test Cautious Classification Calibration Score 0.327657
Test Cautious Classification Model Score 0.622469
Epoch 10  |  Current Classification Error (%) 8.74696441255
Epoch 10  |  Cautious Classification Calibration Perp 1.49859240673
Epoch 10  |  Cautious Classification Model Perp 1.85765795816
Epoch 10  |  Cautious Classification Calibration Score 0.362969619449
Epoch 10  |  Cautious Classification Model Score 0.652586620277
Test Classification Error (%) 13.996
Test Cautious Classification Calibration Perp 1.57026
Test Cautious Classification Model Perp 2.23955
Test Cautious Classification Calibration Score 0.317181
Test Cautious Classification Model Score 0.613974

In [119]:
train_and_test("c_is_cotrained_sigmoid")
train_and_test("c_is_cotrained_sigmoid")
train_and_test("c_is_cotrained_sigmoid")


Epoch 10  |  Current Classification Error (%) 9.06342513393
Epoch 10  |  Cautious Classification Calibration Perp 1.26285863247
Epoch 10  |  Cautious Classification Model Perp 1.61074784375
Epoch 10  |  Cautious Classification Calibration Score 0.727877072048
Epoch 10  |  Cautious Classification Model Score 0.839024624966
Test Classification Error (%) 11.968
Test Cautious Classification Calibration Perp 1.34651
Test Cautious Classification Model Perp 1.81795
Test Cautious Classification Calibration Score 0.678708
Test Cautious Classification Model Score 0.810641
Epoch 10  |  Current Classification Error (%) 8.52237364392
Epoch 10  |  Cautious Classification Calibration Perp 1.24786365961
Epoch 10  |  Cautious Classification Model Perp 1.59730933445
Epoch 10  |  Cautious Classification Calibration Score 0.729640300553
Epoch 10  |  Cautious Classification Model Score 0.842416382632
Test Classification Error (%) 12.056
Test Cautious Classification Calibration Perp 1.35883
Test Cautious Classification Model Perp 1.837
Test Cautious Classification Calibration Score 0.692414
Test Cautious Classification Model Score 0.818819
Epoch 10  |  Current Classification Error (%) 10.1644734978
Epoch 10  |  Cautious Classification Calibration Perp 1.31099227112
Epoch 10  |  Cautious Classification Model Perp 1.75256107516
Epoch 10  |  Cautious Classification Calibration Score 0.675811019478
Epoch 10  |  Cautious Classification Model Score 0.808336328174
Test Classification Error (%) 12.1
Test Cautious Classification Calibration Perp 1.35823
Test Cautious Classification Model Perp 1.83806
Test Cautious Classification Calibration Score 0.689054
Test Cautious Classification Model Score 0.817251

In [116]:
train_and_test("c_is_auxiliary_sigmoid")
# looks like it needs some regularization
# it probably has learned to guess extremely and positively
# or perhaps it just doesn't decreased accuracy like the cotrained sigmoid may
# so maybe change emphasis on cotrained sigmoid


Epoch 10  |  Current Classification Error (%) 9.82396211464
Test Classification Error (%) 24.164
Test Cautious Classification Calibration Perp 1.4518
Test Cautious Classification Model Perp 2.6351
Test Cautious Classification Calibration Score 0.648539
Test Cautious Classification Model Score 0.655682