Captioning Model(s)

This notebook runs the gamut from very basic to very recent models:

  • Take the featurized images (2048d), and tokenised captions
  • Have 'pluggable' input and output transforms for each word :
    • Concat : (256 one-hot - including '0'=mask, '1'={UNK}, '2'={START}, '3'={STOP}, '4'={UseOther})
    • (a) UseOther + (8192-250 more one-hot entries)
    • (b) UseOther + (50d GloVe embedding, decoded by nearest-neighbour)
    • (c) UseOther + (log2(8192)==13 bits of the word index, error-corrected via 3 copies averaged - see the toy sketch after this list)
  • LSTM / GRU
    • 64d or 200d of hidden units for the RNNs
    • Choice of number of layers
    • Use features as initialisation input for hidden units
  • CNN (with dilation "DeepMind-style")
    • 64d or 200d input width, with 1d convolutions run over it
    • Choice of layer layout
    • Use image features as :
      • additional channel of input for every timestep with 16x 1x1 convolutions on top; or
      • use as additional bias input for word embedding inputs; or
      • use an attention-like mechanism to match at each step of the processing
  • CNN (including residual layer skips and Gated-Linear-Units "Facebook-style")
    • 200d input width, with 1d convolutions run over it
    • Use image features concatenated onto the word embedding at every step
  • Attention-Is-All-You-Need ("Google-style")
    • 200d input width, with a new implementation of the AIAYN model in Keras
    • Switchable layouts :
      • Can switch off caption self-attention
      • Can use more than 1 'layer' deep
  • Have a final (BLEU) score for test cases - see the sketch at the very end
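
As a toy illustration of option (c) - separate from the RepresentAs_OneHotBasePlusBinaryIdx class implemented further down (encode_idx / decode_idx here are made-up names) - the word index can be written as 13 bits, repeated 3 times, and recovered after noise by averaging the copies:

import numpy as np

LOG2_VOCAB = 13                               # 8192 == 2**13 possible word indices
powers_of_2 = 2 ** np.arange(LOG2_VOCAB)

def encode_idx(idx):
    bits = ((idx & powers_of_2) > 0).astype('float32')
    return np.concatenate([bits, bits, bits])   # 3 redundant copies of the 13 bits

def decode_idx(noisy):
    bits = noisy.reshape(3, LOG2_VOCAB).mean(axis=0) > 0.5   # average the copies, then threshold
    return int(powers_of_2.dot(bits))

idx = 1234
noisy = encode_idx(idx) + np.random.normal(scale=0.2, size=3 * LOG2_VOCAB)
print(decode_idx(noisy) == idx)   # almost always True, despite the added noise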

In [1]:
import os

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import random
import pickle

TRAIN_PCT=0.9

In [2]:
# Load in the captions/corpus/embedding
with open('./data/cache/CAPTIONS_data_Flickr30k_2017-06-07_23-15.pkl', 'rb') as f:
    text_data=pickle.load(f, encoding='iso-8859-1')

"""
text_data ~ dict(
    img_to_captions = img_to_valid_captions,
    
    action_words = action_words, 
    stop_words = stop_words_sorted,
    
    embedding = embedding,
    embedding_word_arr = embedding_word_arr,
    
    img_arr = img_arr_save,
    train_test = np.random.random( (len(img_arr_save),) ),
)"""

embedding = text_data['embedding']
embedding_eps = 0.00001
embedding_normed = embedding / np.maximum( np.linalg.norm(embedding, axis=1, keepdims=True), embedding_eps)
vocab_arr = text_data['embedding_word_arr']
dictionary = { w:i for i,w in enumerate(vocab_arr) }

img_arr_train = [ img for i, img in enumerate(text_data['img_arr']) if text_data['train_test'][i]<TRAIN_PCT ]
caption_arr_train = [ (img, caption) for img in img_arr_train for caption in text_data['img_to_captions'][img] ]

print("Loaded captions, corpus and embedding")

In [3]:
# Load in the features
with open('./data/cache/FEATURES_data_Flickr30k_flickr30k-images_2017-06-06_18-07.pkl', 'rb') as f:
    image_data=pickle.load(f, encoding='iso-8859-1')

"""
image_data ~ dict(
    features = features,
    img_arr = img_arr,
)
"""
feature_arr = image_data['features']
image_feature_idx = { img:idx for idx, img in enumerate(image_data['img_arr']) }

print("Loaded dim(%d) image features for %d images" % (feature_arr.shape[1], feature_arr.shape[0]))

In [4]:
CAPTION_LEN = 32
EMBEDDING_DIM = embedding.shape[1]

VOCAB_SIZE = len(vocab_arr)
LOG2_VOCAB_SIZE = 13  # 1024->10, 8192->13
if not (2**LOG2_VOCAB_SIZE/2) < VOCAB_SIZE < 2**LOG2_VOCAB_SIZE:
    print("LOG2_VOCAB_SIZE incorrect")

In [5]:
def caption_to_idx_arr(caption):  # This is actually 1 longer than CAPTION_LEN - need to shift about a bit later
    ret = np.zeros( (CAPTION_LEN+1,), dtype='int32')  # {MASK}.idx===0
    i=0
    ret[i] = dictionary['{START}']
    #print(len(caption.split()), caption)
    for w in caption.lower().split():
        i += 1
        ret[i] = dictionary.get(w, dictionary['{UNK}'])
    ret[i+1] = dictionary['{STOP}']
    return ret


In [7]:
# This is re-done below, since better to choose over full range of captions, 
#   rather than randomly within shuffled images
def caption_training_example():
    img_arr = img_arr_train
    while True:
        random.shuffle( img_arr )
        for img in img_arr:
            captions = text_data['img_to_captions'][img]
            caption = random.choice(captions)
            print(caption)
            yield image_feature_idx[ img ], caption_to_idx_arr( caption )
        print("Captions : Looping")
caption_training_example_gen = caption_training_example()

In [8]:
next(caption_training_example_gen)

TensorFlow / Keras imports


In [9]:
#    import tensorflow.contrib.keras
#    import tensorflow.contrib.keras.backend as K
#    from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical
#    from tensorflow.contrib.keras.api.keras.losses import cosine_proximity, categorical_crossentropy, mean_squared_error
#    from tensorflow.contrib.keras.api.keras.activations import softmax, sigmoid
#    from tensorflow.contrib.keras.api.keras.layers import Input, Masking, Dense, GRU
#    from tensorflow.contrib.keras.api.keras.layers import Activation, Conv1D, Dropout, BatchNormalization
#    from tensorflow.contrib.keras.api.keras.layers import RepeatVector, Concatenate, Add, Multiply
#    from tensorflow.contrib.keras.api.keras.layers import Permute, Reshape, Dot, Lambda
#    from tensorflow.contrib.keras.api.keras.optimizers import RMSprop, Adam
#    from tensorflow.contrib.keras.api.keras.models import Model
#    from tensorflow.contrib.keras.python.keras.layers import TimeDistributed

# This needs keras 2.0.4 for RNN initial_state fixes.  tf.contrib.keras LAGS BADLY as of 2017-06-20
import keras
import keras.backend as K
from keras.utils.np_utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy, mean_squared_error
from keras.activations import softmax, sigmoid
from keras.layers import Input, Masking, Dense, GRU
from keras.layers import Activation, Conv1D, Dropout, BatchNormalization
from keras.layers import RepeatVector, Concatenate, Add, Multiply
from keras.layers import Permute, Reshape, Dot, Lambda
from keras.optimizers import RMSprop, Adam
from keras.models import Model

from keras.layers import TimeDistributed

Create pluggable IO stages for words


In [10]:
def pick_close_embedding_idx_FIXED_SET(network_output, out_of_top=8):
    # cosine distance over whole embedding space
    emb_normed = network_output/np.maximum( np.linalg.norm(network_output), embedding_eps)
    scores = embedding_normed.dot(emb_normed)
    # Find indexes of the top==last few: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argpartition.html
    top_idx = np.argpartition(scores, -out_of_top)[-out_of_top:]
    top_val = np.maximum( scores[top_idx], embedding_eps)
    lo, hi = top_val.min(), top_val.max()
    #top_prob = top_val/np.sum(top_val)
    top_scaled = (top_val-lo)/(hi-lo)
    top_prob = top_scaled/np.sum(top_scaled)
    print([ (top_val[i], top_prob[i], vocab_arr[top_idx[i]]) for i in range(out_of_top)])
    top_choice = np.random.choice(out_of_top, p=top_prob )
    return top_idx[ top_choice ], top_prob[ top_choice ]

def pick_close_embedding_idx(network_output, expand_by=1.1, debug=False):
    # cosine distance over whole embedding space
    emb_normed = network_output/np.maximum( np.linalg.norm(network_output), embedding_eps)
    scores = embedding_normed.dot(emb_normed)
    
    # Find index(es) of the best matches
    best_idx = np.argmax(scores)
    top_hurdle = 1.0 - (1.0 - scores[best_idx])*expand_by  # Expand the margin away from perfect(==1.0)
    
    # Now look at all of the scores that are above the hurdle (expand from the best a bit)
    top_idx, = np.nonzero(scores > (top_hurdle-embedding_eps)) 
    n_top = top_idx.shape[0]
    top_val = np.maximum( scores[top_idx], embedding_eps)
    #lo, hi = top_val.min(), top_val.max()
    top_prob = top_val/np.sum(top_val)
    #top_scaled = (top_val-lo)/(hi-lo)
    #top_prob = top_scaled/np.sum(top_scaled)
    if debug: print([ (top_val[i], top_prob[i], vocab_arr[top_idx[i]]) for i in range(n_top)])
    top_choice = np.random.choice(n_top, p=top_prob )
    return top_idx[ top_choice ], top_prob[ top_choice ]

In [11]:
if True:  # Test out the embedding decoder 
    #vec = embedding[ dictionary['to'] ]
    #vec = embedding[ dictionary['queen'] ]
    vec = embedding[ dictionary['queen'] ] + embedding[ dictionary['king'] ]
    for i in range(10):
        idx,prob = pick_close_embedding_idx(vec, debug=i==0)
        print("  %5.2f : %s" % (prob, vocab_arr[idx],  ))

In [12]:
def pick_softmax_idx(network_output):
    e_x = np.exp(network_output - np.max(network_output))
    smx = e_x / e_x.sum()  # softmax of the array, better conditioned by the line above
    top_idx = np.random.choice(smx.shape[0], p=smx)
    return top_idx, smx[top_idx]

In [13]:
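# NB: these RepresentAs_* classes are used as plain namespaces (selected via 'io = RepresentAs_...'
#     and never instantiated), so their methods deliberately take no 'self' argument.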
class RepresentAs_FullEmbedding():
    name = 'FullEmbedding'
    width = EMBEDDING_DIM
    
    def encode(caption_arr):
        # plain embedding of each symbol
        return embedding[ caption_arr, : ]

    def loss_fn(ideal_output, network_output):  # y_true, y_pred
        return cosine_proximity( ideal_output, network_output )
    
    def decode(network_output):
        #return '{UNK}', 1.0  # placeholder (word and confidence level)
        idx, prob = pick_close_embedding_idx(network_output)
        return vocab_arr[ idx ], prob
    
class RepresentAs_FullOneHot():
    name = 'FullOneHot'
    width = VOCAB_SIZE
    
    def encode(caption_arr):
        # Output desired is one-hot of each symbol (cross entropy match whole thing) 
        return to_categorical(caption_arr, num_classes=VOCAB_SIZE) 
    
    def loss_fn(ideal_output, network_output):  # y_true, y_pred
        smx = softmax(network_output, axis=-1)
        return categorical_crossentropy( ideal_output, smx )
    
    def decode(network_output):
        idx, prob = pick_softmax_idx(network_output)
        return vocab_arr[ idx ], prob

In [14]:
base_width = len(text_data['action_words']) + len(text_data['stop_words'])
MASK_idx = dictionary['{MASK}']   # ==0
EXTRA_idx = dictionary['{EXTRA}']
UNK_idx = dictionary['{UNK}']

def OneHotBasePlus(arr): # Contains indexing magic
    #  Arrangement should be :: (samples, timesteps, features),
    one_hot_base_plus = np.zeros( (CAPTION_LEN, base_width), dtype='float32')
    # Set the indicator for entries not in action or stop words
    one_hot_base_plus[ arr>=base_width, EXTRA_idx ] = 1.0
    # Set the one-hot for everything in the one-hot-region
    one_hot_base_plus[ arr< base_width, arr[np.where(arr<base_width)] ] = 1.0
    # Force masked values to all-zeros
    one_hot_base_plus[ arr==0, MASK_idx ] = 0.0
    return one_hot_base_plus

class RepresentAs_OneHotBasePlusEmbedding():
    name = 'OneHotBasePlusEmbedding'
    width = base_width + EMBEDDING_DIM

    def encode(caption_arr): 
        # Input is onehot for first part, with the embedding included for all words too
        return np.hstack( [ OneHotBasePlus(caption_arr), embedding[caption_arr] ] )
    
    def loss_fn(ideal_output, network_output):  # y_true, y_pred
        # One-hot of each action symbol and stop words (cross entropy match these) and 
        #   cosine-proximity on remaining embedding (weighted according to onehot[{EXTRA}]~0...1)
        
        #print("ideal_output.shape", ideal_output.shape)     # ideal_output.shape (?, ?, ?)
        #print("network_output.shape", network_output.shape) # network_output.shape (?, 32, 191)
        
        # Perhaps need this idea https://github.com/fchollet/keras/issues/890:
        smx = softmax(network_output[:, :, :base_width], axis=-1)
        #print("smx.shape", smx.shape) # smx.shape (?, 32, 141)
        
        is_extra = smx[:, :, EXTRA_idx]
        one_hot_loss = categorical_crossentropy( ideal_output[:, :, :base_width], smx )    
        embedding_loss = cosine_proximity( ideal_output[:, :, base_width:], 
                                                network_output[:, :, base_width:] )
        
        return (1.-is_extra)*one_hot_loss + (is_extra)*embedding_loss
    
    def decode(network_output):
        idx, prob = pick_softmax_idx(network_output[:base_width])
        if idx==EXTRA_idx:
            idx, prob2 = pick_close_embedding_idx(network_output[base_width:])
            prob *= prob2
        return vocab_arr[ idx ], prob
    
    
POWERS_OF_2 = 2**np.arange(LOG2_VOCAB_SIZE)
class RepresentAs_OneHotBasePlusBinaryIdx():
    name = 'OneHotBasePlusBinaryIdx'
    width = base_width + 3*LOG2_VOCAB_SIZE

    def encode(caption_arr):
        # Input is onehot for first part, with 3 copies of the binary index of all words afterwards
        #   Idea is from : https://arxiv.org/abs/1704.06918

        # Thanks to : https://stackoverflow.com/questions/21918267/
        #         convert-decimal-range-to-numpy-array-with-each-bit-being-an-array-element
        binary = (caption_arr[:, np.newaxis] & POWERS_OF_2) / POWERS_OF_2
        binary -= 0.5  # symmetrical around 0
        
        return np.hstack( [ OneHotBasePlus(caption_arr), binary, binary, binary ] )
  
    def loss_fn(ideal_output, network_output):  # y_true, y_pred
        # NB: slice the feature axis (last), not the batch axis - shapes are (batch, timesteps, features)
        smx = softmax(network_output[:, :, :base_width], axis=-1)
        sig = sigmoid(network_output[:, :, base_width:])
        
        is_extra = smx[:, :, EXTRA_idx]
        one_hot_loss = categorical_crossentropy( ideal_output[:, :, :base_width], smx )
        #binary_loss  = categorical_crossentropy( ideal_output[:, :, base_width:], sig )
        binary_loss  = mean_squared_error( ideal_output[:, :, base_width:], sig )  # reported better in paper
        return (1.-is_extra)*one_hot_loss + (is_extra)*binary_loss
    
    def decode(network_output, debug=False):
        idx, prob = pick_softmax_idx(network_output[:base_width])
        if idx==EXTRA_idx:
            sig = 1./(1. + np.exp(-network_output[base_width:] ))
            #print(sig)
            binary = np.mean( sig.reshape( (3, LOG2_VOCAB_SIZE) ), axis=0 )>0.5
            if debug: 
                print( sig )
                #print( sig.shape )
                print( sig.reshape( (3, LOG2_VOCAB_SIZE) ) )
            # TODO : more work to do something (a) stochastic, and (b) give 'prob' measure
            idx = POWERS_OF_2.dot(binary>0.5)
            if idx>=VOCAB_SIZE: idx = UNK_idx  # decoded bits may point outside the vocabulary
        return vocab_arr[ idx ], prob

Just a little test area for the above


In [15]:
#io = RepresentAs_FullEmbedding             # .encode : 32, 50
#io = RepresentAs_FullOneHot                # .encode : 32, 6946
io = RepresentAs_OneHotBasePlusEmbedding   # .encode : 32, 191 
#io = RepresentAs_OneHotBasePlusBinaryIdx   # .encode : 32, 180
io.width

caption_sample = 'The cat sat on the mat .'
caption_sample_idx = caption_to_idx_arr(caption_sample)
caption_sample_idx  # array([   2,    8, 1461, 2496,   11,    8,  998,    5,    3,    0,    0...

onehot_start=range(0,12)
x=OneHotBasePlus(caption_sample_idx[:-1])  # Drop the final index so the length is CAPTION_LEN
#x
#x[onehot_start, dictionary['{MASK}']]
#x[onehot_start, dictionary['{START}']]
#x[onehot_start,  EXTRA_idx]
#x[onehot_start, dictionary['{STOP}']]
#x[onehot_start, dictionary['on']]

#x.shape  # 32, 141 
#embedding[caption_sample_idx[:-1]].shape  # 32, 50

if False:
    powers_of_two = 2**np.arange(LOG2_VOCAB_SIZE)
    (caption_sample_idx[:, np.newaxis] & powers_of_two) / powers_of_two

io.encode( caption_sample_idx[:-1] ).shape  # [0:6,:]

In [16]:
# uses io as defined above
per_caption = CAPTION_LEN//2
for i in range(0, len(vocab_arr), per_caption):
    caption_sample = ' '.join( vocab_arr[i : i+per_caption ] )
    caption_sample_idx = caption_to_idx_arr(caption_sample)
    print("%4d : %s " % (i,caption_sample,))
    x=io.encode(caption_sample_idx[:-1])  # This is now an embedding
    #print(x.shape)
    #print( x[1, -LOG2_VOCAB_SIZE:]+0.5)
    x *= 12.0  # Scaling up the logits sharpens the softmax decode; the cosine-based embedding decode is scale-invariant
    caption_restore = ' '.join([ io.decode(x[i+1])[0] for i in range(per_caption)])  # , debug=i==0
    print("       %s " % (caption_restore,))
    if i>250: break

Create Batches for given embeddings


In [17]:
#BATCH_SIZE=16    # Titan-X occupancy ~ 37%, epoch ~ 450sec for GRUs
BATCH_SIZE=64    # Titan-X occupancy ~ 32%, epoch ~ 300sec for GRUs

In [18]:
def caption_example_generator():
    while True:
        random.shuffle( caption_arr_train ) 
        for img, caption in caption_arr_train:
            if len(caption.split())>CAPTION_LEN-2 : continue # Skip captions that are too long
            yield img, caption

def caption_training_batch_generator(emb_input, emb_output, batch_size=BATCH_SIZE):
    caption_example_gen = caption_example_generator()
    while True:
        X0, X1, Y = [],[], []
        for _ in range(batch_size):
            img, caption = next( caption_example_gen )
            caption_idx = caption_to_idx_arr(caption)
            X0.append( feature_arr[ image_feature_idx[img] ]  )
            X1.append( emb_input.encode(caption_idx[:-1]) )
            Y .append( emb_output.encode(caption_idx[1:]) )
        yield [np.array(X0), np.array(X1)], [np.array(Y)]

In [19]:
caption_example_gen = caption_example_generator()
next(caption_example_gen)

Functions that return Models

Here, the caption input is a batch of captions encoded by one of the RepresentAs_* classes above, so its width and meaning depend on the representation chosen. The features input is just a batch of the InceptionV3 features already loaded.
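
For a concrete sanity check (a sketch only - it assumes the emb_input / emb_output choices made further below), one batch from caption_training_batch_generator unpacks like this:

# Sketch : shapes of one training batch
[X_features, X_captions], [Y_captions] = next(
    caption_training_batch_generator(RepresentAs_OneHotBasePlusEmbedding, RepresentAs_FullOneHot))

print(X_features.shape)  # (BATCH_SIZE, 2048)              : InceptionV3 image features
print(X_captions.shape)  # (BATCH_SIZE, CAPTION_LEN, 191)  : encoded caption input  ~ caption_idx[:-1]
print(Y_captions.shape)  # (BATCH_SIZE, CAPTION_LEN, 6946) : encoded caption target ~ caption_idx[1:]

The models below all take [features, encoded-captions] as inputs and are trained to predict the encoded caption shifted along by one position.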

Plain xxx2seq model

This is a 'plain' xxx2seq kind of model, where the image feature vector is fed in as the initial hidden state of the RNN (which is composed of GRUs).


In [20]:
def RNN_captioner(rnn_count, 
                  caption_input_shape=None, feature_shape=None, caption_output_shape=None, 
                  levels=1):  
    feature_in = Input(shape=feature_shape, dtype='float32', name='feature_input')
    feature_downsize = Dense(rnn_count)(feature_in) 
    
    caption_in = Input(shape=caption_input_shape, dtype='float32', name='caption_input')
    #masked = Masking(mask_value=0.)(caption_in)  # , input_shape=caption_input_shape
    masked = caption_in # Ignore the masking thing - this will be implicit in the scoring

    #  initial_state = feature_downsize :: 
    # See : https://github.com/fchollet/keras/issues/2995
    #       https://github.com/fchollet/keras/pull/3947 (closed, unmerged)
    # Hidden in the code : https://github.com/fchollet/keras/blob/master/keras/layers/recurrent.py#L178
    
    rnn = masked
    
    # The recurrent layer
    #rec1 = GRU(rnn_count, initial_state=feature_downsize, return_sequences=True)(rnn)
    rnn = GRU(rnn_count, return_sequences=True)(rnn, initial_state=feature_downsize)
    #rec1 = GRU(rnn_count, return_sequences=True)(rnn)

    for _ in range(levels-1):
        rnn = GRU(rnn_count, return_sequences=True)(rnn)
    
    rnn_outputs = rnn

    #print( (BATCH_SIZE, caption_input_shape[0], rnn_count) )
    caption_out = TimeDistributed( Dense(caption_output_shape[1], activation='linear'), # activation='softmax'
                                   input_shape=(-1, caption_input_shape[0], rnn_count),
                                   name='output-sizing')(rnn_outputs)
    
    return Model(inputs=[feature_in, caption_in], 
                 outputs=[caption_out], 
                 name='RNN-captioner-nomask-%dx%d' % (levels, rnn_count))

CNN with Dilations Model ("DeepMind")


In [21]:
def CNN_captioner(cnn_channels, 
                  caption_input_shape=None, feature_shape=None, caption_output_shape=None, 
                  layout=0):
    feature_in = Input(shape=feature_shape, dtype='float32', name='feature_input')
    
    feature_resize = Dense(cnn_channels, name='FeatureProjection')(feature_in) 
    feature_everywhere = RepeatVector(caption_input_shape[0], name='RepeatedFeatures')(feature_resize)
    
    caption_in = Input(shape=caption_input_shape, dtype='float32', name='caption_input')

    if layout==0 or layout==1:
        caption_resize = Dense(cnn_channels)(caption_in) ## Is this right?  time-dependency? (Keras 2 Dense acts on the last axis only, so this is per-timestep)
        
        #caption_features = Concatenate( [caption_resize, feature_resize], axis=3) # several channels

        # So the image feature becomes a bias for all the first layer cnn inputs
        caption_features = Add()( [feature_everywhere, caption_resize] )
        cnn = Activation('relu')( caption_features )
        
    if layout in (2, 3, 4, 5):
        caption_cnn = Conv1D(2*cnn_channels, 1, padding='causal', dilation_rate=1, activation='relu')(caption_in)

        # So the image features can be added into the embedding as required
        caption_features = Concatenate()( [feature_everywhere, caption_cnn] )
        cnn = ( caption_features )  # Don't do any relu transform at this point

    if layout==6:
        feature_everywhere_full = RepeatVector(caption_input_shape[0], name='RepeatedFeatures')(feature_in)
        
        caption_cnn = Conv1D(2*cnn_channels, 1, padding='causal', dilation_rate=1, activation='relu')(caption_in)

        # So the image features can be added into the embedding as required
        caption_features = Concatenate()( [feature_everywhere_full, caption_cnn] )
        cnn = ( caption_features )  # Don't do any relu transform at this point
        
    if layout in (0, 1, 2, 3, 4, 5, 6):
        # The CNN layers
        # Conv1D(filters, kernel_size, strides=1, padding='valid', dilation_rate=1, 
        #        activation=None, use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', 

        cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)

        if layout==3 or layout==4:
            # Gated Linear Units
            cnn_gate = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='tanh')(cnn)
            cnn = Multiply()( [cnn, cnn_gate] )
        
        if layout==1 or layout==4:
            cnn = Dropout(0.5)(cnn)
            
        if layout==4 or layout==5:
            cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)
            cnn = BatchNormalization(scale=False)(cnn)
            
        cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=2, activation='relu')(cnn)
        
        if layout==1 or layout==4:
            cnn = Dropout(0.5)(cnn)
            
        cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=4, activation='relu')(cnn)
        
        if layout==1 or layout==4:
            cnn = Dropout(0.5)(cnn)
            
        cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=8, activation='relu')(cnn)
        cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=16, activation='relu')(cnn)
        cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)
        
        if layout==4 or layout==5:
            cnn = BatchNormalization(scale=False)(cnn)
        
        #if layout==5:
        #    # Gated Linear Units
        #    cnn_gate = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='tanh')(cnn)
        #    cnn = Multiply()( [cnn, cnn_gate] )
        
        if layout==3 or layout==4:
            cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=2, activation='relu')(cnn)
            cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)

        if layout==5:            
            cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=2, activation='relu')(cnn)
            cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=4, activation='relu')(cnn)
            cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=8, activation='relu')(cnn)
            cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=16, activation='relu')(cnn)
            cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)
            
        cnn_outputs = cnn

    caption_out = TimeDistributed( Dense(caption_output_shape[1], activation='linear'), # activation='softmax'
                                   input_shape=(-1, caption_input_shape[0], cnn_channels),
                                   name='output-sizing')(cnn_outputs)
    
    return Model(inputs=[feature_in, caption_in], 
                 outputs=[caption_out], 
                 name='CNN-captioner-%dx%d' % (layout, cnn_channels))

CNN with Gated Linear Units ("Facebook")

The "Convolutional Sequence to Sequence Learning" paper https://arxiv.org/abs/1705.03122 was released about 6 weeks before this notebook was originally presented.
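Each gated block computes GLU(x) = convA(x) ⊗ σ(convB(x)) - a linear convolution multiplied element-wise by a sigmoid-activated 'gate' convolution - optionally followed by a residual skip back to the block input and batch-normalisation (see CNNGLU_layer below).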


In [22]:
def CNNGLU_layer(cnn_channels, k, dilation_rate=1, residual=True, batch_norm=True):
    # for original paper, see : https://arxiv.org/abs/1612.08083
    def layer(cnn):
        cnnA = Conv1D(cnn_channels, k, padding='causal', dilation_rate=dilation_rate, activation='linear')(cnn)
        cnnB = Conv1D(cnn_channels, k, padding='causal', dilation_rate=dilation_rate, activation='sigmoid')(cnn)
        cnnOut = Multiply()( [cnnA, cnnB] )
        if residual:
            cnnOut = Add()([cnn, cnnOut])  # This is the residual skip
        if batch_norm:
            cnnOut = BatchNormalization()(cnnOut)
        return cnnOut
    return layer

def CNNGLU_captioner(cnn_channels, 
                     caption_input_shape=None, feature_shape=None, caption_output_shape=None, 
                     layout=0):
    feature_in = Input(shape=feature_shape, dtype='float32', name='feature_input')
    feature_everywhere_full = RepeatVector(caption_input_shape[0], name='RepeatedFeatures')(feature_in)
    
    caption_in = Input(shape=caption_input_shape, dtype='float32', name='caption_input')
    caption_cnn = Conv1D(4*cnn_channels, 1, padding='causal', dilation_rate=1, activation='relu')(caption_in)

    # So the image features can be added into the embedding as required
    caption_features = Concatenate()( [feature_everywhere_full, caption_cnn] )
    cnn = Conv1D(cnn_channels, 1, padding='causal', dilation_rate=1, activation='relu')( caption_features )

    if layout==7:
        cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=1)(cnn)
        cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=2)(cnn)
        cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=4)(cnn)
        cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=8)(cnn)
        cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=16)(cnn)

        cnn_outputs = cnn

    caption_out = TimeDistributed( Dense(caption_output_shape[1], activation='linear'), # activation='softmax'
                                   input_shape=(-1, caption_input_shape[0], cnn_channels),
                                   name='output-sizing')(cnn_outputs)
    
    return Model(inputs=[feature_in, caption_in], 
                 outputs=[caption_out], 
                 name='CNNGLU-captioner-%dx%d' % (layout, cnn_channels))

"Attention is All You Need" ("Google")

The "Attention is All You Need" paper (https://arxiv.org/abs/1706.03762) was released 1 week before this notebook was originally presented, so, as far as I know, this was the first Keras implementation of that innovative model.
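The core operation in the AttentionLayer below is scaled dot-product attention, Attention(Q, K, V) = softmax(Q·Kᵀ / sqrt(d_k))·V, computed for one or more heads in parallel (concatenated if there are several); a mask of roughly -infinity is added before the softmax so that the decoder cannot attend to future caption positions.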


In [23]:
def ResidualAndNormLayer(name):
    def layer(before, after):
        out = Add(name=name+'res')([before, after])      # This is the residual step
        return BatchNormalization(name=name+'bn')(out)
    return layer

def AttentionLayer(name, n_heads=4, key_widths=32, value_width=50, mask=None):
    def layer(q_var, k_var, v_var):
        sqrt_dk = np.sqrt(float(key_widths))
        v_contrib = []
        for i in range(n_heads):
            n="%s%i" % (name, i)
            #print(i, key_widths, value_width, q_var.shape)
            q_head = TimeDistributed( Dense(key_widths), name=n+"Q")(q_var) # shape ~ (?, time_step, key_width)
            k_head = TimeDistributed( Dense(key_widths), name=n+"K")(k_var) # shape ~ (?, time_step, key_width)

            #   We need to do this across time, so the dot products pile up each location
            k_head_transpose = Permute( (2, 1), name=n+"K.T")(k_head)  # shape ~ (?, key_width, time_step)

            # Give each position an array with corresponds to amount it 'wants' each other location
            fit_head = Dot([2, 1], normalize=False, name=n+"match")( [q_head, k_head_transpose] )

            # Scale this down to compensate for adding together lots of N()*N()
            fit_head_scaled = Lambda(lambda x: x/sqrt_dk, name=n+"scaled")(fit_head)

            fit_head_masked = fit_head_scaled
            if mask is not None:
                # Mask out backwards (in time) flowing information (this is the decoder), by adding '-100.'
                #fit_head_masked = K.clip(fit_head_masked, -20., +20.)  # set bounds so -100. is ~ -inf
                fit_head_masked = Lambda(lambda x: K.clip(x, -20., +20.))(fit_head_masked)  # set bounds so -100. is ~ -inf
                fit_head_masked = Add()( [fit_head_masked, mask] )

            # Give each position an array with corresponds to 'probability' it 'wants' each other location
            max_head = TimeDistributed( Activation(K.softmax), name=n+"SoftMax" ) ( fit_head_masked )  

            # Now the value side
            v_head = TimeDistributed( Dense(value_width), name=n+"V" )(v_var) 

            # Combine the key/query match softmax output with the values
            value_head = Dot([2, 1], normalize=False, name=n+"Vcontrib")( [max_head, v_head] )

            v_contrib.append(value_head)

        if n_heads>1:
            v_return = Concatenate(name=n+"Concat")( v_contrib )
        else:
            v_return = v_contrib[0]
        return v_return
    return layer


def AIAYN_captioner(internal_width, 
                    caption_input_shape=None, feature_shape=None, caption_output_shape=None,
                    layout=0):
    n_time_steps = caption_input_shape[0]
    
    feature_in = Input(shape=feature_shape, dtype='float32', name='feature_input')
    features_everywhere = RepeatVector(n_time_steps, name='RepeatedFeatures')(feature_in)
    
    caption_in = Input(shape=caption_input_shape, dtype='float32', name='caption_input')

    # Make the caption embedding fit the 'internal_width' 
    x = Dense(internal_width, name='InitialDense')(caption_in)
    
    def BatchNumpyConst(np_var):  # (needs a 'bogus input' for batch_sizing...)
        # Need to preserve batch-sizes
        def output_of_lambda(input_shape):
            return (input_shape[0], np_var.shape[0], np_var.shape[1])
        
        #  This is 1x(what we want), but the 'output_shape' causes broadcasting
        return Lambda(lambda x: K.constant(np_var[None]), output_shape=output_of_lambda)( caption_in )
    
    # Add the 'clocks'
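    #   (a sinusoidal positional-encoding-style signal; the exact frequencies differ from the paper's PE terms)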
    def ClockVar(width):
        clock_theta  = np.arange( 0, 1.0, 2.0/n_time_steps) * np.pi
        clock_offset = (2. + np.arange( 0, width ))
        clock_half   = np.outer( clock_theta, clock_offset ) 
        clock_const  = np.vstack( [ np.cos(clock_half), np.sin(clock_half) ] ).astype('float32')
        return BatchNumpyConst(clock_const)

    features = features_everywhere
    if True:
        features_with_clocks = Add(name='FeatWithClocks')( [features_everywhere, ClockVar(feature_shape[0])] )
        features = features_with_clocks
    
    x = Add(name='WithClocks')( [x, ClockVar(internal_width) ] )
    
    # create a mask to prevent backwards-in-time message passing during training
    mask_timewise = np.tri(n_time_steps, n_time_steps, -1).T * -100.0  # ~ -infinity
    mask_const    = BatchNumpyConst(mask_timewise)
    
    n_heads = 1
    if layout==3 or layout==4: n_heads=4
    params = dict( n_heads=n_heads, key_widths=32, value_width=internal_width//n_heads )
    
    n_layers = 1
    if layout==4: n_layers=2
    for i in range(n_layers):
        if layout==1:  # Before the 'image features' - as in the paper
            # Attention layer looking over the previous words in caption
            x_attend = AttentionLayer('L%d-C-'%i, mask=mask_const, **params)(x, x, x)
            x = ResidualAndNormLayer('L%d-CN-'%i)(x, x_attend)
            
        if True:
            # Attention layer looking over the features of the image
            x_attend = AttentionLayer('L%d-I-'%i, mask=None, **params)(x, features, features)
            x = ResidualAndNormLayer('L%d-IN-'%i)(x, x_attend)

        if layout==2 or layout==3 or layout==4: # After seeing some features (more variety for first word)
            # Attention layer looking over the previous words in caption
            x_attend = AttentionLayer('L%d-C-'%i, mask=mask_const, **params)(x, x, x)
            x = ResidualAndNormLayer('L%d-CN-'%i)(x, x_attend)

        # Dense Feed-Forward Network "FFN"
        x_ff = Dense(2*internal_width, name='FF%d-1'%i, activation='relu')(x)
        x_ff = Dense(  internal_width, name='FF%d-2'%i, activation='linear')(x_ff)
        x = ResidualAndNormLayer('FF%d-N'%i)(x, x_ff)

    caption_out = TimeDistributed( Dense(caption_output_shape[1], activation='linear'), # activation='softmax'
                                   input_shape=(-1, caption_input_shape[0], internal_width),
                                   name='output-sizing')(x)
    
    return Model(inputs=[feature_in, caption_in], 
                 outputs=[caption_out], 
                 name='AIAYN-captioner-%dx%d' % (layout, internal_width))

Define the embedding to use


In [24]:
emb_input, emb_output = RepresentAs_OneHotBasePlusEmbedding, RepresentAs_FullOneHot
#emb_input, emb_output = RepresentAs_OneHotBasePlusEmbedding, RepresentAs_OneHotBasePlusEmbedding

And choose the specific model


In [25]:
model_choice = "xRNN CNN xCNNGLU xAIAYN".split()  # prefix a name with 'x' to disable it : only the CNN model is built here

if 'RNN' in model_choice:
    model = RNN_captioner(200, 
                      caption_input_shape=(CAPTION_LEN, emb_input.width), 
                      feature_shape=(feature_arr.shape[1],),
                      caption_output_shape=(CAPTION_LEN, emb_output.width),
                      levels=2,
                     )

if 'CNN' in model_choice:
    model = CNN_captioner(200, 
                      caption_input_shape=(CAPTION_LEN, emb_input.width), 
                      feature_shape=(feature_arr.shape[1],),
                      caption_output_shape=(CAPTION_LEN, emb_output.width),
                      layout=5,
                     )

if 'CNNGLU' in model_choice:
    model = CNNGLU_captioner(200, 
                      caption_input_shape=(CAPTION_LEN, emb_input.width), 
                      feature_shape=(feature_arr.shape[1],),
                      caption_output_shape=(CAPTION_LEN, emb_output.width),
                      layout=7,
                     )

if 'AIAYN' in model_choice:
    model = AIAYN_captioner(200, 
                      caption_input_shape=(CAPTION_LEN, emb_input.width), 
                      feature_shape=(feature_arr.shape[1],),
                      caption_output_shape=(CAPTION_LEN, emb_output.width),
                      layout=4,
                     )

model.summary()

In [26]:
#model.compile(loss=emb_output.loss_fn, optimizer=RMSprop(lr=0.0001, clipnorm=1.))
model.compile(loss=emb_output.loss_fn, optimizer=Adam())

# Idea : Change learning rates via callbacks : https://github.com/fchollet/keras/issues/2823
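
# A minimal sketch of that idea (the halve-every-10-epochs schedule is made up for illustration,
#   not part of the original run) :
from keras.callbacks import LearningRateScheduler

def lr_schedule(epoch, base_lr=0.001):
    # hypothetical schedule : halve the learning rate every 10 epochs
    return base_lr * (0.5 ** (epoch // 10))

# then pass callbacks=[LearningRateScheduler(lr_schedule)] to model.fit_generator(...) below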

Code for testing a model


In [27]:
def sample_caption(model, features):
    # Run the model step-by-step, performing a decode, and re-inputting the result into the inputs...
    caption_arr, prob_tot=[], 0  # initially empty
    for i in range(CAPTION_LEN):
        caption_idx = caption_to_idx_arr(' '.join(caption_arr))
        caption_emb_in = emb_input.encode(caption_idx[:-1])
        
        # Need to make features and caption_emb_in into length-1 batches
        caption_emb_out = model.predict_on_batch( [ features[np.newaxis], caption_emb_in[np.newaxis] ] )
        
        # pick out the i-th output, and add the decoded word onto the caption built so far
        caption_word, caption_prob = emb_output.decode(caption_emb_out[0][i])
        
        #print("%.2f %4d %s" % (caption_prob, dictionary[caption_word], caption_word,))
        
        if caption_word=='{STOP}': break
        caption_arr.append(caption_word)
        prob_tot += caption_prob  # I know this isn't correct - just a temp value
        
    return ' '.join(caption_arr), prob_tot

In [28]:
def model_test(model, img_idx, img_path='./data/Flickr30k/flickr30k-images/'):
    # first, let's just show the image, and the existing captions
    
    img_name = text_data['img_arr'][img_idx]
    if True:
        is_training = text_data['train_test'][img_idx]
        captions = text_data['img_to_captions'][img_name]

        print("Image is in %s set" % ('TRAINING' if is_training<TRAIN_PCT else 'TEST',) )
        for caption in captions:
            print("  * %s" % (caption, ))
    
    if True:
        img_filepath = os.path.join(img_path, img_name)
        img_data = plt.imread(img_filepath)

        plt.figure()
        plt.imshow(img_data)
        plt.axis('off')
        plt.show()

    features = feature_arr[ image_feature_idx[img_name] ] 
    for _ in range(5):
        caption, prob = sample_caption(model, features)
        print("  * %s" % (caption,))

# Find some TEST images
#print( [ "%d:%.2f" % (i,t) for i,t in enumerate(text_data['train_test'][0:100]) if t>TRAIN_PCT] )
print( ', '.join([ "%d" % (i) for i,t in enumerate(text_data['train_test'][0:200]) if t>TRAIN_PCT] ), "\n")

model_test(model, 70)  # Uninitialised model is *terrible*

Training time!


In [29]:
if False:
    batch_gen = caption_training_batch_generator(emb_input, emb_output, batch_size=BATCH_SIZE)
    X,Y = next(batch_gen)

    model.train_on_batch(X, Y)

In [30]:
weights_filename = './data/cache/%s_%s_%s_%%04d.h5' % (model.name, emb_input.name, emb_output.name)
weights_filename

In [31]:
epoch = 0
#epoch = 50   # Uncomment to force the search below to start from a specific epoch
while os.path.isfile(weights_filename % (epoch,)):
    epoch += 1

if epoch>0:  # i.e. we found something
    model.load_weights(weights_filename % (epoch-1,))  # Go back one step
    print("Loaded weights from previously saved epoch %d" % (epoch-1,))

In [ ]:
for _ in range(50):
    model.fit_generator(caption_training_batch_generator(emb_input, emb_output, batch_size=BATCH_SIZE), 
                        len(caption_arr_train)//BATCH_SIZE, epochs=epoch+1, initial_epoch=epoch)  # steps_per_epoch must be an int
    model.save_weights(weights_filename % (epoch,))
    epoch += 1

In [ ]:
#raise("Intentional error to stop execution flow")

Test the current model


In [34]:
model_test(model, 2170)  # 70

In [ ]:
import nltk

def bleu_score(model, img_idx, img_path='./data/Flickr30k/flickr30k-images/'):
    img_name = text_data['img_arr'][img_idx]
    captions_real = text_data['img_to_captions'][img_name]

    features = feature_arr[ image_feature_idx[img_name] ] 
    for _ in range(5):
        caption, prob = sample_caption(model, features)
        score = nltk.translate.bleu_score.sentence_bleu(
            [ c.split(' ') for c in captions_real ], 
            caption.split(' '),
        )
        print("  * %.2f : %s" % (score, caption,))
    
bleu_score(model, 70)
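
# As a rough sketch of the 'final score for test cases' mentioned at the top, sentence-level BLEU
#   can be averaged over a sample of held-out images (mean_bleu is a hypothetical helper, not part
#   of the original run) :
test_img_idxs = [ i for i, t in enumerate(text_data['train_test']) if t >= TRAIN_PCT ][:50]

def mean_bleu(model, img_idxs):
    scores = []
    for img_idx in img_idxs:
        img_name = text_data['img_arr'][img_idx]
        references = [ c.split(' ') for c in text_data['img_to_captions'][img_name] ]
        features = feature_arr[ image_feature_idx[img_name] ]
        caption, _ = sample_caption(model, features)
        scores.append( nltk.translate.bleu_score.sentence_bleu(references, caption.split(' ')) )
    return np.mean(scores)

#print("Mean BLEU over %d TEST images : %.3f" % (len(test_img_idxs), mean_bleu(model, test_img_idxs)))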

In [ ]: