This notebook runs the gamut from very basic to very recent models:
In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
import pickle
TRAIN_PCT=0.9
In [2]:
# Load in the captions/corpus/embedding
with open('./data/cache/CAPTIONS_data_Flickr30k_2017-06-07_23-15.pkl', 'rb') as f:
text_data=pickle.load(f, encoding='iso-8859-1')
"""
text_data ~ dict(
img_to_captions = img_to_valid_captions,
action_words = action_words,
stop_words = stop_words_sorted,
embedding = embedding,
embedding_word_arr = embedding_word_arr,
img_arr = img_arr_save,
train_test = np.random.random( (len(img_arr_save),) ),
)"""
embedding = text_data['embedding']
embedding_eps = 0.00001
embedding_normed = embedding / np.maximum( np.linalg.norm(embedding, axis=1, keepdims=True), embedding_eps)
vocab_arr = text_data['embedding_word_arr']
dictionary = { w:i for i,w in enumerate(vocab_arr) }
img_arr_train = [ img for i, img in enumerate(text_data['img_arr']) if text_data['train_test'][i]<TRAIN_PCT ]
caption_arr_train = [ (img, caption) for img in img_arr_train for caption in text_data['img_to_captions'][img] ]
print("Loaded captions, corpus and embedding")
In [3]:
# Load in the features
with open('./data/cache/FEATURES_data_Flickr30k_flickr30k-images_2017-06-06_18-07.pkl', 'rb') as f:
image_data=pickle.load(f, encoding='iso-8859-1')
"""
image_data ~ dict(
features = features,
img_arr = img_arr,
)
"""
feature_arr = image_data['features']
image_feature_idx = { img:idx for idx, img in enumerate(image_data['img_arr']) }
print("Loaded dim(%d) image features for %d images" % (feature_arr.shape[1], feature_arr.shape[0]))
In [4]:
CAPTION_LEN = 32
EMBEDDING_DIM = embedding.shape[1]
VOCAB_SIZE = len(vocab_arr)
LOG2_VOCAB_SIZE = 13 # 1024->10, 8192->13
if not (2**LOG2_VOCAB_SIZE/2) < VOCAB_SIZE < 2**LOG2_VOCAB_SIZE:
print("LOG2_VOCAB_SIZE incorrect")
In [5]:
def caption_to_idx_arr(caption): # This is actually 1 longer than CAPTION_LEN - need to shift about a bit later
ret = np.zeros( (CAPTION_LEN+1,), dtype='int32') # {MASK}.idx===0
i=0
ret[i] = dictionary['{START}']
#print(len(caption.split()), caption)
for w in caption.lower().split():
i += 1
ret[i] = dictionary.get(w, dictionary['{UNK}'])
ret[i+1] = dictionary['{STOP}']
return ret
In [6]:
#for j in range(0,10):
# print(j)
#print(j)
In [7]:
# This is re-done below, since it is better to sample over the full range of captions
# rather than picking randomly from within shuffled images
def caption_training_example():
img_arr = img_arr_train
while True:
random.shuffle( img_arr )
for img in img_arr:
captions = text_data['img_to_captions'][img]
caption = random.choice(captions)
print(caption)
yield image_feature_idx[ img ], caption_to_idx_arr( caption )
print("Captions : Looping")
caption_training_example_gen = caption_training_example()
In [8]:
next(caption_training_example_gen)
In [9]:
# import tensorflow.contrib.keras
# import tensorflow.contrib.keras.backend as K
# from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical
# from tensorflow.contrib.keras.api.keras.losses import cosine_proximity, categorical_crossentropy, mean_squared_error
# from tensorflow.contrib.keras.api.keras.activations import softmax, sigmoid
# from tensorflow.contrib.keras.api.keras.layers import Input, Masking, Dense, GRU
# from tensorflow.contrib.keras.api.keras.layers import Activation, Conv1D, Dropout, BatchNormalization
# from tensorflow.contrib.keras.api.keras.layers import RepeatVector, Concatenate, Add, Multiply
# from tensorflow.contrib.keras.api.keras.layers import Permute, Reshape, Dot, Lambda
# from tensorflow.contrib.keras.api.keras.optimizers import RMSprop, Adam
# from tensorflow.contrib.keras.api.keras.models import Model
# from tensorflow.contrib.keras.python.keras.layers import TimeDistributed
# This needs keras 2.0.4 for RNN initial_state fixes. tf.contrib.keras LAGS BADLY as of 2017-06-20
import keras
import keras.backend as K
from keras.utils.np_utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy, mean_squared_error
from keras.activations import softmax, sigmoid
from keras.layers import Input, Masking, Dense, GRU
from keras.layers import Activation, Conv1D, Dropout, BatchNormalization
from keras.layers import RepeatVector, Concatenate, Add, Multiply
from keras.layers import Permute, Reshape, Dot, Lambda
from keras.optimizers import RMSprop, Adam
from keras.models import Model
from keras.layers import TimeDistributed
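As noted in the comment above, the RNN initial_state handling needs Keras 2.0.4 or later. A minimal sanity check (assuming keras.__version__ is a plain 'major.minor.patch' string):
In [ ]:
# Guard against an older Keras where GRU(...)(x, initial_state=...) is not supported
_keras_version = tuple(int(v) for v in keras.__version__.split('.')[:3])
assert _keras_version >= (2, 0, 4), "Need Keras >= 2.0.4, found %s" % keras.__version__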
In [10]:
def pick_close_embedding_idx_FIXED_SET(network_output, out_of_top=8):
# cosine distance over whole embedding space
emb_normed = network_output/np.maximum( np.linalg.norm(network_output), embedding_eps)
scores = embedding_normed.dot(emb_normed)
# Find indexes of the top==last few: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argpartition.html
top_idx = np.argpartition(scores, -out_of_top)[-out_of_top:]
top_val = np.maximum( scores[top_idx], embedding_eps)
lo, hi = top_val.min(), top_val.max()
#top_prob = top_val/np.sum(top_val)
top_scaled = (top_val-lo)/(hi-lo)
top_prob = top_scaled/np.sum(top_scaled)
print([ (top_val[i], top_prob[i], vocab_arr[top_idx[i]]) for i in range(out_of_top)])
top_choice = np.random.choice(out_of_top, p=top_prob )
return top_idx[ top_choice ], top_prob[ top_choice ]
def pick_close_embedding_idx(network_output, expand_by=1.1, debug=False):
# cosine distance over whole embedding space
emb_normed = network_output/np.maximum( np.linalg.norm(network_output), embedding_eps)
scores = embedding_normed.dot(emb_normed)
# Find index(es) of the best matches
best_idx = np.argmax(scores)
top_hurdle = 1.0 - (1.0 - scores[best_idx])*expand_by # Expand the margin away from perfect(==1.0)
# Now look at all of the scores that are above the hurdle (expand from the best a bit)
top_idx, = np.nonzero(scores > (top_hurdle-embedding_eps))
n_top = top_idx.shape[0]
top_val = np.maximum( scores[top_idx], embedding_eps)
#lo, hi = top_val.min(), top_val.max()
top_prob = top_val/np.sum(top_val)
#top_scaled = (top_val-lo)/(hi-lo)
#top_prob = top_scaled/np.sum(top_scaled)
if debug: print([ (top_val[i], top_prob[i], vocab_arr[top_idx[i]]) for i in range(n_top)])
top_choice = np.random.choice(n_top, p=top_prob )
return top_idx[ top_choice ], top_prob[ top_choice ]
In [11]:
if True: # Test out the embedding decoder
#vec = embedding[ dictionary['to'] ]
#vec = embedding[ dictionary['queen'] ]
vec = embedding[ dictionary['queen'] ] + embedding[ dictionary['king'] ]
for i in range(10):
idx,prob = pick_close_embedding_idx(vec, debug=i==0)
print(" %5.2f : %s" % (prob, vocab_arr[idx], ))
In [12]:
def pick_softmax_idx(network_output):
e_x = np.exp(network_output - np.max(network_output))
smx = e_x / e_x.sum() # softmax of the array, better conditioned by the line above
top_idx = np.random.choice(smx.shape[0], p=smx)
return top_idx, smx[top_idx]
In [13]:
class RepresentAs_FullEmbedding():
name = 'FullEmbedding'
width = EMBEDDING_DIM
def encode(caption_arr):
# plain embedding of each symbol
return embedding[ caption_arr, : ]
def loss_fn(ideal_output, network_output): # y_true, y_pred
return cosine_proximity( ideal_output, network_output )
def decode(network_output):
#return '{UNK}', 1.0 # placeholder (word and confidence level)
idx, prob = pick_close_embedding_idx(network_output)
return vocab_arr[ idx ], prob
class RepresentAs_FullOneHot():
name = 'FullOneHot'
width = VOCAB_SIZE
def encode(caption_arr):
# Output desired is one-hot of each symbol (cross entropy match whole thing)
return to_categorical(caption_arr, num_classes=VOCAB_SIZE)
def loss_fn(ideal_output, network_output): # y_true, y_pred
smx = softmax(network_output, axis=-1)
return categorical_crossentropy( ideal_output, smx )
def decode(network_output):
idx, prob = pick_softmax_idx(network_output)
return vocab_arr[ idx ], prob
In [14]:
base_width = len(text_data['action_words']) + len(text_data['stop_words'])
MASK_idx = dictionary['{MASK}'] # ==0
EXTRA_idx = dictionary['{EXTRA}']
UNK_idx = dictionary['{UNK}']
def OneHotBasePlus(arr): # Contains indexing magic
# Arrangement should be :: (samples, timesteps, features),
one_hot_base_plus = np.zeros( (CAPTION_LEN, base_width), dtype='float32')
# Set the indicator for entries not in action or stop words
one_hot_base_plus[ arr>=base_width, EXTRA_idx ] = 1.0
# Set the one-hot for everything in the one-hot region
one_hot_base_plus[ arr< base_width, arr[np.where(arr<base_width)] ] = 1.0
# Force masked values to all-zeros
one_hot_base_plus[ arr==0, MASK_idx ] = 0.0
return one_hot_base_plus
class RepresentAs_OneHotBasePlusEmbedding():
name = 'OneHotBasePlusEmbedding'
width = base_width + EMBEDDING_DIM
def encode(caption_arr):
# Input is onehot for first part, with the embedding included for all words too
return np.hstack( [ OneHotBasePlus(caption_arr), embedding[caption_arr] ] )
def loss_fn(ideal_output, network_output): # y_true, y_pred
# One-hot of each action symbol and stop words (cross entropy match these) and
# RMSE on remaining embedding (weighted according to onehot[{EXTRA}]~0...1)
#print("ideal_output.shape", ideal_output.shape) # ideal_output.shape (?, ?, ?)
#print("network_output.shape", network_output.shape) # network_output.shape (?, 32, 191)
# Perhaps need this idea https://github.com/fchollet/keras/issues/890:
smx = softmax(network_output[:, :, :base_width], axis=-1)
#print("smx.shape", smx.shape) # smx.shape (?, 32, 141)
is_extra = smx[:, :, EXTRA_idx]
one_hot_loss = categorical_crossentropy( ideal_output[:, :, :base_width], smx )
embedding_loss = cosine_proximity( ideal_output[:, :, base_width:],
network_output[:, :, base_width:] )
return (1.-is_extra)*one_hot_loss + (is_extra)*embedding_loss
def decode(network_output):
idx, prob = pick_softmax_idx(network_output[:base_width])
if idx==EXTRA_idx:
idx, prob2 = pick_close_embedding_idx(network_output[base_width:])
prob *= prob2
return vocab_arr[ idx ], prob
POWERS_OF_2 = 2**np.arange(LOG2_VOCAB_SIZE)
class RepresentAs_OneHotBasePlusBinaryIdx():
name = 'OneHotBasePlusBinaryIdx'
width = base_width + 3*LOG2_VOCAB_SIZE
def encode(caption_arr):
# Input is onehot for first part, with 3 copies of the binary index of all words afterwards
# Idea is from : https://arxiv.org/abs/1704.06918
# Thanks to : https://stackoverflow.com/questions/21918267/
# convert-decimal-range-to-numpy-array-with-each-bit-being-an-array-element
binary = (caption_arr[:, np.newaxis] & POWERS_OF_2) / POWERS_OF_2
binary -= 0.5 # symmetrical around 0
return np.hstack( [ OneHotBasePlus(caption_arr), binary, binary, binary ] )
def loss_fn(ideal_output, network_output): # y_true, y_pred
# Tensors here are (batch, timesteps, features), so slice along the last (feature) axis
smx = softmax(network_output[:, :, :base_width], axis=-1)
sig = sigmoid(network_output[:, :, base_width:])
is_extra = smx[:, :, EXTRA_idx]
one_hot_loss = categorical_crossentropy( ideal_output[:, :, :base_width], smx )
#binary_loss = categorical_crossentropy( ideal_output[:, :, base_width:], sig )
binary_loss = mean_squared_error( ideal_output[:, :, base_width:], sig ) # reported better in paper
return (1.-is_extra)*one_hot_loss + (is_extra)*binary_loss
def decode(network_output, debug=False):
idx, prob = pick_softmax_idx(network_output[:base_width])
if idx==EXTRA_idx:
sig = 1./(1. + np.exp(-network_output[base_width:] ))
#print(sig)
binary = np.mean( sig.reshape( (3, LOG2_VOCAB_SIZE) ), axis=0 )>0.5
if debug:
print( sig )
#print( sig.shape )
print( sig.reshape( (3, LOG2_VOCAB_SIZE) ) )
# TODO : more work to do something (a) stochastic, and (b) give 'prob' measure
idx = POWERS_OF_2.dot(binary>0.5)
if idx>=VOCAB_SIZE: idx = UNK_idx # decoded binary index can exceed the vocabulary
return vocab_arr[ idx ], prob
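A quick illustration (not part of the original pipeline) of the +/-0.5 binary coding used above; 'queen' is just a convenient probe word:
In [ ]:
# Show how a word index becomes LOG2_VOCAB_SIZE values in {-0.5, +0.5}, and is recovered again
probe_idx = dictionary['queen']
bits = ((probe_idx & POWERS_OF_2) / POWERS_OF_2) - 0.5
print(probe_idx, bits)
print(int(POWERS_OF_2.dot(bits > 0.)))   # recovers probe_idx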
In [15]:
#io = RepresentAs_FullEmbedding # .encode : 32, 50
#io = RepresentAs_FullOneHot # .encode : 32, 6946
io = RepresentAs_OneHotBasePlusEmbedding # .encode : 32, 191
#io = RepresentAs_OneHotBasePlusBinaryIdx # .encode : 32, 180
io.width
caption_sample = 'The cat sat on the mat .'
caption_sample_idx = caption_to_idx_arr(caption_sample)
caption_sample_idx # array([ 2, 8, 1461, 2496, 11, 8, 998, 5, 3, 0, 0...
onehot_start=range(0,12)
x=OneHotBasePlus(caption_sample_idx[:-1]) # one-hot-plus-indicator block for every time-step
#x
#x[onehot_start, dictionary['{MASK}']]
#x[onehot_start, dictionary['{START}']]
#x[onehot_start, EXTRA_idx]
#x[onehot_start, dictionary['{STOP}']]
#x[onehot_start, dictionary['on']]
#x.shape # 32, 141
#embedding[caption_sample_idx[:-1]].shape # 32, 50
if False:
powers_of_two = 2**np.arange(LOG2_VOCAB_SIZE)
(caption_sample_idx[:, np.newaxis] & powers_of_two) / powers_of_two
io.encode( caption_sample_idx[:-1] ).shape # [0:6,:]
In [16]:
# uses io as defined above
per_caption = CAPTION_LEN//2
for i in range(0, len(vocab_arr), per_caption):
caption_sample = ' '.join( vocab_arr[i : i+per_caption ] )
caption_sample_idx = caption_to_idx_arr(caption_sample)
print("%4d : %s " % (i,caption_sample,))
x=io.encode(caption_sample_idx[:-1]) # This is now an embedding
#print(x.shape)
#print( x[1, -LOG2_VOCAB_SIZE:]+0.5)
x *= 12.0 # Scaling won't affect the cosine-based embedding decode, but sharpens the softmax part
caption_restore = ' '.join([ io.decode(x[i+1])[0] for i in range(per_caption)]) # , debug=i==0
print(" %s " % (caption_restore,))
if i>250: break
In [17]:
#BATCH_SIZE=16 # Titan-X occupancy ~ 37%, epoch ~ 450sec for GRUs
BATCH_SIZE=64 # Titan-X occupancy ~ 32%, epoch ~ 300sec for GRUs
In [18]:
def caption_example_generator():
while True:
random.shuffle( caption_arr_train )
for img, caption in caption_arr_train:
if len(caption.split())>CAPTION_LEN-2 : continue # Skip captions that are too long
yield img, caption
def caption_training_batch_generator(emb_input, emb_output, batch_size=BATCH_SIZE):
caption_example_gen = caption_example_generator()
while True:
X0, X1, Y = [],[], []
for _ in range(batch_size):
img, caption = next( caption_example_gen )
caption_idx = caption_to_idx_arr(caption)
X0.append( feature_arr[ image_feature_idx[img] ] )
X1.append( emb_input.encode(caption_idx[:-1]) )
Y .append( emb_output.encode(caption_idx[1:]) )
yield [np.array(X0), np.array(X1)], [np.array(Y)]
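A quick shape check on one generated batch is a useful sanity test before building any of the models below (this sketch just pulls a single small batch using the representations defined above):
In [ ]:
# Peek at one batch : [image_features, caption_inputs], [caption_targets]
_gen = caption_training_batch_generator(RepresentAs_OneHotBasePlusEmbedding, RepresentAs_FullOneHot, batch_size=4)
(_x0, _x1), (_y,) = next(_gen)
print(_x0.shape)   # (4, feature_arr.shape[1])
print(_x1.shape)   # (4, CAPTION_LEN, RepresentAs_OneHotBasePlusEmbedding.width)
print(_y.shape)    # (4, CAPTION_LEN, VOCAB_SIZE)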
In [19]:
caption_example_gen = caption_example_generator()
next(caption_example_gen)
In [20]:
def RNN_captioner(rnn_count,
caption_input_shape=None, feature_shape=None, caption_output_shape=None,
levels=1):
feature_in = Input(shape=feature_shape, dtype='float32', name='feature_input')
feature_downsize = Dense(rnn_count)(feature_in)
caption_in = Input(shape=caption_input_shape, dtype='float32', name='caption_input')
#masked = Masking(mask_value=0.)(caption_in) # , input_shape=caption_input_shape
masked = caption_in # Ignore the masking thing - this will be implicit in the scoring
# initial_state = feature_downsize ::
# See : https://github.com/fchollet/keras/issues/2995
# https://github.com/fchollet/keras/pull/3947 (closed, unmerged)
# Hidden in the code : https://github.com/fchollet/keras/blob/master/keras/layers/recurrent.py#L178
rnn = masked
# The recurrent layer
#rec1 = GRU(rnn_count, initial_state=feature_downsize, return_sequences=True)(rnn)
rnn = GRU(rnn_count, return_sequences=True)(rnn, initial_state=feature_downsize)
#rec1 = GRU(rnn_count, return_sequences=True)(rnn)
for _ in range(levels-1):
rnn = GRU(rnn_count, return_sequences=True)(rnn)
rnn_outputs = rnn
#print( (BATCH_SIZE, caption_input_shape[0], rnn_count) )
caption_out = TimeDistributed( Dense(caption_output_shape[1], activation='linear'), # activation='softmax'
input_shape=(-1, caption_input_shape[0], rnn_count),
name='output-sizing')(rnn_outputs)
return Model(inputs=[feature_in, caption_in],
outputs=[caption_out],
name='RNN-captioner-nomask-%dx%d' % (levels, rnn_count))
In [21]:
def CNN_captioner(cnn_channels,
caption_input_shape=None, feature_shape=None, caption_output_shape=None,
layout=0):
feature_in = Input(shape=feature_shape, dtype='float32', name='feature_input')
feature_resize = Dense(cnn_channels, name='FeatureProjection')(feature_in)
feature_everywhere = RepeatVector(caption_input_shape[0], name='RepeatedFeatures')(feature_resize)
caption_in = Input(shape=caption_input_shape, dtype='float32', name='caption_input')
if layout==0 or layout==1:
caption_resize = Dense(cnn_channels)(caption_in) ## Is this right? time-dependency?
#caption_features = Concatenate( [caption_resize, feature_resize], axis=3) # several channels
# So the image feature becomes a bias for all the first layer cnn inputs
caption_features = Add()( [feature_everywhere, caption_resize] )
cnn = Activation('relu')( caption_features )
if layout in (2, 3, 4, 5):
caption_cnn = Conv1D(2*cnn_channels, 1, padding='causal', dilation_rate=1, activation='relu')(caption_in)
# So the image features can be added into the embedding as required
caption_features = Concatenate()( [feature_everywhere, caption_cnn] )
cnn = ( caption_features ) # Don't do any relu transform at this point
if layout==6:
feature_everywhere_full = RepeatVector(caption_input_shape[0], name='RepeatedFeatures')(feature_in)
caption_cnn = Conv1D(2*cnn_channels, 1, padding='causal', dilation_rate=1, activation='relu')(caption_in)
# So the image features can be added into the embedding as required
caption_features = Concatenate()( [feature_everywhere_full, caption_cnn] )
cnn = ( caption_features ) # Don't do any relu transform at this point
if layout in (0, 1, 2, 3, 4, 5, 6):
# The CNN layers
# Conv1D(filters, kernel_size, strides=1, padding='valid', dilation_rate=1,
# activation=None, use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros',
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)
if layout==3 or layout==4:
# Gated Linear Units
cnn_gate = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='tanh')(cnn)
cnn = Multiply()( [cnn, cnn_gate] )
if layout==1 or layout==4:
cnn = Dropout(0.5)(cnn)
if layout==4 or layout==5:
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)
cnn = BatchNormalization(scale=False)(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=2, activation='relu')(cnn)
if layout==1 or layout==4:
cnn = Dropout(0.5)(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=4, activation='relu')(cnn)
if layout==1 or layout==4:
cnn = Dropout(0.5)(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=8, activation='relu')(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=16, activation='relu')(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)
if layout==4 or layout==5:
cnn = BatchNormalization(scale=False)(cnn)
#if layout==5:
# # Gated Linear Units
# cnn_gate = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='tanh')(cnn)
# cnn = Multiply()( [cnn, cnn_gate] )
if layout==3 or layout==4:
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=2, activation='relu')(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)
if layout==5:
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=2, activation='relu')(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=4, activation='relu')(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=8, activation='relu')(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=16, activation='relu')(cnn)
cnn = Conv1D(cnn_channels, 3, padding='causal', dilation_rate=1, activation='relu')(cnn)
cnn_outputs = cnn
caption_out = TimeDistributed( Dense(caption_output_shape[1], activation='linear'), # activation='softmax'
input_shape=(-1, caption_input_shape[0], cnn_channels),
name='output-sizing')(cnn_outputs)
return Model(inputs=[feature_in, caption_in],
outputs=[caption_out],
name='CNN-captioner-%dx%d' % (layout, cnn_channels))
The "Convolutional Sequence to Sequence Learning" paper https://arxiv.org/abs/1705.03122 was released 6 weeks ago.
In [22]:
def CNNGLU_layer(cnn_channels, k, dilation_rate=1, residual=True, batch_norm=True):
# for original paper, see : https://arxiv.org/abs/1612.08083
def layer(cnn):
cnnA = Conv1D(cnn_channels, k, padding='causal', dilation_rate=dilation_rate, activation='linear')(cnn)
cnnB = Conv1D(cnn_channels, k, padding='causal', dilation_rate=dilation_rate, activation='sigmoid')(cnn)
cnnOut = Multiply()( [cnnA, cnnB] )
if residual:
cnnOut = Add()([cnn, cnnOut]) # This is the residual skip
if batch_norm:
cnnOut = BatchNormalization()(cnnOut)
return cnnOut
return layer
def CNNGLU_captioner(cnn_channels,
caption_input_shape=None, feature_shape=None, caption_output_shape=None,
layout=0):
feature_in = Input(shape=feature_shape, dtype='float32', name='feature_input')
feature_everywhere_full = RepeatVector(caption_input_shape[0], name='RepeatedFeatures')(feature_in)
caption_in = Input(shape=caption_input_shape, dtype='float32', name='caption_input')
caption_cnn = Conv1D(4*cnn_channels, 1, padding='causal', dilation_rate=1, activation='relu')(caption_in)
# So the image features can be added into the embedding as required
caption_features = Concatenate()( [feature_everywhere_full, caption_cnn] )
cnn = Conv1D(cnn_channels, 1, padding='causal', dilation_rate=1, activation='relu')( caption_features )
if layout==7:
cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=1)(cnn)
cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=2)(cnn)
cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=4)(cnn)
cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=8)(cnn)
cnn = CNNGLU_layer(cnn_channels, 3, dilation_rate=16)(cnn)
cnn_outputs = cnn
caption_out = TimeDistributed( Dense(caption_output_shape[1], activation='linear'), # activation='softmax'
input_shape=(-1, caption_input_shape[0], cnn_channels),
name='output-sizing')(cnn_outputs)
return Model(inputs=[feature_in, caption_in],
outputs=[caption_out],
name='CNNGLU-captioner-%dx%d' % (layout, cnn_channels))
The "Attention is All You Need" paper : https://arxiv.org/abs/1706.03762 was released 1 week before this was originally presented. So, as far as I know, it was the first Keras version of this innovative model.
In [23]:
def ResidualAndNormLayer(name):
def layer(before, after):
out = Add(name=name+'res')([before, after]) # This is the residual step
return BatchNormalization(name=name+'bn')(out)
return layer
def AttentionLayer(name, n_heads=4, key_widths=32, value_width=50, mask=None):
def layer(q_var, k_var, v_var):
sqrt_dk = np.sqrt(float(key_widths))
v_contrib = []
for i in range(n_heads):
n="%s%i" % (name, i)
#print(i, key_widths, value_width, q_var.shape)
q_head = TimeDistributed( Dense(key_widths), name=n+"Q")(q_var) # shape ~ (?, time_step, key_width)
k_head = TimeDistributed( Dense(key_widths), name=n+"K")(k_var) # shape ~ (?, time_step, key_width)
# We need to do this across time, so the dot products pile up each location
k_head_transpose = Permute( (2, 1), name=n+"K.T")(k_head) # shape ~ (?, key_width, time_step)
# Give each position an array which corresponds to how much it 'wants' each other location
fit_head = Dot([2, 1], normalize=False, name=n+"match")( [q_head, k_head_transpose] )
# Scale this down to compensate for adding together lots of N()*N()
fit_head_scaled = Lambda(lambda x: x/sqrt_dk, name=n+"scaled")(fit_head)
fit_head_masked = fit_head_scaled
if mask is not None:
# Mask out backwards (in time) flowing information (this is the decoder), by adding '-100.'
#fit_head_masked = K.clip(fit_head_masked, -20., +20.) # set bounds so -100. is ~ -inf
fit_head_masked = Lambda(lambda x: K.clip(x, -20., +20.))(fit_head_masked) # set bounds so -100. is ~ -inf
fit_head_masked = Add()( [fit_head_masked, mask] )
# Give each position an array which corresponds to the 'probability' that it 'wants' each other location
max_head = TimeDistributed( Activation(K.softmax), name=n+"SoftMax" ) ( fit_head_masked )
# Now the value side
v_head = TimeDistributed( Dense(value_width), name=n+"V" )(v_var)
# Combine the key/query match softmax output with the values
value_head = Dot([2, 1], normalize=False, name=n+"Vcontrib")( [max_head, v_head] )
v_contrib.append(value_head)
if n_heads>1:
v_return = Concatenate(name=n+"Concat")( v_contrib )
else:
v_return = v_contrib[0]
return v_return
return layer
def AIAYN_captioner(internal_width,
caption_input_shape=None, feature_shape=None, caption_output_shape=None,
layout=0):
n_time_steps = caption_input_shape[0]
feature_in = Input(shape=feature_shape, dtype='float32', name='feature_input')
features_everywhere = RepeatVector(n_time_steps, name='RepeatedFeatures')(feature_in)
caption_in = Input(shape=caption_input_shape, dtype='float32', name='caption_input')
# Make the caption embedding fit the 'internal_width'
x = Dense(internal_width, name='InitialDense')(caption_in)
def BatchNumpyConst(np_var): # (needs a 'bogus input' for batch_sizing...)
# Need to preserve batch-sizes
def output_of_lambda(input_shape):
return (input_shape[0], np_var.shape[0], np_var.shape[1])
# This is 1x(what we want), but the 'output_shape' causes broadcasting
return Lambda(lambda x: K.constant(np_var[None]), output_shape=output_of_lambda)( caption_in )
# Add the 'clocks'
def ClockVar(width):
clock_theta = np.arange( 0, 1.0, 2.0/n_time_steps) * np.pi
clock_offset = (2. + np.arange( 0, width ))
clock_half = np.outer( clock_theta, clock_offset )
clock_const = np.vstack( [ np.cos(clock_half), np.sin(clock_half) ] ).astype('float32')
return BatchNumpyConst(clock_const)
features = features_everywhere
if True:
features_with_clocks = Add(name='FeatWithClocks')( [features_everywhere, ClockVar(feature_shape[0])] )
features = features_with_clocks
x = Add(name='WithClocks')( [x, ClockVar(internal_width) ] )
# create a mask to prevent backwards-in-time message passing during training
mask_timewise = np.tri(n_time_steps, n_time_steps, -1).T * -100.0 # ~ -infinity
mask_const = BatchNumpyConst(mask_timewise)
n_heads = 1
if layout==3 or layout==4: n_heads=4
params = dict( n_heads=n_heads, key_widths=32, value_width=internal_width//n_heads )
n_layers = 1
if layout==4: n_layers=2
for i in range(n_layers):
if layout==1: # Before the 'image features' - as in the paper
# Attention layer looking over the previous words in caption
x_attend = AttentionLayer('L%d-C-'%i, mask=mask_const, **params)(x, x, x)
x = ResidualAndNormLayer('L%d-CN-'%i)(x, x_attend)
if True:
# Attention layer looking over the features of the image
x_attend = AttentionLayer('L%d-I-'%i, mask=None, **params)(x, features, features)
x = ResidualAndNormLayer('L%d-IN-'%i)(x, x_attend)
if layout==2 or layout==3 or layout==4: # After seeing some features (more variety for first word)
# Attention layer looking over the previous words in caption
x_attend = AttentionLayer('L%d-C-'%i, mask=mask_const, **params)(x, x, x)
x = ResidualAndNormLayer('L%d-CN-'%i)(x, x_attend)
# Dense Feed-Forward Network "FFN"
x_ff = Dense(2*internal_width, name='FF%d-1'%i, activation='relu')(x)
x_ff = Dense( internal_width, name='FF%d-2'%i, activation='linear')(x_ff)
x = ResidualAndNormLayer('FF%d-N'%i)(x, x_ff)
caption_out = TimeDistributed( Dense(caption_output_shape[1], activation='linear'), # activation='softmax'
input_shape=(-1, caption_input_shape[0], internal_width),
name='output-sizing')(x)
return Model(inputs=[feature_in, caption_in],
outputs=[caption_out],
name='AIAYN-captioner-%dx%d' % (layout, internal_width))
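The attention layers have no inherent notion of position, so the ClockVar helper above adds fixed sine/cosine 'clocks' to every time-step (this notebook's arrangement of the paper's positional encodings). Re-building the same matrix standalone confirms its shape lines up with the time-steps (width here is arbitrary):
In [ ]:
# Re-derive the 'clock' constant outside the model, following ClockVar above
n_time_steps, width = CAPTION_LEN, 16
clock_theta = np.arange(0, 1.0, 2.0/n_time_steps) * np.pi
clock_offset = 2. + np.arange(0, width)
clock_half = np.outer(clock_theta, clock_offset)
clock_const = np.vstack([np.cos(clock_half), np.sin(clock_half)])
print(clock_const.shape)   # (CAPTION_LEN, width)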
In [24]:
emb_input, emb_output = RepresentAs_OneHotBasePlusEmbedding, RepresentAs_FullOneHot
#emb_input, emb_output = RepresentAs_OneHotBasePlusEmbedding, RepresentAs_OneHotBasePlusEmbedding
In [25]:
model_choice = "xRNN CNN xCNNGLU xAIAYN".split()
if 'RNN' in model_choice:
model = RNN_captioner(200,
caption_input_shape=(CAPTION_LEN, emb_input.width),
feature_shape=(feature_arr.shape[1],),
caption_output_shape=(CAPTION_LEN, emb_output.width),
levels=2,
)
if 'CNN' in model_choice:
model = CNN_captioner(200,
caption_input_shape=(CAPTION_LEN, emb_input.width),
feature_shape=(feature_arr.shape[1],),
caption_output_shape=(CAPTION_LEN, emb_output.width),
layout=5,
)
if 'CNNGLU' in model_choice:
model = CNNGLU_captioner(200,
caption_input_shape=(CAPTION_LEN, emb_input.width),
feature_shape=(feature_arr.shape[1],),
caption_output_shape=(CAPTION_LEN, emb_output.width),
layout=7,
)
if 'AIAYN' in model_choice:
model = AIAYN_captioner(200,
caption_input_shape=(CAPTION_LEN, emb_input.width),
feature_shape=(feature_arr.shape[1],),
caption_output_shape=(CAPTION_LEN, emb_output.width),
layout=4,
)
model.summary()
In [26]:
#model.compile(loss=emb_output.loss_fn, optimizer=RMSprop(lr=0.0001, clipnorm=1.))
model.compile(loss=emb_output.loss_fn, optimizer=Adam())
# Idea : Change learning rates via callbacks : https://github.com/fchollet/keras/issues/2823
In [27]:
def sample_caption(model, features):
# Run the model step-by-step, decoding each output word and feeding it back in as the next input
caption_arr, prob_tot=[], 0 # initially empty
for i in range(CAPTION_LEN):
caption_idx = caption_to_idx_arr(' '.join(caption_arr))
caption_emb_in = emb_input.encode(caption_idx[:-1])
# Need to make features and caption_emb_in into length-1 batches
caption_emb_out = model.predict_on_batch( [ features[np.newaxis], caption_emb_in[np.newaxis] ] )
# pick out the i-th output, and add the decoded word onto the running caption
caption_word, caption_prob = emb_output.decode(caption_emb_out[0][i])
#print("%.2f %4d %s" % (caption_prob, dictionary[caption_word], caption_word,))
if caption_word=='{STOP}': break
caption_arr.append(caption_word)
prob_tot += caption_prob # I know this isn't correct - just a temp value
return ' '.join(caption_arr), prob_tot
In [28]:
def model_test(model, img_idx, img_path='./data/Flickr30k/flickr30k-images/'):
# first, let's just show the image, and the existing captions
img_name = text_data['img_arr'][img_idx]
if True:
is_training = text_data['train_test'][img_idx]
captions = text_data['img_to_captions'][img_name]
print("Image is in %s set" % ('TRAINING' if is_training<TRAIN_PCT else 'TEST',) )
for caption in captions:
print(" * %s" % (caption, ))
if True:
img_filepath = os.path.join(img_path, img_name)
img_data = plt.imread(img_filepath)
plt.figure()
plt.imshow(img_data)
plt.axis('off')
plt.show()
features = feature_arr[ image_feature_idx[img_name] ]
for _ in range(5):
caption, prob = sample_caption(model, features)
print(" * %s" % (caption,))
# Find some TEST images
#print( [ "%d:%.2f" % (i,t) for i,t in enumerate(text_data['train_test'][0:100]) if t>TRAIN_PCT] )
print( ', '.join([ "%d" % (i) for i,t in enumerate(text_data['train_test'][0:200]) if t>TRAIN_PCT] ), "\n")
model_test(model, 70) # Uninitialised model is *terrible*
In [29]:
if False:
batch_gen = caption_training_batch_generator(emb_input, emb_output, batch_size=BATCH_SIZE)
X,Y = next(batch_gen)
model.train_on_batch(X, Y)
In [30]:
weights_filename = './data/cache/%s_%s_%s_%%04d.h5' % (model.name, emb_input.name, emb_output.name)
weights_filename
In [31]:
epoch = 0
epoch = 50 # Force a specific epoch
while os.path.isfile(weights_filename % (epoch,)):
epoch += 1
if epoch>0: # i.e. we found something
model.load_weights(weights_filename % (epoch-1,)) # Go back one step
print("Loaded weights from previously saved epoch %d" % (epoch-1,))
In [ ]:
for _ in range(50):
model.fit_generator(caption_training_batch_generator(emb_input, emb_output, batch_size=BATCH_SIZE),
len(caption_arr_train)//BATCH_SIZE, epochs=epoch+1, initial_epoch=epoch) # integer steps_per_epoch
model.save_weights(weights_filename % (epoch,))
epoch += 1
In [ ]:
#raise("Intentional error to stop execution flow")
In [34]:
model_test(model, 2170) # 70
In [ ]:
import nltk
def bleu_score(model, img_idx, img_path='./data/Flickr30k/flickr30k-images/'):
img_name = text_data['img_arr'][img_idx]
captions_real = text_data['img_to_captions'][img_name]
features = feature_arr[ image_feature_idx[img_name] ]
for _ in range(5):
caption, prob = sample_caption(model, features)
score = nltk.translate.bleu_score.sentence_bleu(
[ c.split(' ') for c in captions_real ],
caption.split(' '),
)
print(" * %.2f : %s" % (score, caption,))
bleu_score(model, 70)
In [ ]: