In [1]:
import theano
from theano import tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
In [2]:
import cPickle as pkl
import numpy
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import skimage
import skimage.transform
import skimage.io
from PIL import Image
In [3]:
import capgen
import generate_caps as gencaps
import flickr8k
import flickr30k
import coco
In [39]:
datasets = {'flickr8k': (flickr8k.load_data, flickr8k.prepare_data),
'flickr30k': (flickr30k.load_data, flickr30k.prepare_data),
'coco': (coco.load_data, coco.prepare_data)}
# location of the model file, the pkl file should be named "model_name.npz.pkl"
model= 'model_name.npz'
# location of the devset split file like the ones in /splits
dev_list = './splits/coco_val.txt'
image_path = './path_to_coco_dev_image/'
# load model model_options
with open('%s.pkl'%model, 'rb') as f:
options = pkl.load(f)
print 'Loading: ' + options['dataset']
flist = []
with open(dev_list, 'r') as f:
for l in f:
flist.append(l.strip())
In [40]:
# keep aspect ratio, and center crop
def LoadImage(file_name, resize=256, crop=224):
    """Load an image, keep aspect ratio while resizing the short side to
    `resize`, center-crop a `crop` x `crop` square, and return it as a
    float32 array of shape (crop, crop, 3) with values in [0, 1].
    """
    image = Image.open(file_name)
    width, height = image.size
    # Scale so the shorter side equals `resize`. The explicit int() casts
    # make this correct under Python 3 true division as well as Python 2
    # floor division; PIL requires integer sizes and crop coordinates.
    if width > height:
        width = int(width * resize / height)
        height = resize
    else:
        height = int(height * resize / width)
        width = resize
    left = int((width - crop) / 2)
    top = int((height - crop) / 2)
    image_resized = image.resize((width, height), Image.BICUBIC).crop(
        (left, top, left + crop, top + crop))
    data = numpy.array(image_resized.convert('RGB').getdata()).reshape(crop, crop, 3)
    data = data.astype('float32') / 255
    return data
In [41]:
load_data, prepare_data = datasets[options['dataset']]
train, valid, test, worddict = load_data(False, True, False)
print 'Data loaded'
word_idict = dict()
for kk, vv in worddict.iteritems():
word_idict[vv] = kk
word_idict[0] = '<eos>'
word_idict[1] = 'UNK'
In [42]:
# Build the sampling functions and model.
trng = RandomStreams(1234)
use_noise = theano.shared(numpy.float32(0.), name='use_noise')

# Initialise parameters from the options, then overwrite them with the
# saved weights before wrapping as Theano shared variables.
params = capgen.load_params(model, capgen.init_params(options))
tparams = capgen.init_tparams(params)

# f_init / f_next drive one step of the beam-search sampler each.
f_init, f_next = capgen.build_sampler(tparams, options, use_noise, trng)
In [43]:
# Rebuild the full model graph to get the inputs, the attention weights
# (alphas) and the optimizer outputs; note this rebinds trng and
# use_noise to the ones used inside that graph.
trng, use_noise, \
inps, alphas, alphas_samples, \
cost, opt_outs = \
capgen.build_model(tparams, options)
In [44]:
# Compile functions returning the attention weights (alpha) and, if the
# model has a selector gate, its value [called \beta in the paper].
# Stochastic ("hard") attention needs its sampling update rules applied.
hard_attn_updates = []
if options['attn_type'] == 'stochastic':
    baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
    # Exponential moving average of the masked cost, used as the baseline.
    moving_avg = baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean()
    hard_attn_updates.append((baseline_time, moving_avg))
    hard_attn_updates.extend(opt_outs['attn_updates'])

f_alpha = theano.function(inps, alphas, name='f_alpha', updates=hard_attn_updates)
if options['selector']:
    f_sels = theano.function(inps, opt_outs['selector'], name='f_sels', updates=hard_attn_updates)
In [45]:
# Pick a random validation example: its ground-truth caption, its
# 14x14x512 annotation features, and the raw image for display.
idx = numpy.random.randint(0, len(valid[0]))  # random image
k = 1  # beam width
use_gt = False  # set to False if you want to use the generated sample
gt = valid[0][idx][0]  # groundtruth caption
img_idx = valid[0][idx][1]  # index into the feature matrix / file list
context = numpy.array(valid[1][img_idx].todense()).reshape([14 * 14, 512])  # annotations
img = LoadImage(image_path + flist[img_idx])
In [46]:
# Generate a caption with beam search (skipped when showing the ground truth).
if not use_gt:
    sample, score = capgen.gen_sample(tparams, f_init, f_next, context,
                                      options, trng=trng, k=k, maxlen=200,
                                      stochastic=False)
    best = numpy.argmin(score)  # lowest score = best beam
    caption = sample[best][:-1]  # drop the final token (presumably <eos>)
In [47]:
# print the generated caption and the ground truth
if use_gt:
caption = map(lambda w: worddict[w] if worddict[w] < options['n_words'] else 1, gt.split())
words = map(lambda w: word_idict[w] if w in word_idict else '<UNK>', caption)
print 'Sample:', ' '.join(words)
print 'GT:', gt
In [48]:
# Evaluate the compiled functions on the caption: the three inputs are
# the word indices, a mask of ones, and the context with a batch axis.
cap_mat = numpy.array(caption).reshape(len(caption), 1)
cap_mask = numpy.ones((len(caption), 1), dtype='float32')
ctx_batch = context.reshape(1, context.shape[0], context.shape[1])

alpha = f_alpha(cap_mat, cap_mask, ctx_batch)
if options['selector']:
    sels = f_sels(cap_mat, cap_mask, ctx_batch)
In [49]:
# Display the visualization: the raw image plus one attention map per word.
n_words = alpha.shape[0] + 1  # one panel per word + one for the raw image
# Grid layout. plt.subplot() requires integer row/column counts, but
# numpy.round/numpy.ceil return floats — cast explicitly (both values are
# whole numbers, so the cast is value-preserving).
w = int(numpy.round(numpy.sqrt(n_words)))
h = int(numpy.ceil(numpy.float32(n_words) / w))

plt.subplot(w, h, 1)
plt.imshow(img)
plt.axis('off')

smooth = True  # smooth pyramid upsampling of alphas vs. plain resize
for ii in xrange(alpha.shape[0]):
    plt.subplot(w, h, ii + 2)
    lab = words[ii]
    if options['selector']:
        lab += '(%0.2f)' % sels[ii]
    # Draw the label twice: white background first, black text on top.
    plt.text(0, 1, lab, backgroundcolor='white', fontsize=13)
    plt.text(0, 1, lab, color='black', fontsize=13)
    plt.imshow(img)
    if smooth:
        # 14x14 alpha map -> 224x224 via smoothed pyramid expansion.
        alpha_img = skimage.transform.pyramid_expand(
            alpha[ii, 0, :].reshape(14, 14), upscale=16, sigma=20)
    else:
        alpha_img = skimage.transform.resize(
            alpha[ii, 0, :].reshape(14, 14), [img.shape[0], img.shape[1]])
    plt.imshow(alpha_img, alpha=0.8)
    plt.set_cmap(cm.Greys_r)
    plt.axis('off')
plt.show()