Imports
In [1]:
%matplotlib inline
import numpy as np
import json
from VQA.PythonHelperTools.vqaTools.vqa import VQA
from VQA.PythonEvaluationTools.vqaEvaluation.vqaEval import VQAEval
import random
import skimage.io as io
import matplotlib
import matplotlib.pyplot as plt
import os
from sklearn.utils import shuffle
import spacy
import operator
from operator import itemgetter
from tqdm import tqdm
import cPickle as pickle
from numpy import linalg as LA
In [2]:
import keras
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils, generic_utils
from keras.preprocessing import image
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Activation, Dropout, LSTM, GRU, Flatten, Embedding, RepeatVector
from keras.layers import Merge, Reshape, BatchNormalization, Lambda, TimeDistributed, Permute
from keras.layers import GlobalMaxPooling2D, Convolution2D, merge, Bidirectional
from keras.regularizers import l2
from keras.optimizers import *
from keras.applications.vgg19 import VGG19
from keras.applications.resnet50 import ResNet50
from keras import backend as K
from imagenet_utils import preprocess_input
Constants
In [3]:
dataDir='VQA'
taskType='OpenEnded'
dataType='mscoco' # 'mscoco' for real and 'abstract_v002' for abstract
In [4]:
#nlp = spacy.load('en')
nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
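Optional sanity check that the 300-dimensional GloVe vectors were picked up (a minimal sketch; any sentence works):
doc = nlp(u'What color is the cat?')
print [(w.text, w.vector.shape) for w in doc]   # each vector should have shape (300,)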
Load train data
In [5]:
dataSubType_train = 'train2014'
annFile_train = '%s/Annotations/%s_%s_annotations.json'%(dataDir, dataType, dataSubType_train)
quesFile_train = '%s/Questions/%s_%s_%s_questions.json'%(dataDir, taskType, dataType, dataSubType_train)
imgDir_train = '%s/Images/%s/%s/' %(dataDir, dataType, dataSubType_train)
vqa_train = VQA(annFile_train, quesFile_train)
In [6]:
dataSubType_val = 'val2014'
annFile_val = '%s/Annotations/%s_%s_annotations.json'%(dataDir, dataType, dataSubType_val)
quesFile_val = '%s/Questions/%s_%s_%s_questions.json'%(dataDir, taskType, dataType, dataSubType_val)
imgDir_val = '%s/Images/%s/%s/' %(dataDir, dataType, dataSubType_val)
vqa_val = VQA(annFile_val, quesFile_val)
Optional to run (these cells just demonstrate the VQA API capabilities)
In [7]:
# load and display QA annotations for given question types
"""
All possible quesTypes for abstract and mscoco have been provided in respective text files in the ../QuestionTypes/ folder.
"""
annIds = vqa_train.getQuesIds(quesTypes='how many')
anns = vqa_train.loadQA(annIds)
randomAnn = random.choice(anns)
vqa_train.showQA([randomAnn])
imgId = randomAnn['image_id']
imgFilename = 'COCO_' + dataSubType_train + '_' + str(imgId).zfill(12) + '.jpg'
if os.path.isfile(imgDir_train + imgFilename):
    I = io.imread(imgDir_train + imgFilename)
    plt.imshow(I)
    plt.axis('off')
    plt.show()
else:
    print(imgDir_train + imgFilename)
In [8]:
# load and display QA annotations for given answer types
"""
ansTypes can be one of the following:
yes/no
number
other
"""
annIds = vqa_train.getQuesIds(ansTypes='yes/no')
anns = vqa_train.loadQA(annIds)
randomAnn = random.choice(anns)
vqa_train.showQA([randomAnn])
imgId = randomAnn['image_id']
imgFilename = 'COCO_' + dataSubType_train + '_' + str(imgId).zfill(12) + '.jpg'
if os.path.isfile(imgDir_train + imgFilename):
    I = io.imread(imgDir_train + imgFilename)
    plt.imshow(I)
    plt.axis('off')
    plt.show()
In [9]:
# load and display QA annotations for given images
"""
Usage: vqa.getImgIds(quesIds=[], quesTypes=[], ansTypes=[])
The above method retrieves imageIds for given question ids, question types, or answer types.
"""
ids = vqa_train.getImgIds()
annIds = vqa_train.getQuesIds(imgIds=random.sample(ids, 5))
anns = vqa_train.loadQA(annIds)
randomAnn = random.choice(anns)
vqa_train.showQA([randomAnn])
imgId = randomAnn['image_id']
imgFilename = 'COCO_' + dataSubType_train + '_' + str(imgId).zfill(12) + '.jpg'
if os.path.isfile(imgDir_train + imgFilename):
    I = io.imread(imgDir_train + imgFilename)
    plt.imshow(I)
    plt.axis('off')
    plt.show()
In [10]:
annIds = vqa_train.getQuesIds()
anns = vqa_train.loadQA(annIds)
randomAnn = random.choice(anns)
vqa_train.showQA([randomAnn])
imgId = randomAnn['image_id']
imgFilename = 'COCO_' + dataSubType_train + '_' + str(imgId).zfill(12) + '.jpg'
if os.path.isfile(imgDir_train + imgFilename):
    '''
    I = io.imread(imgDir_train + imgFilename)
    plt.imshow(I)
    plt.axis('off')
    plt.show()
    '''
    img = image.load_img(imgDir_train + imgFilename, target_size=(224, 224))
    x = image.img_to_array(img)
    x = preprocess_input(x)
    plt.imshow(img)
    plt.axis('off')
    plt.show()
In [11]:
batch_size = 1000  # same batch size as used for training below
annIds = vqa_train.getQuesIds()
anns = vqa_train.loadQA(annIds)
anns = shuffle(anns, random_state=0)
num_batches = len(anns) // batch_size + (1 if len(anns) % batch_size > 0 else 0)
#print num_batches
i = 2
anns_batch = anns[i * batch_size : min((i + 1) * batch_size, len(anns))]
#print len(anns_batch)
ann = anns_batch[0]
print ann
print vqa_train.qqa[ann['question_id']]['question']
question = nlp(vqa_train.qqa[ann['question_id']]['question'])
for w in question:
    print (w.text, w.pos_, len(w.vector))
question_word_vec = [w.vector for w in question]
In [19]:
ans_types=[]
#ans_types='yes/no'
Modify the argument of get_most_common_answers below to 1000 or 3000, depending on the model used (the first model classifies over the 1000 most common answers, the second over 3000).
In [20]:
def get_most_common_answers(num_answers):
    ans_dict = {}
    annIds = vqa_train.getQuesIds(ansTypes=ans_types)
    anns = vqa_train.loadQA(annIds)
    for ann in anns:
        for ans in ann['answers']:
            answer = ans['answer'].lower()
            if answer in ans_dict:
                ans_dict[answer] += 1
            else:
                ans_dict[answer] = 1
        '''
        if ann['multiple_choice_answer'] in ans_dict:
            ans_dict[ann['multiple_choice_answer']] += 1
        else:
            ans_dict[ann['multiple_choice_answer']] = 1
        '''
    sorted_ans_dict = sorted(ans_dict.items(), key=itemgetter(1), reverse=True)

    # Some bar plots
    num_ans_plot = 10
    total_ans = 0
    for (x, y) in sorted_ans_dict: total_ans += y
    plt.bar(range(1, num_ans_plot + 1), [float(y) / total_ans * 100 for (x, y) in sorted_ans_dict[0:num_ans_plot]], 0.9, color='b')
    plt.xticks(range(1, num_ans_plot + 1), [x for (x, y) in sorted_ans_dict[0:num_ans_plot]])
    plt.title("Most Common Answer Frequencies")
    plt.show()

    sorted_ans_dict = [x for (x, y) in sorted_ans_dict]
    sorted_ans_dict = sorted_ans_dict[0:num_answers]
    ans_to_id = dict((a, i) for i, a in enumerate(sorted_ans_dict))
    id_to_ans = dict((i, a) for i, a in enumerate(sorted_ans_dict))
    return ans_to_id, id_to_ans

ans_to_id, id_to_ans = get_most_common_answers(1000)
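For the second model trained later in the notebook (num_classes = 3000), the only change needed in this cell is the argument to the call above:
# ans_to_id, id_to_ans = get_most_common_answers(3000)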
In [9]:
def process_question(vqa, ann):
    quesId = ann['question_id']
    # question_word_vec_map (initialised a few cells below) caches already-processed questions
    if quesId in question_word_vec_map:
        return question_word_vec_map[quesId]
    question = nlp(vqa.qqa[quesId]['question'])
    question_word_vec = [w.vector for w in question]
    return np.array(question_word_vec)
In [10]:
def process_answer(vqa, ann):
    quesId = ann['question_id']
    if quesId in ans_map:
        return ans_map[quesId]
    answer = ann['multiple_choice_answer'].lower()
    if answer in ans_to_id:
        # one-hot encoding over the most common answers
        encoding = np.zeros(len(id_to_ans))
        encoding[ans_to_id[answer]] = 1
        return encoding
    else:
        return None
In [11]:
# VGG 19 post-convolution layers
model = VGG19(weights='imagenet', include_top=False)

def process_img(vqa, ann, dataSubType, imgDir):
    imgId = ann['image_id']
    imgFilename = 'COCO_' + dataSubType + '_' + str(imgId).zfill(12) + '.jpg'
    if os.path.isfile(imgDir + imgFilename):
        img = image.load_img(imgDir + imgFilename, target_size=(224, 224))
        x = image.img_to_array(img)
        x = preprocess_input(x)
        features = model.predict(np.array([x]))
        features = np.reshape(features[0], (512, 49))
        return features
    else:
        return None
In [11]:
# VGG 19 fc2 layer
base_model = VGG19(weights='imagenet', include_top=True)
model = Model(input=base_model.input, output=base_model.get_layer('fc2').output)

def process_img(vqa, ann, dataSubType, imgDir):
    imgId = ann['image_id']
    imgFilename = 'COCO_' + dataSubType + '_' + str(imgId).zfill(12) + '.jpg'
    if os.path.isfile(imgDir + imgFilename):
        img = image.load_img(imgDir + imgFilename, target_size=(224, 224))
        x = image.img_to_array(img)
        x = preprocess_input(x)
        features = model.predict(np.array([x]))
        features = np.reshape(features[0], (4096,))
        # L2-normalise the feature vector
        features /= LA.norm(features, 2)
        return features
    else:
        return None
In [12]:
# ResNet50 pooled features (2048-dimensional)
model = ResNet50(weights='imagenet', include_top=False)

def process_img(vqa, ann, dataSubType, imgDir):
    imgId = ann['image_id']
    imgFilename = 'COCO_' + dataSubType + '_' + str(imgId).zfill(12) + '.jpg'
    if os.path.isfile(imgDir + imgFilename):
        img = image.load_img(imgDir + imgFilename, target_size=(224, 224))
        x = image.img_to_array(img)
        x = preprocess_input(x)
        features = model.predict(np.array([x]))
        features = np.reshape(features[0], (2048,))
        # L2-normalise the feature vector
        features /= LA.norm(features, 2)
        return features
    else:
        return None
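Only one of the three process_img definitions above should be left active; the image branch of the models defined below expects a 2048-dimensional input, which matches the ResNet50 features. A minimal sanity check on a single training annotation (a sketch; it assumes the corresponding image file is present on disk):
sample_ann = vqa_train.loadQA(vqa_train.getQuesIds())[0]
feat = process_img(vqa_train, sample_ann, dataSubType_train, imgDir_train)
print feat.shape   # expected: (2048,), L2-normalised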
In [13]:
question_word_vec_map = {}
ans_map = {}
img_map = {}
ques_to_img = {}
In [14]:
annIds = vqa_train.getQuesIds(ansTypes=ans_types)
anns = vqa_train.loadQA(annIds)
for ann in tqdm(anns):
    quesId = int(ann['question_id'])
    if quesId in question_word_vec_map:
        continue
    question = process_question(vqa_train, ann)
    if question is None:
        continue
    question_word_vec_map[quesId] = question
In [16]:
f = open("data/train_questions.pkl", "wb")
pickle.dump(question_word_vec_map, f, pickle.HIGHEST_PROTOCOL)
f.close()
In [17]:
annIds = vqa_train.getQuesIds()
anns = vqa_train.loadQA(annIds)
for ann in tqdm(anns):
    quesId = int(ann['question_id'])
    if quesId in ans_map:
        continue
    answer = process_answer(vqa_train, ann)
    if answer is None:
        continue
    ans_map[quesId] = answer.tolist()
In [18]:
f = open("data/train_answers.pkl", "wb")
pickle.dump(ans_map, f, pickle.HIGHEST_PROTOCOL)
f.close()
In [36]:
annIds = vqa_train.getQuesIds()
anns = vqa_train.loadQA(annIds)
for ann in tqdm(anns):
    imgId = int(ann['image_id'])
    if imgId in img_map:
        continue
    img = process_img(vqa_train, ann, dataSubType_train, imgDir_train)
    if img is None:
        continue
    img_map[imgId] = img
In [37]:
f = open("data/train_images.pkl", "wb")
pickle.dump(img_map, f, pickle.HIGHEST_PROTOCOL)
f.close()
In [16]:
annIds = vqa_train.getQuesIds()
anns = vqa_train.loadQA(annIds)
for ann in tqdm(anns):
    quesId = int(ann['question_id'])
    imgId = int(ann['image_id'])
    ques_to_img[quesId] = imgId
In [17]:
f = open("data/train_ques_to_img.pkl", "wb")
pickle.dump(ques_to_img, f, pickle.HIGHEST_PROTOCOL)
f.close()
Now for the validation set
In [69]:
question_word_vec_map = {}
ans_map = {}
img_map = {}
ques_to_img = {}
In [50]:
annIds = vqa_val.getQuesIds()
anns = vqa_val.loadQA(annIds)
for ann in tqdm(anns):
    quesId = int(ann['question_id'])
    if quesId in question_word_vec_map:
        continue
    question = process_question(vqa_val, ann)
    if question is None:
        continue
    question_word_vec_map[quesId] = question
In [51]:
f = open("data/val_questions.pkl", "wb")
pickle.dump(question_word_vec_map, f, pickle.HIGHEST_PROTOCOL)
f.close()
In [70]:
annIds = vqa_val.getQuesIds()
anns = vqa_val.loadQA(annIds)
for ann in tqdm(anns):
    quesId = int(ann['question_id'])
    if quesId in ans_map:
        continue
    answer = process_answer(vqa_val, ann)
    if answer is None:
        continue
    ans_map[quesId] = answer
In [71]:
f = open("data/val_answers.pkl", "wb")
pickle.dump(ans_map, f, pickle.HIGHEST_PROTOCOL)
f.close()
In [39]:
annIds = vqa_val.getQuesIds()
anns = vqa_val.loadQA(annIds)
for ann in tqdm(anns):
    imgId = int(ann['image_id'])
    if imgId in img_map:
        continue
    img = process_img(vqa_val, ann, dataSubType_val, imgDir_val)
    if img is None:
        continue
    img_map[imgId] = img
In [40]:
f = open("data/val_images.pkl", "wb")
pickle.dump(img_map, f, pickle.HIGHEST_PROTOCOL)
f.close()
In [23]:
annIds = vqa_val.getQuesIds()
anns = vqa_val.loadQA(annIds)
for ann in tqdm(anns):
    quesId = int(ann['question_id'])
    imgId = int(ann['image_id'])
    ques_to_img[quesId] = imgId
In [24]:
f = open("data/val_ques_to_img.pkl", "wb")
pickle.dump(ques_to_img, f, pickle.HIGHEST_PROTOCOL)
f.close()
In [28]:
print "Loading train questions"
ques_train_map = pickle.load(open("data/train_questions.pkl", "rb"))
print "Loading train answers"
ans_train_map = pickle.load(open("data/train_answers.pkl", "rb"))
print "Loading train images"
img_train_map = pickle.load(open("data/train_images.pkl", "rb"))
print "Loading ques_to_img map"
ques_to_img_train = pickle.load(open("data/train_ques_to_img.pkl", "rb"))
print "Done"
In [29]:
print "Loading validation questions"
ques_val_map = pickle.load(open("data/val_questions.pkl", "rb"))
print "Loading validation answers"
ans_val_map = pickle.load(open("data/val_answers.pkl", "rb"))
print "Loading validation images"
img_val_map = pickle.load(open("data/val_images.pkl", "rb"))
print "Loading ques_to_img map"
ques_to_img_val = pickle.load(open("data/val_ques_to_img.pkl", "rb"))
print "Done"
In [30]:
ques_train_ids = np.array(ques_train_map.keys())
ques_val_ids = np.array(ques_val_map.keys())
train_dim, val_dim = len(ques_train_ids), len(ques_val_ids)
print train_dim, val_dim
In [15]:
def get_batch(batch, batch_size, ques_map, ans_map, img_map, ques_ids, ques_to_img):
    # get ids in the current batch
    batch_ids = ques_ids[batch * batch_size: min((batch + 1) * batch_size, len(ques_ids))]
    # filter out ids which don't have a question, answer or image
    batch_ids = [batch_id for batch_id in batch_ids if batch_id in ques_map and batch_id in ans_map and ques_to_img[batch_id] in img_map]
    # gather the questions, answers and images for the batch
    batch_questions = [ques_map[batch_id] for batch_id in batch_ids]
    batch_answers = [ans_map[batch_id] for batch_id in batch_ids]
    batch_images = [img_map[ques_to_img[batch_id]] for batch_id in batch_ids]
    # find the maximum length of a question in this batch...
    max_len = max([len(ques) for ques in batch_questions])
    # ...and zero-pad all questions in the batch to that length (more efficient than padding every question to a single global maximum length)
    batch_ques_aligned = []
    for question in batch_questions:
        if len(question) < max_len:
            batch_ques_aligned.append(np.append(question, np.zeros((max_len - len(question), 300)), axis=0))
        else:
            batch_ques_aligned.append(question)
    # finally, construct train_X and train_y
    train_X = [np.array(batch_images), np.array(batch_ques_aligned)]
    train_y = np.array(batch_answers)
    return train_X, train_y
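A minimal sketch of what one batch looks like, assuming the train maps loaded above are in memory and ResNet50 image features were used:
train_X, train_y = get_batch(0, 32, ques_train_map, ans_train_map, img_train_map, ques_train_ids, ques_to_img_train)
print train_X[0].shape   # (batch, 2048): L2-normalised image features
print train_X[1].shape   # (batch, max_len, 300): zero-padded GloVe word vectors
print train_y.shape      # (batch, 1000): one-hot answers (3000 for the second model)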
In [16]:
def train_epoch(
        epoch_no,
        model,
        num_batches,
        batch_size,
        ques_map,
        ans_map,
        img_map,
        ques_ids,
        ques_to_img):
    # shuffle all question ids on each epoch
    np.random.shuffle(ques_ids)
    loss, accuracy, total = .0, .0, .0
    for batch in tqdm(range(num_batches), desc="Train epoch %d" % epoch_no):
        train_X, train_y = get_batch(batch, batch_size, ques_map, ans_map, img_map, ques_ids, ques_to_img)
        total += len(train_y)
        # ... and train the model on the batch
        l, a = model.train_on_batch(train_X, train_y)
        loss += l * len(train_y)
        accuracy += a * len(train_y)
    loss /= total
    accuracy /= total
    print("Train loss: {}\tAccuracy: {}".format(loss, accuracy))
    return loss, accuracy
In [17]:
def val_epoch(
        epoch_no,
        model,
        num_batches,
        batch_size,
        ques_map,
        ans_map,
        img_map,
        ques_ids,
        ques_to_img):
    loss, accuracy, total = .0, .0, .0
    for batch in tqdm(range(num_batches), desc="Val epoch %d" % epoch_no):
        val_X, val_y = get_batch(batch, batch_size, ques_map, ans_map, img_map, ques_ids, ques_to_img)
        total += len(val_y)
        l, a = model.test_on_batch(val_X, val_y)
        loss += l * len(val_y)
        accuracy += a * len(val_y)
    loss /= total
    accuracy /= total
    print("Val loss: {}\tAccuracy: {}".format(loss, accuracy))
    return loss, accuracy
In [37]:
# constants for evaluation
taskType = 'OpenEnded'
dataType = 'mscoco' # 'mscoco' for real and 'abstract_v002' for abstract
dataSubType = 'val2014'
annFile = '%s/Annotations/%s_%s_annotations.json' % (dataDir, dataType, dataSubType)
quesFile = '%s/Questions/%s_%s_%s_questions.json' % (dataDir, taskType, dataType, dataSubType)
imgDir = '%s/Images/%s/%s/' % (dataDir, dataType, dataSubType)
resultType = 'eval'
fileTypes = ['results', 'accuracy', 'evalQA', 'evalQuesType', 'evalAnsType']
[resFile, accuracyFile, evalQAFile, evalQuesTypeFile, evalAnsTypeFile] = \
['%s/Results/%s_%s_%s_%s_%s.json' % (dataDir, taskType, dataType, dataSubType, \
resultType, fileType) for fileType in fileTypes]
In [41]:
def process_question_batch(questions, question_ids, images, results):
    # find the maximum length of a question in this batch...
    max_len = max([len(ques) for ques in questions])
    # ...and zero-pad all questions in the batch to that length (more efficient than padding every question to a single global maximum length)
    ques_aligned = []
    for question in questions:
        if len(question) < max_len:
            ques_aligned.append(np.append(question, np.zeros((max_len - len(question), 300)), axis=0))
        else:
            ques_aligned.append(question)
    val_X = [np.array(images), np.array(ques_aligned)]
    predicted_y = model.predict_on_batch(val_X)
    # add results to the list
    for ans, question_id in zip(predicted_y, question_ids):
        res = {}
        res['question_id'] = int(question_id)
        # Get the best answer via argmax
        res['answer'] = id_to_ans[np.argmax(ans)]
        # Get the best answer via sampling
        # res['answer'] = id_to_ans[np.random.choice(range(len(ans)), p=ans)]
        results.append(res)

def print_accuracies(vqaEval):
    print "\n"
    print "Overall Accuracy is: %.02f\n" % (vqaEval.accuracy['overall'])
    print "Per Question Type Accuracy is the following:"
    for quesType in vqaEval.accuracy['perQuestionType']:
        print "%s : %.02f" % (quesType, vqaEval.accuracy['perQuestionType'][quesType])
    print "\n"
    print "Per Answer Type Accuracy is the following:"
    for ansType in vqaEval.accuracy['perAnswerType']:
        print "%s : %.02f" % (ansType, vqaEval.accuracy['perAnswerType'][ansType])
    print "\n"

def evaluate(
        vqa,
        model,
        batch_size,
        ques_map,
        ans_map,
        img_map,
        id_to_ans,
        verbose=False):
    annIds = vqa.getQuesIds()
    anns = vqa.loadQA(annIds)
    questions = []
    question_ids = []
    images = []
    results = []
    for ann in tqdm(anns):
        questions.append(ques_map[ann['question_id']])
        question_ids.append(ann['question_id'])
        images.append(img_map[ann['image_id']])
        if len(questions) == batch_size:
            process_question_batch(questions, question_ids, images, results)
            # clear arrays
            questions, question_ids, images = [], [], []
    if len(questions) > 0:
        process_question_batch(questions, question_ids, images, results)
    # save results as json in the format expected by the VQA evaluation tools
    with open(resFile, "w") as outfile:
        json.dump(results, outfile)
    # create vqa object and vqaRes object
    vqa_ann = VQA(annFile, quesFile)
    vqaRes = vqa_ann.loadRes(resFile, quesFile)
    # create vqaEval object by taking vqa and vqaRes
    vqaEval = VQAEval(vqa_ann, vqaRes, n=2)  # n is the precision of accuracy (number of decimal places), default is 2
    vqaEval.evaluate()
    if verbose:
        print_accuracies(vqaEval)
    return vqaEval.accuracy['overall']
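For reference, the results file written by evaluate follows the official VQA results format: a JSON array of objects, each with a question_id and a single answer string, for example (values below are illustrative only):
[{"question_id": 1, "answer": "yes"}, {"question_id": 2, "answer": "2"}]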
In [23]:
def Word2VecModel(embedding_dim, dropout_rate):
    print("Creating text model...")
    model = Sequential()
    model.add(LSTM(units=512, return_sequences=True, input_shape=(None, embedding_dim)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=512, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1024, activation='tanh'))
    return model

def img_model(dropout_rate):
    print("Creating image model...")
    model = Sequential()
    model.add(Dense(1024, input_dim=2048, activation='tanh'))
    return model

def vqa_model(embedding_dim, dropout_rate, num_classes):
    vgg_model = img_model(dropout_rate)
    lstm_model = Word2VecModel(embedding_dim, dropout_rate)
    print("Merging final model...")
    fc_model = Sequential()
    # element-wise multiplication of the 1024-d image and question representations
    fc_model.add(Merge([vgg_model, lstm_model], mode='mul'))
    fc_model.add(Dropout(dropout_rate))
    fc_model.add(Dense(1000, activation='tanh'))
    fc_model.add(Dropout(dropout_rate))
    fc_model.add(Dense(num_classes, activation='softmax'))
    fc_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                     metrics=['accuracy'])
    return fc_model
In [31]:
# model parameters
dropout_rate = 0.5
embedding_size = 300
num_classes = 1000
# training parameters
num_epochs = 80
batch_size = 1000
num_batches_train = train_dim // batch_size
num_batches_val = val_dim // batch_size
eval_every = 5
train_loss, train_acc = [], []
val_loss, val_acc = [], []
eval_acc = []
In [32]:
model = vqa_model(embedding_size, dropout_rate, num_classes)
In [17]:
model.load_weights("models/model_1_weights.h5")
In [82]:
for k in range(num_epochs):
    loss, acc = train_epoch(k + 1, model, num_batches_train, batch_size, ques_train_map, ans_train_map, img_train_map, ques_train_ids, ques_to_img_train)
    train_loss.append(loss)
    train_acc.append(acc)
    loss, acc = val_epoch(k + 1, model, num_batches_val, batch_size, ques_val_map, ans_val_map, img_val_map, ques_val_ids, ques_to_img_val)
    val_loss.append(loss)
    val_acc.append(acc)
    if (k + 1) % eval_every == 0:
        model.save_weights("models/model_1/model_1_epoch_%d_weights.h5" % (k + 1), overwrite=True)
        eval_accuracy = evaluate(vqa_val, model, batch_size, ques_val_map, ans_val_map, img_val_map, id_to_ans)
        print ("Eval accuracy: %.2f" % eval_accuracy)
        eval_acc.append(eval_accuracy)
In [83]:
plt.plot(train_loss)
plt.plot(val_loss)
plt.legend(['Train loss', 'Validation loss'], loc='upper left')
plt.show()
In [84]:
plt.plot(train_acc)
plt.plot(val_acc)
plt.legend(['Train accuracy', 'Validation accuracy'], loc='upper left')
plt.show()
In [85]:
plt.plot(eval_acc)
plt.legend(['Eval accuracy on validation'], loc='lower right')
plt.show()
In [86]:
print "Best accuracy %.02f on epoch %d" % (max(eval_acc), (1 + np.argmax(np.array(eval_acc))) * eval_every)
In [33]:
model.load_weights("models/model_1/model_1_epoch_45_weights.h5")
In [42]:
evaluate(vqa_val, model, batch_size, ques_val_map, ans_val_map, img_val_map, id_to_ans, verbose=True)
Out[42]:
In [51]:
def Word2VecModel(embedding_dim, dropout_rate):
    print("Creating text model...")
    model = Sequential()
    model.add(LSTM(units=512, return_sequences=True, input_shape=(None, embedding_dim)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=512, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1024, activation='tanh'))
    return model

def img_model(dropout_rate):
    print("Creating image model...")
    model = Sequential()
    model.add(Dense(1024, input_dim=2048, activation='tanh'))
    return model

def vqa_model(embedding_dim, dropout_rate, num_classes):
    vgg_model = img_model(dropout_rate)
    lstm_model = Word2VecModel(embedding_dim, dropout_rate)
    print("Merging final model...")
    fc_model = Sequential()
    fc_model.add(Merge([vgg_model, lstm_model], mode='mul'))
    fc_model.add(Dropout(dropout_rate))
    fc_model.add(Dense(1000, activation='tanh'))
    fc_model.add(Dropout(dropout_rate))
    fc_model.add(Dense(num_classes, activation='softmax'))
    fc_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                     metrics=['accuracy'])
    return fc_model
In [52]:
# model parameters
dropout_rate = 0.5
embedding_size = 300
num_classes = 3000
# training parameters
num_epochs = 80
batch_size = 1000
num_batches_train = train_dim // batch_size
num_batches_val = val_dim // batch_size
eval_every = 5
train_loss, train_acc = [], []
val_loss, val_acc = [], []
eval_acc = []
In [53]:
model = vqa_model(embedding_size, dropout_rate, num_classes)
In [60]:
model.load_weights("models/model_2/model_2_epoch_40_weights.h5")
In [54]:
for k in range(num_epochs):
    loss, acc = train_epoch(k + 1, model, num_batches_train, batch_size, ques_train_map, ans_train_map, img_train_map, ques_train_ids, ques_to_img_train)
    train_loss.append(loss)
    train_acc.append(acc)
    loss, acc = val_epoch(k + 1, model, num_batches_val, batch_size, ques_val_map, ans_val_map, img_val_map, ques_val_ids, ques_to_img_val)
    val_loss.append(loss)
    val_acc.append(acc)
    if (k + 1) % eval_every == 0:
        model.save_weights("models/model_2/model_2_epoch_%d_weights.h5" % (k + 1), overwrite=True)
        eval_accuracy = evaluate(vqa_val, model, batch_size, ques_val_map, ans_val_map, img_val_map, id_to_ans)
        print ("Eval accuracy: %.2f" % eval_accuracy)
        eval_acc.append(eval_accuracy)
In [55]:
plt.plot(train_loss)
plt.plot(val_loss)
plt.legend(['Train loss', 'Validation loss'], loc='upper left')
plt.show()
In [56]:
plt.plot(train_acc)
plt.plot(val_acc)
plt.legend(['Train accuracy', 'Validation accuracy'], loc='upper left')
plt.show()
In [57]:
plt.plot(eval_acc)
plt.legend(['Eval accuracy on validation'], loc='lower right')
plt.show()
In [59]:
print "Best accuracy %.02f on epoch %d" % (max(eval_acc), (1 + np.argmax(np.array(eval_acc))) * eval_every)
In [61]:
# Load best epoch
model.load_weights("models/model_2/model_2_epoch_40_weights.h5")
In [62]:
evaluate(vqa_val, model, batch_size, ques_val_map, ans_val_map, img_val_map, id_to_ans, verbose=True)
Out[62]: