In [1]:
from __future__ import absolute_import, division, print_function
import os; os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import numpy as np
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))
# Our implementation of ConvNet layers and Compact Bilinear in TensorFlow
from vqa_mcb_model import vqa_mcb_model
# For ResNet, we (currently) still use Caffe to extract image features.
# We DO NOT need the vqa Caffe branch. You can use standard Caffe for ResNet.
import sys; sys.path.append('/home/ronghang/workspace/caffe-vqa-mcb/python')
import caffe; caffe.set_mode_cpu()
# For loading data, we use the LoadVQADataProvider from the original code
import vqa_data_provider_layer
In [2]:
# constants
MCB_PROTOTXT_PATH = '../data/multi_att_2_glove_pretrained/proto_test_batchsize1.prototxt'
MCB_CAFFEMODEL_PATH = '../data/multi_att_2_glove_pretrained/_iter_190000.caffemodel'
vqa_data_provider_layer.CURRENT_DATA_SHAPE = (2048, 14, 14)
# Converted from the corresponding Caffe model
SAVE_MODEL = './tf_vqa_data/_iter_190000.tfmodel'
In [3]:
def convert_embed_param(params):
    # Embedding weight and bias copied straight from the Caffe blobs.
    embed_W = params[0].data[...].copy()
    embed_B = params[1].data[...].copy()
    return embed_W, embed_B

def convert_conv_param(params):
    # Caffe conv kernels are (out, in, kh, kw); TF expects (kh, kw, in, out).
    W = params[0].data.transpose((2, 3, 1, 0))
    B = params[1].data[...].copy()
    return W, B

def convert_fc_param(params):
    # Caffe FC weights are (out, in); TF matmul weights are (in, out).
    W = params[0].data.transpose((1, 0))
    B = params[1].data[...].copy()
    return W, B

def convert_lstm_param(params):
    W = np.hstack((params[0].data, params[2].data))  # input before states
    B = params[1].data
    # convert the gate order: Caffe uses (i, f, o, g), TF uses (i, g, f, o)
    W_i, W_f, W_o, W_g = np.split(W, 4, axis=0)
    W = np.vstack((W_i, W_g, W_f, W_o))
    W = W.transpose((1, 0))
    B_i, B_f, B_o, B_g = np.split(B, 4)
    B = np.hstack((B_i, B_g, B_f, B_o))
    return W, B
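# Why the reshuffle in convert_lstm_param: Caffe's LSTM stacks its gate
# parameters in (input, forget, output, cell) order, while TF's
# BasicLSTMCell expects (input, cell, forget, output). A quick check of
# the reordering on random data (hypothetical shapes, not the model's):
_h = 4
_W = np.random.randn(4 * _h, 3)                      # rows: i, f, o, g blocks
_i, _f, _o, _g = np.split(_W, 4, axis=0)
_Wt = np.vstack((_i, _g, _f, _o)).transpose((1, 0))  # now (in_dim, 4 * _h)
assert np.allclose(_Wt[:, _h:2 * _h].transpose(), _g)  # g landed in slot 2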
def assign_var(name, value):
    # Look up an existing variable by name (requires a reuse=True scope)
    # and build an op assigning the converted value to it.
    return tf.assign(tf.get_variable(name), value)
In [4]:
caffe_net = caffe.Net(MCB_PROTOTXT_PATH, MCB_CAFFEMODEL_PATH, caffe.TEST)
embed_W, embed_B = convert_embed_param(caffe_net.params['embed_ba'])
att_conv1_W, att_conv1_B = convert_conv_param(caffe_net.params['att_conv1'])
att_conv2_W, att_conv2_B = convert_conv_param(caffe_net.params['att_conv2'])
prediction_W, prediction_B = convert_fc_param(caffe_net.params['prediction'])
lstm1_W, lstm1_B = convert_lstm_param(caffe_net.params['lstm1'])
lstm2_W, lstm2_B = convert_lstm_param(caffe_net.params['lstm2'])
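# Optional sanity check (not in the original): the converted arrays should
# already be in TF layout, i.e. conv kernels (kh, kw, in, out), FC weights
# (in, out), and LSTM weights (in_dim + hidden, 4 * hidden).
for _name, _arr in [('embed_W', embed_W), ('att_conv1_W', att_conv1_W),
                    ('att_conv2_W', att_conv2_W), ('prediction_W', prediction_W),
                    ('lstm1_W', lstm1_W), ('lstm2_W', lstm2_W)]:
    print(_name, _arr.shape)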
In [5]:
max_time = 20
batch_size = 1
glove_dim = 300
feat_h, feat_w, img_feat_dim = 14, 14, 2048
num_vocab = 21025
lstm_output_dim = 1024
lstm_layers = 2
embed_dim = 300
cbp0_dim = 16000
cbp1_dim = 16000
num_classes = 3000
word_indices = tf.placeholder(tf.int32, [max_time, batch_size])
glove_vector = tf.placeholder(tf.float32, [max_time, batch_size, glove_dim])
seq_length = tf.placeholder(tf.int32, [batch_size])
img_feature = tf.placeholder(tf.float32, [batch_size, feat_h, feat_w, img_feat_dim])
prediction, att_softmax0, _ = vqa_mcb_model(
    word_indices, glove_vector, seq_length, img_feature, batch_size,
    num_vocab, embed_dim, glove_dim, max_time, lstm_output_dim, lstm_layers,
    feat_h, feat_w, img_feat_dim, cbp0_dim, cbp1_dim, num_classes)
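# The graph is now built (variables still uninitialized). A static shape
# check; the expected values follow from the constants above:
print(prediction.get_shape())    # should end in num_classes = 3000
print(att_softmax0.get_shape())  # first attention map over the 14x14 grid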
In [6]:
init_ops = []
with tf.variable_scope('vqa_mcb', reuse=True):
    init_ops.append(assign_var('embedding/weights', embed_W))
    init_ops.append(assign_var('embedding/biases', embed_B))
    init_ops.append(assign_var('lstm12/multi_rnn_cell/cell_0/basic_lstm_cell/weights', lstm1_W))
    init_ops.append(assign_var('lstm12/multi_rnn_cell/cell_0/basic_lstm_cell/biases', lstm1_B))
    init_ops.append(assign_var('lstm12/multi_rnn_cell/cell_1/basic_lstm_cell/weights', lstm2_W))
    init_ops.append(assign_var('lstm12/multi_rnn_cell/cell_1/basic_lstm_cell/biases', lstm2_B))
    init_ops.append(assign_var('att_conv1/weights', att_conv1_W))
    init_ops.append(assign_var('att_conv1/biases', att_conv1_B))
    init_ops.append(assign_var('att_conv2/weights', att_conv2_W))
    init_ops.append(assign_var('att_conv2/biases', att_conv2_B))
    init_ops.append(assign_var('prediction/weights', prediction_W))
    init_ops.append(assign_var('prediction/biases', prediction_B))
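# The variable names passed to assign_var must match the ones created by
# vqa_mcb_model inside the 'vqa_mcb' scope; if an assignment fails, listing
# the graph's trainable variables helps track down the mismatch:
for _v in tf.trainable_variables():
    print(_v.name, _v.get_shape())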
In [7]:
saver = tf.train.Saver(write_version=1)
In [8]:
sess.run(tf.group(*init_ops))
saver.save(sess, SAVE_MODEL, write_meta_graph=False)
Out[8]:
'./tf_vqa_data/_iter_190000.tfmodel'
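# To reuse the converted model in a fresh process, rebuild the same graph
# (the vqa_mcb_model call above with identical dimensions) and restore the
# checkpoint. A minimal sketch, assuming the construction cells were re-run:
restorer = tf.train.Saver()
restorer.restore(sess, SAVE_MODEL)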