In [1]:
from __future__ import absolute_import, division, print_function

# Pin TF to GPU 0; allow_growth keeps TF from grabbing all GPU memory up front.
import os; os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import numpy as np
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))

# Our implementation of ConvNet layers and Compact Bilinear in TensorFlow
from vqa_mcb_model import vqa_mcb_model

# For ResNet, we (currently) still use Caffe to extract image features.
# We DO NOT need the vqa Caffe branch. You can use standard Caffe for ResNet.
# NOTE(review): hardcoded absolute path — adjust for your environment.
import sys; sys.path.append('/home/ronghang/workspace/caffe-vqa-mcb/python')
# CPU mode is enough: this notebook only reads weight blobs, no forward pass.
import caffe; caffe.set_mode_cpu()

# For loading data, we use the LoadVQADataProvider in original code
import vqa_data_provider_layer

In [2]:
# constants
# Pretrained MCB-with-attention model (prototxt + weights); the batch-size-1
# variant of the test prototxt is used so single questions can be converted.
MCB_PROTOTXT_PATH = '../data/multi_att_2_glove_pretrained/proto_test_batchsize1.prototxt'
MCB_CAFFEMODEL_PATH =  '../data/multi_att_2_glove_pretrained/_iter_190000.caffemodel'
# Image feature shape (channels, height, width) used by the data provider —
# matches the 2048-channel 14x14 feature placeholder built below.
vqa_data_provider_layer.CURRENT_DATA_SHAPE = (2048, 14, 14)

# Converted from the corresponding Caffe model
SAVE_MODEL = './tf_vqa_data/_iter_190000.tfmodel'

In [3]:
def convert_embed_param(params):
    """Extract the weight and bias of a Caffe embedding layer.

    No reshaping is needed — the Caffe layout already matches the TF
    variables. The blobs are copied so the returned arrays do not alias
    Caffe-owned memory.

    params: Caffe layer params, [weight blob, bias blob].
    Returns (weight, bias) as independent numpy arrays.
    """
    weight = np.copy(params[0].data)
    bias = np.copy(params[1].data)
    return weight, bias

def convert_conv_param(params):
    """Convert a Caffe conv layer's parameters to TensorFlow layout.

    Caffe stores conv weights as (out_ch, in_ch, kh, kw); TF's conv2d
    expects (kh, kw, in_ch, out_ch), hence the axis permutation.

    Fix: numpy transpose returns a *view* into the Caffe-owned blob
    memory; copy it so the result stays valid independently of the Caffe
    net's lifetime, consistent with convert_embed_param.

    params: Caffe layer params, [weight blob, bias blob].
    Returns (W, B) as independent numpy arrays.
    """
    W = params[0].data.transpose((2, 3, 1, 0)).copy()
    B = params[1].data[...].copy()
    return W, B

def convert_fc_param(params):
    """Convert a Caffe fully-connected layer's parameters to TF layout.

    Caffe stores FC weights as (out_dim, in_dim); TF's matmul convention
    is (in_dim, out_dim), hence the transpose.

    Fix: numpy transpose returns a *view* into the Caffe-owned blob
    memory; copy it so the result stays valid independently of the Caffe
    net's lifetime, consistent with convert_embed_param.

    params: Caffe layer params, [weight blob, bias blob].
    Returns (W, B) as independent numpy arrays.
    """
    W = params[0].data.transpose((1, 0)).copy()
    B = params[1].data[...].copy()
    return W, B

def convert_lstm_param(params):
    """Convert Caffe LSTM parameters to TF BasicLSTMCell layout.

    Caffe keeps input->hidden and hidden->hidden weights in separate
    blobs with gates ordered (i, f, o, g); TF's BasicLSTMCell uses one
    concatenated, transposed weight matrix with gates ordered (i, g, f, o).

    params: Caffe layer params, [input weights, bias, recurrent weights].
    Returns (W, B) in TF layout.
    """
    # Concatenate input and recurrent weights along the input axis
    # (input comes before the hidden state, as TF expects).
    stacked = np.hstack((params[0].data, params[2].data))
    bias = params[1].data

    # Reorder the four gate blocks: Caffe (i, f, o, g) -> TF (i, g, f, o),
    # then transpose to (input_dim + hidden_dim, 4 * hidden_dim).
    gate_i, gate_f, gate_o, gate_g = np.split(stacked, 4, axis=0)
    weight = np.vstack((gate_i, gate_g, gate_f, gate_o)).transpose((1, 0))

    bias_i, bias_f, bias_o, bias_g = np.split(bias, 4)
    return weight, np.hstack((bias_i, bias_g, bias_f, bias_o))

def assign_var(name, value):
    """Return an op that assigns `value` to the existing variable `name`.

    Must be called inside a variable scope opened with reuse=True (as
    done below) so tf.get_variable looks up the already-created variable
    instead of creating a new one.
    """
    var = tf.get_variable(name)
    return tf.assign(var, value)

In [4]:
# Load the pretrained Caffe net in TEST phase; only its weight blobs are read
# below (no forward pass is run in this notebook).
caffe_net = caffe.Net(MCB_PROTOTXT_PATH, MCB_CAFFEMODEL_PATH, caffe.TEST)

# Convert each layer's parameters from Caffe's memory layout to the layout
# expected by the TensorFlow variables created in vqa_mcb_model.
embed_W, embed_B = convert_embed_param(caffe_net.params['embed_ba'])
att_conv1_W, att_conv1_B = convert_conv_param(caffe_net.params['att_conv1'])
att_conv2_W, att_conv2_B = convert_conv_param(caffe_net.params['att_conv2'])
prediction_W, prediction_B = convert_fc_param(caffe_net.params['prediction'])
lstm1_W, lstm1_B = convert_lstm_param(caffe_net.params['lstm1'])
lstm2_W, lstm2_B = convert_lstm_param(caffe_net.params['lstm2'])

In [5]:
# Model hyperparameters — these must match the pretrained Caffe model loaded
# above, since the converted weights are assigned into variables of these sizes.
max_time = 20        # maximum question length, in tokens
batch_size = 1
glove_dim = 300      # GloVe word-vector dimensionality
feat_h, feat_w, img_feat_dim = 14, 14, 2048   # image feature map (H, W, C)
num_vocab = 21025    # presumably the question-word vocabulary size — confirm in vqa_mcb_model
lstm_output_dim = 1024
lstm_layers = 2
embed_dim = 300
cbp0_dim= 16000      # compact bilinear pooling output dimensions
cbp1_dim = 16000
num_classes = 3000   # presumably the answer vocabulary size — confirm in vqa_mcb_model

# Question inputs are time-major: (max_time, batch_size, ...).
word_indices = tf.placeholder(tf.int32, [max_time, batch_size])
glove_vector = tf.placeholder(tf.float32, [max_time, batch_size, glove_dim])
seq_length = tf.placeholder(tf.int32, [batch_size])
img_feature = tf.placeholder(tf.float32, [batch_size, feat_h, feat_w, img_feat_dim])
# Build the TF graph. Only the answer prediction and the first attention
# softmax are kept here; the third output is unused.
prediction, att_softmax0, _ = vqa_mcb_model(word_indices, glove_vector,
    seq_length, img_feature, batch_size, num_vocab, embed_dim, glove_dim, max_time,
    lstm_output_dim, lstm_layers, feat_h, feat_w, img_feat_dim, cbp0_dim, cbp1_dim,
    num_classes)

In [6]:
# Build one assign op per TF variable, copying in the converted Caffe weights.
# The variable paths follow the scopes created inside vqa_mcb_model
# (TF 1.x-era BasicLSTMCell / MultiRNNCell naming).
init_ops = []
# reuse=True: the variables were already created by the vqa_mcb_model call above,
# so tf.get_variable inside assign_var looks them up instead of creating them.
with tf.variable_scope('vqa_mcb', reuse=True):
    init_ops.append(assign_var('embedding/weights', embed_W))
    init_ops.append(assign_var('embedding/biases', embed_B))
    init_ops.append(assign_var('lstm12/multi_rnn_cell/cell_0/basic_lstm_cell/weights', lstm1_W))
    init_ops.append(assign_var('lstm12/multi_rnn_cell/cell_0/basic_lstm_cell/biases', lstm1_B))
    init_ops.append(assign_var('lstm12/multi_rnn_cell/cell_1/basic_lstm_cell/weights', lstm2_W))
    init_ops.append(assign_var('lstm12/multi_rnn_cell/cell_1/basic_lstm_cell/biases', lstm2_B))
    init_ops.append(assign_var('att_conv1/weights', att_conv1_W))
    init_ops.append(assign_var('att_conv1/biases', att_conv1_B))
    init_ops.append(assign_var('att_conv2/weights', att_conv2_W))
    init_ops.append(assign_var('att_conv2/biases', att_conv2_B))
    init_ops.append(assign_var('prediction/weights', prediction_W))
    init_ops.append(assign_var('prediction/biases', prediction_B))

In [7]:
# NOTE(review): write_version=1 forces the deprecated V1 single-file checkpoint
# format (hence the warning printed below). Presumably kept so downstream code
# can treat SAVE_MODEL as one file on disk — if nothing depends on that, switch
# to the default V2 format (tf.train.SaverDef.V2); Saver.restore reads both.
saver = tf.train.Saver(write_version=1)

In [8]:
# Run all assign ops in a single step, then snapshot the variables to disk.
sess.run(tf.group(*init_ops))
# write_meta_graph=False: only variable values are saved; the graph itself is
# rebuilt from code (vqa_mcb_model) when the checkpoint is loaded.
saver.save(sess, SAVE_MODEL, write_meta_graph=False)


WARNING:tensorflow:*******************************************************
WARNING:tensorflow:TensorFlow's V1 checkpoint format has been deprecated.
WARNING:tensorflow:Consider switching to the more efficient V2 format:
WARNING:tensorflow:   `tf.train.Saver(write_version=tf.train.SaverDef.V2)`
WARNING:tensorflow:now on by default.
WARNING:tensorflow:*******************************************************
Out[8]:
'./tf_vqa_data/_iter_190000.tfmodel'