In [77]:
# TensorFlow Model !
import os
import shutil
import numpy as np
import tensorflow as tf
tf.reset_default_graph()
from cell import ConvLSTMCell
import sys
module_path = os.path.join("/home/pratik/work/dl/deepvideos/model/../")
if module_path not in sys.path:
    sys.path.append(module_path)
from datasets.batch_generator import datasets
slim = tf.contrib.slim
from tensorflow.python.ops import init_ops
from tensorflow.contrib.layers.python.layers import regularizers
trunc_normal = lambda stddev: init_ops.truncated_normal_initializer(0.0, stddev)
l2_val = 0.00005
In [2]:
# For looped RNN
batch_size = 4
timesteps = 4
conv_data_timesteps = timesteps * 2
shape = [64, 64] # Image shape
H, W, C = 64, 64, 3
kernel = [5, 5]
channels = 3
filters = [128, 128] # 2 stacked conv lstm filters
In [3]:
inp = tf.placeholder(tf.float32,(batch_size, conv_data_timesteps, H, W, C))
inp_to_conv_layer = tf.reshape(inp,[-1,H,W,C])
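# Quick numpy sanity check (illustrative only): folding time into the batch axis, as the
# tf.reshape above does, and unfolding it again round-trips the frames, so each frame is
# convolved independently of its position in the sequence.
dummy = np.random.rand(batch_size, conv_data_timesteps, H, W, C).astype(np.float32)
flat = dummy.reshape(-1, H, W, C)  # mirrors tf.reshape(inp, [-1, H, W, C])
assert np.array_equal(flat.reshape(batch_size, conv_data_timesteps, H, W, C), dummy)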
In [4]:
def conv_layer(inp, reuse):
    with tf.variable_scope('conv_before_lstm', reuse=reuse):
        net = slim.conv2d(inp, 128, [7, 7], scope='conv_1', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d(net, 256, [5, 5], scope='conv_2', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d(net, 512, [5, 5], scope='conv_3', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d(net, 256, [5, 5], scope='conv_4', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d(net, 128, [7, 7], scope='conv_5', weights_initializer=trunc_normal(0.01))
        print net
    return net
In [5]:
def deconv_layer(deconv_input, reuse=None):
    with tf.variable_scope('deconv_after_lstm', reuse=reuse):
        net = slim.conv2d_transpose(deconv_input, 128, [7, 7], scope='deconv_5', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d_transpose(net, 256, [5, 5], scope='deconv_4', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d_transpose(net, 512, [5, 5], scope='deconv_3', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d_transpose(net, 256, [5, 5], scope='deconv_2', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d_transpose(net, 128, [7, 7], scope='deconv_1', weights_initializer=trunc_normal(0.01))
        print net
        net = slim.conv2d_transpose(net, 3, [7, 7], activation_fn=tf.tanh, scope='deconv_0', weights_initializer=trunc_normal(0.01))
        print net
    return net
In [6]:
deinp = tf.placeholder(tf.float32,(batch_size, 8, H, W, 128))
inp_to_deconv_layer = tf.reshape(deinp,[-1,64,64,128])
In [7]:
inp_to_deconv_layer
Out[7]:
In [8]:
deconv_layer(inp_to_deconv_layer)
Out[8]:
In [9]:
output_of_conv_layer = conv_layer(inp_to_conv_layer,None)
cB, cH, cW, cC = output_of_conv_layer.get_shape().as_list()
print (cB,cH,cW,cC)
inp_time_based = tf.reshape(output_of_conv_layer, [-1,conv_data_timesteps, cH, cW, cC])
print (inp_time_based)
encoder_input = tf.slice(inp_time_based,[0,0,0,0,0],[batch_size,timesteps,cH,cW,cC])
print (encoder_input)
decoder_input = tf.slice(inp_time_based,[0,timesteps,0,0,0],[batch_size,timesteps,cH,cW,cC])
print (decoder_input)
In [28]:
kernels = [[3,3],[5, 5]]
H, W = 64, 64
with tf.variable_scope('enc_conv_lstm_model'):
    cells = []
    for i, (each_filter, each_kernel) in enumerate(zip(filters, kernels)):
        cell = ConvLSTMCell([H, W], each_filter, each_kernel, reuse=tf.get_variable_scope().reuse)
        cells.append(cell)
    cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
In [29]:
cell
Out[29]:
In [30]:
zero_state = cell.zero_state(batch_size,dtype=tf.float32)
zero_state
Out[30]:
In [31]:
encoder_output, encoder_state = tf.nn.dynamic_rnn(cell,inputs=encoder_input,initial_state=zero_state)
In [32]:
encoder_input
Out[32]:
In [33]:
encoder_output
Out[33]:
In [34]:
encoder_state
Out[34]:
In [55]:
encoder_output.get_shape().as_list()
Out[55]:
In [ ]:
kernels = [[3,3],[5, 5]]
H, W = 64, 64
with tf.variable_scope('dec_conv_lstm_model'):
    cells = []
    for i, (each_filter, each_kernel) in enumerate(zip(filters, kernels)):
        cell = ConvLSTMCell([H, W], each_filter, each_kernel, reuse=tf.get_variable_scope().reuse)
        cells.append(cell)
    cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
In [11]:
batch_size = 16
number_of_images_to_show = 4
assert number_of_images_to_show <= batch_size
shape = [64, 64] # Image shape
H, W = shape
kernels = [[3, 3],[5, 5]]
channels = C = 3
filters = [128, 128] # 2 stacked conv lstm filters
enc_timesteps = 8 - 1
dec_timesteps = 8
timesteps = enc_timesteps + dec_timesteps
images_summary_timesteps = [0, 2, 5, 7]
# Create a placeholder for videos.
inputs = tf.placeholder(tf.float32, [batch_size, timesteps] + shape + [channels],
                        name="seq2seq_inputs")  # (batch_size, timesteps, H, W, C)
outputs_exp = tf.placeholder(tf.float32, [batch_size, dec_timesteps] + shape + [channels],
                             name="seq2seq_outputs_exp")  # (batch_size, dec_timesteps, H, W, C)
teacher_force_sampling = tf.placeholder(tf.float32, [dec_timesteps], name="teacher_force_sampling")
prob_select_teacher = tf.placeholder(tf.float32, shape=(), name="prob_select_teacher")
# model output
model_output = None
# loss
l2_loss = None
# optimizer
optimizer = None
reuse_conv = None
reuse_deconv = None
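# How the scheduled-sampling placeholders above might be fed (a sketch; the uniform draws
# and the fixed probability below are assumptions, the notebook does not pin a scheme down).
# At decoder step i the ground-truth frame is used when
# prob_select_teacher >= teacher_force_sampling[i].
sample_per_step = np.random.uniform(0.0, 1.0, size=dec_timesteps)
feed_sketch = {teacher_force_sampling: sample_per_step,
               prob_select_teacher: 0.9}  # assumed constant probability of teacher forcing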
In [23]:
def conv_layer(conv_input):
    # conv before lstm; reuse=True expects the conv_before_lstm variables to already exist in the graph
    with tf.variable_scope('conv_before_lstm', reuse=True):
        net = slim.conv2d(conv_input, 128, [7, 7], scope='conv_1', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d(net, 256, [5, 5], scope='conv_2', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d(net, 512, [5, 5], scope='conv_3', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d(net, 256, [5, 5], scope='conv_4', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d(net, 128, [7, 7], scope='conv_5', weights_initializer=trunc_normal(0.01))
    reuse_conv = True  # note: this only binds a local name; the module-level reuse_conv flag stays None
    return net
In [24]:
def deconv_layer(deconv_input):
    # reuse=True expects the deconv_after_lstm variables to already exist in the graph
    with tf.variable_scope('deconv_after_lstm', reuse=True):
        net = slim.conv2d_transpose(deconv_input, 128, [7, 7], scope='deconv_5', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d_transpose(net, 256, [5, 5], scope='deconv_4', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d_transpose(net, 512, [5, 5], scope='deconv_3', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d_transpose(net, 256, [5, 5], scope='deconv_2', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d_transpose(net, 128, [7, 7], scope='deconv_1', weights_initializer=trunc_normal(0.01))
        net = slim.conv2d_transpose(net, 3, [7, 7], activation_fn=tf.tanh, scope='deconv_0', weights_initializer=trunc_normal(0.01))
    reuse_deconv = True  # note: this only binds a local name; the module-level reuse_deconv flag stays None
    return net
In [14]:
def enc_lstm_layer(H, W):
    with tf.variable_scope('enc_lstm_model'):
        cells = []
        for i, (each_filter, each_kernel) in enumerate(zip(filters, kernels)):
            cell = ConvLSTMCell([H, W], each_filter, each_kernel, reuse=tf.get_variable_scope().reuse)
            cells.append(cell)
        cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
    return cell

def dec_lstm_layer(H, W):
    with tf.variable_scope('dec_lstm_model'):
        cells = []
        for i, (each_filter, each_kernel) in enumerate(zip(filters, kernels)):
            cell = ConvLSTMCell([H, W], each_filter, each_kernel, reuse=tf.get_variable_scope().reuse)
            cells.append(cell)
        cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
    return cell
In [15]:
H, W, C = shape[0], shape[1], channels
input_conv_layer = tf.reshape(inputs, [-1,H,W,C])
output_conv_layer = conv_layer(input_conv_layer)
_, H, W, C = output_conv_layer.get_shape().as_list()
lstm_shaped_input = tf.reshape(output_conv_layer, [-1,timesteps,H,W,C])
In [16]:
input_conv_layer
Out[16]:
In [17]:
output_conv_layer
Out[17]:
In [18]:
lstm_shaped_input
Out[18]:
In [19]:
# slice first part to feed to encoder and second to decoder
encoder_inp = tf.slice(lstm_shaped_input,[0,0,0,0,0],[batch_size,enc_timesteps,H,W,C])
decoder_inp = tf.slice(lstm_shaped_input,[0,enc_timesteps,0,0,0],[batch_size,dec_timesteps,H,W,C])
# dynamic rnn as encoder
encoder_cell = enc_lstm_layer(H,W)
zero_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)
encoder_output, encoder_final_state = tf.nn.dynamic_rnn(encoder_cell,inputs=encoder_inp,initial_state=zero_state)
In [20]:
encoder_final_state
Out[20]:
In [21]:
encoder_output
Out[21]:
In [22]:
# decoder cell
decoder_cell = dec_lstm_layer(H,W)
state = encoder_final_state
input_for_first_time = tf.slice(decoder_inp, [0,0,0,0,0], [batch_size,1,H,W,C])
input_for_first_time = tf.squeeze(input_for_first_time,[1])
input_deconv, state = decoder_cell(input_for_first_time,state)
predications = []
deconv_output = deconv_layer(input_deconv)
predications.append(deconv_output)
In [25]:
input_for_first_time
Out[25]:
In [26]:
input_deconv
Out[26]:
In [27]:
deconv_output
Out[27]:
In [28]:
for i in range(1, dec_timesteps):
    select_sampling = tf.greater_equal(prob_select_teacher, tf.gather(teacher_force_sampling, i))
    # Conv on the actual frame at timestep i
    # (note: decoder_inp holds 128-channel conv features, so this slice keeps only their
    # first 3 channels; slicing the raw frame from `inputs` was probably the intent)
    ith_frame = tf.slice(decoder_inp, [0, i, 0, 0, 0], [batch_size, 1, 64, 64, 3])
    ith_frame = tf.squeeze(ith_frame, [1])
    conv_output = conv_layer(ith_frame)
    branch_1 = decoder_cell(conv_output, state)
    # Conv on the frame predicted at timestep i-1
    conv_output = conv_layer(deconv_output)
    branch_2 = decoder_cell(conv_output, state)
    deconv_input, state = tf.cond(select_sampling, lambda: branch_1, lambda: branch_2)
    deconv_output = deconv_layer(deconv_input)
    predications.append(deconv_output)
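# One way to anneal prob_select_teacher over training is an inverse-sigmoid decay
# (a common scheduled-sampling schedule; the constant k below is an assumption,
# nothing in this notebook fixes it).
def teacher_prob_at(step, k=1000.0):
    return k / (k + np.exp(step / k))
print(teacher_prob_at(0))      # ~1.0 -> almost always teacher-forced early on
print(teacher_prob_at(20000))  # ~0.0 -> decoder mostly feeds on its own predictions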
In [29]:
predications
Out[29]:
In [30]:
model_output = tf.transpose(tf.stack(predications),perm=[1,0,2,3,4])
In [31]:
model_output
Out[31]:
In [32]:
outputs_exp
Out[32]:
In [134]:
def l2_loss(generated_frames, expected_frames):
    losses = []
    for each_scale_gen_frames, each_scale_exp_frames in zip(generated_frames, expected_frames):
        losses.append(tf.nn.l2_loss(tf.subtract(each_scale_gen_frames, each_scale_exp_frames)))
    loss = tf.reduce_mean(tf.stack(losses))
    return loss

def gdl_loss(generated_frames, expected_frames, alpha=2):
    """
    Gradient difference loss: penalizes the mismatch between the horizontal and
    vertical image gradients of the generated frames and of the expected frames.
    """
    scale_losses = []
    for i in xrange(len(generated_frames)):
        # create filters [-1, 1] and [[1],[-1]] for diffing to the left and down respectively.
        pos = tf.constant(np.identity(3), dtype=tf.float32)
        neg = -1 * pos
        filter_x = tf.expand_dims(tf.stack([neg, pos]), 0)  # [-1, 1]
        filter_y = tf.stack([tf.expand_dims(pos, 0), tf.expand_dims(neg, 0)])  # [[1],[-1]]
        strides = [1, 1, 1, 1]  # stride of (1, 1)
        padding = 'SAME'
        gen_dx = tf.abs(tf.nn.conv2d(generated_frames[i], filter_x, strides, padding=padding))
        gen_dy = tf.abs(tf.nn.conv2d(generated_frames[i], filter_y, strides, padding=padding))
        gt_dx = tf.abs(tf.nn.conv2d(expected_frames[i], filter_x, strides, padding=padding))
        gt_dy = tf.abs(tf.nn.conv2d(expected_frames[i], filter_y, strides, padding=padding))
        grad_diff_x = tf.abs(gt_dx - gen_dx)
        grad_diff_y = tf.abs(gt_dy - gen_dy)
        scale_losses.append(tf.reduce_sum((grad_diff_x ** alpha + grad_diff_y ** alpha)))
    # condense into one tensor and avg
    return tf.reduce_mean(tf.stack(scale_losses))

def total_loss(generated_frames, expected_frames, lambda_gdl=1.0, lambda_l2=1.0):
    B, T, H, W, C = generated_frames.get_shape().as_list()
    B1, T1, H1, W1, C1 = expected_frames.get_shape().as_list()
    assert (B, T, H, W, C) == (B1, T1, H1, W1, C1), "generated and expected frames must have the same shape!"
    each_step_gen_frames = []
    each_step_exp_frames = []
    for each_i in range(T):
        input_for_gen = tf.slice(generated_frames, [0, each_i, 0, 0, 0], [B, 1, H, W, C])
        input_for_gen = tf.squeeze(input_for_gen, [1])
        each_step_gen_frames.append(input_for_gen)
        input_for_exp = tf.slice(expected_frames, [0, each_i, 0, 0, 0], [B, 1, H, W, C])
        input_for_exp = tf.squeeze(input_for_exp, [1])
        each_step_exp_frames.append(input_for_exp)
    total_loss_cal = (lambda_gdl * gdl_loss(each_step_gen_frames, each_step_exp_frames) +
                      lambda_l2 * l2_loss(each_step_gen_frames, each_step_exp_frames))
    return total_loss_cal
In [135]:
l = total_loss(model_output,outputs_exp)
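# The earlier cell leaves `optimizer = None`; a minimal (assumed) hookup is Adam on the
# combined loss. The learning rate here is an assumption, not something the notebook sets.
optimizer = tf.train.AdamOptimizer(1e-4)
train_step = optimizer.minimize(l)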
In [33]:
encoder_inp
Out[33]:
In [34]:
decoder_inp
Out[34]:
In [35]:
sess = tf.Session()
In [36]:
x = np.arange(1*15*64*64*3).reshape((1,15,64,64,3))
In [37]:
p = tf.constant(x)
In [42]:
z = tf.slice(p,[0,7,0,0,0],[1,8,64,64,3])
In [43]:
z
Out[43]:
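# Quick check: evaluating the slice with the session created above should keep
# frames 7..14 of the 15-frame dummy sequence.
print(sess.run(z).shape)  # (1, 8, 64, 64, 3)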
In [45]:
In [78]:
batch_size = 16
number_of_images_to_show = 4
assert number_of_images_to_show <= batch_size
shape = [64, 64] # Image shape
H, W = shape
kernels = [[3, 3],[5, 5]]
channels = C = 3
enc_timesteps = 4 - 1
dec_timesteps = 4
timesteps = enc_timesteps + dec_timesteps
images_summary_timesteps = [0, 1, 2, 3]
In [79]:
# Create a placeholder for videos.
inputs = tf.placeholder(tf.float32, [batch_size, timesteps] + shape + [channels],
                        name="seq2seq_inputs")  # (batch_size, timesteps, H, W, C)
outputs_exp = tf.placeholder(tf.float32, [batch_size, dec_timesteps] + shape + [channels],
                             name="seq2seq_outputs_exp")  # (batch_size, dec_timesteps, H, W, C)
In [80]:
inputs
Out[80]:
In [81]:
outputs_exp
Out[81]:
In [82]:
def conv_layer(conv_input, reuse=None):
    # conv before lstm
    with tf.variable_scope('conv_before_lstm', reuse=reuse):
        net = slim.conv2d(conv_input, 32, [3, 3], scope='conv_1', weights_initializer=trunc_normal(0.01),
                          weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
        net = slim.conv2d(net, 64, [3, 3], scope='conv_2', weights_initializer=trunc_normal(0.01),
                          weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
        net = slim.conv2d(net, 128, [3, 3], stride=2, scope='conv_3', weights_initializer=trunc_normal(0.01),
                          weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
        net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv_4', weights_initializer=trunc_normal(0.01),
                          weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
    return net

def deconv_layer(deconv_input, reuse=None):
    with tf.variable_scope('deconv_after_lstm', reuse=reuse):
        net = slim.conv2d_transpose(deconv_input, 256, [3, 3], scope='deconv_4',
                                    weights_initializer=trunc_normal(0.01),
                                    weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
        net = slim.conv2d_transpose(net, 128, [3, 3], stride=2, scope='deconv_3', weights_initializer=trunc_normal(0.01),
                                    weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
        net = slim.conv2d_transpose(net, 64, [3, 3], stride=2, scope='deconv_2',
                                    weights_initializer=trunc_normal(0.01),
                                    weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
        net = slim.conv2d_transpose(net, 32, [3, 3], scope='deconv_1',
                                    weights_initializer=trunc_normal(0.01),
                                    weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
        net = slim.conv2d_transpose(net, 3, [3, 3], activation_fn=tf.tanh, scope='deconv_0',
                                    weights_initializer=trunc_normal(0.01),
                                    weights_regularizer=regularizers.l2_regularizer(l2_val))
        print net
    return net
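# Expected spatial sizes for the stride-2 encoder above (SAME padding gives
# out = ceil(in / stride)); a quick pure-Python walk through the downsampling path.
# The matching conv2d_transpose layers (stride 2, SAME padding) multiply the size by the
# stride, so deconv_3 and deconv_2 take 16 -> 32 -> 64 back to the input resolution.
size = 64
for name, stride in [('conv_1', 1), ('conv_2', 1), ('conv_3', 2), ('conv_4', 2)]:
    size = (size + stride - 1) // stride  # ceiling division
    print('%s -> %dx%d' % (name, size, size))  # 64, 64, 32, 16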
In [83]:
B, T, H, W, C = inputs.get_shape().as_list()
print (B, T, H, W, C)
In [84]:
reshaped_inputs_for_conv = tf.reshape(inputs, [-1,H,W,C])
In [85]:
reshaped_inputs_for_conv
Out[85]:
In [86]:
conved_output = conv_layer(reshaped_inputs_for_conv)
In [87]:
def conv_lstm_encoder(H, W, filter_size, kernel, encoder_input):
    with tf.variable_scope('enc_lstm_model'):
        encoder_cell = ConvLSTMCell([H, W], filter_size, kernel, reuse=tf.get_variable_scope().reuse)
        zero_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)
        _, encoded_state = tf.nn.dynamic_rnn(cell=encoder_cell, inputs=encoder_input, initial_state=zero_state)
    return encoded_state

def conv_lstm_decoder(H, W, filter_size, kernel, decoder_input, enc_final_state):
    with tf.variable_scope('dec_lstm_model'):
        decoder_cell = ConvLSTMCell([H, W], filter_size, kernel, reuse=tf.get_variable_scope().reuse)
        decoder_outputs, _ = tf.nn.dynamic_rnn(cell=decoder_cell, inputs=decoder_input, initial_state=enc_final_state)
    return decoder_outputs
In [88]:
_, H, W, C = conved_output.get_shape().as_list()
print (_, H, W, C)
lstm_input_reshape = tf.reshape(conved_output, [B,T,H,W,C])
print lstm_input_reshape
In [89]:
B, T, H, W, C = lstm_input_reshape.get_shape().as_list()
# split conv input into two parts
encoder_input_from_conv = tf.slice(lstm_input_reshape,[0,0,0,0,0],[B,enc_timesteps,H,W,C])
decoder_input_from_conv = tf.slice(lstm_input_reshape,[0,enc_timesteps,0,0,0],[B,dec_timesteps,H,W,C])
print encoder_input_from_conv
print decoder_input_from_conv
In [90]:
filter_size = C
kernel_size = [3,3]
encoded_state = conv_lstm_encoder(H,W,filter_size,kernel_size,encoder_input_from_conv)
print encoded_state
In [91]:
decoder_output = conv_lstm_decoder(H,W,filter_size,kernel_size,decoder_input_from_conv,encoded_state)
In [92]:
decoder_output
Out[92]:
In [93]:
# pass through deconv layer
B, T, H, W, C = decoder_output.get_shape().as_list()
deconv_layer_input = tf.reshape(decoder_output,[-1,H, W, C])
predication = deconv_layer(deconv_layer_input)
In [98]:
print (B, T, H, W, C)
_, H, W, C = predication.get_shape().as_list()
print (B, T, H, W, C)
In [99]:
model_output = tf.reshape(predication,[B,T,H,W,C])
print model_output
In [100]:
def l2_loss(generated_frames, expected_frames):
    losses = []
    for each_scale_gen_frames, each_scale_exp_frames in zip(generated_frames, expected_frames):
        losses.append(tf.nn.l2_loss(tf.subtract(each_scale_gen_frames, each_scale_exp_frames)))
    loss = tf.reduce_mean(tf.stack(losses))
    return loss

def gdl_loss(generated_frames, expected_frames, alpha=2):
    """
    Gradient difference loss: penalizes the mismatch between the horizontal and
    vertical image gradients of the generated frames and of the expected frames.
    """
    scale_losses = []
    for i in xrange(len(generated_frames)):
        # create filters [-1, 1] and [[1],[-1]] for diffing to the left and down respectively.
        pos = tf.constant(np.identity(3), dtype=tf.float32)
        neg = -1 * pos
        filter_x = tf.expand_dims(tf.stack([neg, pos]), 0)  # [-1, 1]
        filter_y = tf.stack([tf.expand_dims(pos, 0), tf.expand_dims(neg, 0)])  # [[1],[-1]]
        strides = [1, 1, 1, 1]  # stride of (1, 1)
        padding = 'SAME'
        gen_dx = tf.abs(tf.nn.conv2d(generated_frames[i], filter_x, strides, padding=padding))
        gen_dy = tf.abs(tf.nn.conv2d(generated_frames[i], filter_y, strides, padding=padding))
        gt_dx = tf.abs(tf.nn.conv2d(expected_frames[i], filter_x, strides, padding=padding))
        gt_dy = tf.abs(tf.nn.conv2d(expected_frames[i], filter_y, strides, padding=padding))
        grad_diff_x = tf.abs(gt_dx - gen_dx)
        grad_diff_y = tf.abs(gt_dy - gen_dy)
        scale_losses.append(tf.reduce_sum((grad_diff_x ** alpha + grad_diff_y ** alpha)))
    # condense into one tensor and avg
    return tf.reduce_mean(tf.stack(scale_losses))

def total_loss(generated_frames, expected_frames, lambda_gdl=1.0, lambda_l2=1.0):
    B, T, H, W, C = generated_frames.get_shape().as_list()
    B1, T1, H1, W1, C1 = expected_frames.get_shape().as_list()
    assert (B, T, H, W, C) == (B1, T1, H1, W1, C1), "generated and expected frames must have the same shape!"
    each_step_gen_frames = []
    each_step_exp_frames = []
    for each_i in range(T):
        input_for_gen = tf.slice(generated_frames, [0, each_i, 0, 0, 0], [B, 1, H, W, C])
        input_for_gen = tf.squeeze(input_for_gen, [1])
        each_step_gen_frames.append(input_for_gen)
        input_for_exp = tf.slice(expected_frames, [0, each_i, 0, 0, 0], [B, 1, H, W, C])
        input_for_exp = tf.squeeze(input_for_exp, [1])
        each_step_exp_frames.append(input_for_exp)
    total_loss_cal = (lambda_gdl * gdl_loss(each_step_gen_frames, each_step_exp_frames) +
                      lambda_l2 * l2_loss(each_step_gen_frames, each_step_exp_frames))
    return total_loss_cal
In [101]:
loss = total_loss(model_output,outputs_exp)
In [102]:
print loss
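# A rough single training step for this model (the learning rate and the random batch
# below are assumptions; real batches would presumably come from the `datasets` import
# at the top of the notebook).
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)
with tf.Session() as train_sess:
    train_sess.run(tf.global_variables_initializer())
    x_batch = np.random.rand(batch_size, timesteps, shape[0], shape[1], channels).astype(np.float32)
    y_batch = np.random.rand(batch_size, dec_timesteps, shape[0], shape[1], channels).astype(np.float32)
    _, step_loss = train_sess.run([train_op, loss], feed_dict={inputs: x_batch, outputs_exp: y_batch})
    print(step_loss)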
In [103]:
a = range(8)
print a[:7]
print a[-4:]
In [ ]: