In [1]:
"""
---------- Dirl Multi GPU template ----------
There are two methods to define the inputs of the model:
1. Input by queue
2. Input by data generator
You can choose whichever method meets your demand;
we have already commented out two blocks for each method
You can design your own codes below the quotes ''' '''
For example :
''' define your optimizer '''
opt = tf.train.AdamOptimizer(0.0001, beta1=0.5, beta2=0.999)
Author: Shin-Yi Ding
"""
Out[1]:
In [2]:
# --- standard library / third-party packages ---
import time
import os
import numpy as np
import tensorflow as tf
# --- project-local packages: model definition and input pipeline ---
import deepwarp
import load_dataset
# Parse the predefined run parameters (implemented with argparse in config.py).
from config import get_config
conf,_ = get_config()  # conf: namespace of all run options; unparsed remainder is ignored
TOWER_NAME = 'tower'   # name-scope prefix for each per-GPU loss tower
In [3]:
''' set name (for save checkpoints) '''
# Run identifier embedded in checkpoint filenames (here: optimizer_lr_version).
ckp_fn = 'Adam_0.001_v1'
In [4]:
''' set visible gpu IDs '''
# conf.gpus is a comma-separated string of device IDs, e.g. "1,2".
gpus = conf.gpus.split(',')
n_gpus = len(gpus)  # number of loss towers to build
# Restrict TensorFlow's device visibility to exactly the requested GPUs.
# (Joining the split IDs back with ',' reproduces conf.gpus verbatim.)
os.environ["CUDA_VISIBLE_DEVICES"] = conf.gpus
In [5]:
''' define input setting '''
# case dependent
validation_portion = 0.05   # fraction of subject directories held out for validation
conf.eye = "L"              # which eye's crops to train on
data_dir = conf.dataset

# Collect the subject sub-directories of the dataset root.
# os.listdir() returns entries in arbitrary, platform-dependent order, so we
# sort them to make the train/validation split deterministic across runs.
dirs = np.asarray(sorted(d for d in os.listdir(data_dir)
                         if os.path.isdir(os.path.join(data_dir, d))))
n_val = int(dirs.shape[0] * validation_portion)
n_train = dirs.shape[0] - n_val
training_dirs = dirs[:n_train]
validation_dirs = dirs[n_train:]
# Backward-compatible alias: the original (misspelled) name is kept in case
# other cells or scripts still reference it.
valiation_dirs = validation_dirs
In [6]:
# define by cifar-10 example
# you don't have to change this function
def average_gradients(tower_grads):
    """Average the gradients of shared variables across all GPU towers.

    Note that this function provides a synchronization point across towers.

    Args:
        tower_grads: list with one entry per tower, each a list of
            (gradient, variable) tuples as produced by
            Optimizer.compute_gradients().

    Returns:
        A single list of (gradient, variable) pairs in which every gradient
        is the mean of that variable's gradients over all towers.
    """
    averaged = []
    # zip(*tower_grads) regroups entries by variable, so each grad_and_vars is
    # ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN)).
    for grad_and_vars in zip(*tower_grads):
        # Stack the per-tower gradients along a new leading 'tower' axis
        # (equivalent to expand_dims + concat) and average it away.
        stacked = tf.stack([g for g, _ in grad_and_vars], axis=0)
        mean_grad = tf.reduce_mean(stacked, 0)
        # Variables are shared across towers, so any tower's pointer is the
        # same variable; return the first tower's.
        shared_var = grad_and_vars[0][1]
        averaged.append((mean_grad, shared_var))
    return averaged
In [7]:
# define the inputs and outputs of the inference and loss function here
# the inference model and loss function are defined in the *deepwarp.py*
def tower_loss(scope, input_img, input_fp, input_ang, img_, phase_train):
    """Build the forward pass and total loss for a single GPU tower.

    Args:
        scope: unique prefix string identifying the tower, e.g. 'tower_0'.
        input_img: batch of input images.
        input_fp: batch of feature-point maps.
        input_ang: batch of target angles.
        img_: batch of ground-truth images.
        phase_train: bool placeholder toggling batch-norm train/inference mode.

    Returns:
        Tensor of shape [] containing the total loss for a batch of data.
    """
    # Forward pass through the model defined in deepwarp.py.
    prediction = deepwarp.inference(input_img, input_fp, input_ang,
                                    phase_train, conf)
    # NOTE(review): deepwarp.loss() presumably registers its terms in the
    # 'losses' collection, which is gathered below — confirm in deepwarp.py.
    with tf.name_scope('l2_loss'):
        loss = deepwarp.loss(prediction, img_)
    # Sum only the loss terms recorded under this tower's scope.
    tower_losses = tf.get_collection('losses', scope)
    return tf.add_n(tower_losses, name='total_loss')
In [ ]:
# the main function of training process, you have to design an input method here
def train():
    """Build the multi-GPU training graph and run the training loop.

    Variables and gradient averaging live on the CPU; each visible GPU builds
    one loss tower. Gradients are averaged across towers (synchronous data
    parallelism) and applied once per step. Checkpoints are written every
    1000 steps under "checkpoints/".
    """
    # Place variables and the averaging ops on the CPU.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter shared by all towers; incremented by apply_gradients.
        global_step = tf.get_variable('global_step',
                                      [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        # Boolean placeholder switching batch normalization between
        # training and inference statistics.
        with tf.name_scope('phase_train'):
            phase_train = tf.placeholder(tf.bool, name='phase_train')
        ''' there are two methods to define the inputs of model '''
        # 1. input by queue
        coord = tf.train.Coordinator()
        # define your queue in *load_dataset.py
        with tf.name_scope('create_inputs'):
            reader = load_dataset.DataGenerator(coord,
                                                pack_size=conf.batch_size*n_gpus,
                                                buffer_ratio=3,
                                                data_dir=data_dir,
                                                input_dirs=training_dirs,
                                                eye=conf.eye)
        # 2. input by data generator (feed_dict alternative)
        # define your generator in *load_dataset.py
        # with tf.name_scope('create_inputs'):
        #     reader = load_dataset.load_data(data_dir, training_dirs, conf.batch_size*n_gpus, conf.eye)
        # define the placeholder for inputs
        # with tf.name_scope('inputs'):
        #     input_img = tf.placeholder(tf.float32, [None, conf.height, conf.width, conf.channel], name="input_img")
        #     input_fp = tf.placeholder(tf.float32, [None, conf.height, conf.width, conf.ef_dim], name="input_fp")
        #     input_ang = tf.placeholder(tf.float32, [None, conf.agl_dim], name="input_ang")
        #     img_ = tf.placeholder(tf.float32, [None, conf.height, conf.width, conf.channel], name="Ground_Truth")
        ''' define the optimizer '''
        opt = tf.train.AdamOptimizer(conf.lr, beta1=0.5, beta2=0.999)
        # One gradient list per tower; averaged below.
        tower_grads = []
        # Build one loss tower per GPU, sharing all variables.
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(n_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                        ''' there are two methods to feed the training inputs '''
                        # 1. calculate loss (feed input by queue):
                        # each tower dequeues its own batch.
                        input_batch = reader.dequeue(conf.batch_size)
                        loss = tower_loss(scope,
                                          input_batch[0],
                                          input_batch[1],
                                          input_batch[2],
                                          input_batch[3],
                                          phase_train)
                        # 2. calculate loss (feed input by feed_dict)
                        # loss = tower_loss(scope,
                        #                   input_img[i*conf.batch_size:(i+1)*conf.batch_size],
                        #                   input_fp[i*conf.batch_size:(i+1)*conf.batch_size],
                        #                   input_ang[i*conf.batch_size:(i+1)*conf.batch_size],
                        #                   img_[i*conf.batch_size:(i+1)*conf.batch_size],
                        #                   phase_train)
                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()
                        # Gradients for this tower's batch.
                        grads = opt.compute_gradients(loss)
                        tower_grads.append(grads)
        # Synchronous update: average per-tower gradients, apply once per step.
        grads = average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        # FIX: create the Saver once, outside the training loop. The original
        # constructed a new tf.train.Saver every 1000 steps, which adds
        # duplicate save/restore ops to the graph at every checkpoint.
        saver = tf.train.Saver()
        # allow_soft_placement lets ops without a GPU kernel fall back to CPU.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        ''' start queue runners (Only needed when you input the dataset with queue) '''
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        reader.start_threads(sess, n_threads=2)
        init = tf.global_variables_initializer()
        sess.run(init)
        for s in range(conf.steps):
            start_time = time.time()
            ''' define your sess.run (feed input by queue) '''
            # 1. define by queue
            _, loss_train = sess.run([apply_gradient_op, loss],
                                     feed_dict={phase_train: True})
            # 2. define by feed_dict
            # inputs_batch = next(reader)
            # _, loss_train = sess.run([apply_gradient_op, loss],
            #                          feed_dict={phase_train: True,
            #                                     input_img: inputs_batch[0],
            #                                     input_fp: inputs_batch[1],
            #                                     input_ang: inputs_batch[2],
            #                                     img_: inputs_batch[3]})
            # Every 100 steps, report the loss and the duration of this
            # single step (not the average over 100 steps).
            if (s % 100 == 0):
                duration = time.time() - start_time
                print('Step %d, loss = %.4f (%.2f s)' % (s, np.mean(loss_train), duration))
            # save checkpoint
            if s % 1000 == 0:
                saver.save(sess, "checkpoints/"+ ckp_fn +"_"+conf.dataset+'_'+conf.eye, global_step)
In [ ]:
def main(argv=None):
    """Entry point invoked by tf.app.run(); `argv` is ignored here
    (parameters are parsed via argparse in config.py)."""
    train()
if __name__ == '__main__':
    tf.app.run()