In [1]:
"""
---------- Dirl Multi-GPU template ----------
There are two methods to define the inputs of the model:
1. Input by queue
2. Input by data generator
Choose whichever method meets your needs;
a code block for each method is provided below, with the unused one commented out.


You can add your own code below the ''' ''' markers.

For example:

''' define your optimizer '''
opt = tf.train.AdamOptimizer(0.0001, beta1=0.5, beta2=0.999)

Author: Shin-Yi Ding
"""



In [2]:
# system package
import time
import os
import numpy as np
import tensorflow as tf

# defined package
import deepwarp
import load_dataset

# get the predefined parameters, parsed with argparse
from config import get_config
conf,_ = get_config()
TOWER_NAME = 'tower'


Namespace(agl_dim=2, batch_size=128, channel=3, dataset='test_set', ef_dim=14, encoded_agl_dim=16, eye='None', gpus='0', height=41, is_cfw_only=False, lr=0.0001, steps=1250000, width=51)
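
For reference, `get_config` is assumed to be a thin argparse wrapper along these lines (a sketch matching the Namespace printed above, not the actual config.py):

# sketch of config.py (assumed interface, not the actual file)
import argparse

def get_config():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='test_set')
    parser.add_argument('--gpus', type=str, default='0')           # e.g. "1,2"
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--steps', type=int, default=1250000)
    parser.add_argument('--height', type=int, default=41)
    parser.add_argument('--width', type=int, default=51)
    parser.add_argument('--channel', type=int, default=3)
    parser.add_argument('--ef_dim', type=int, default=14)          # feature-point channels
    parser.add_argument('--agl_dim', type=int, default=2)          # gaze-angle dimensions
    parser.add_argument('--encoded_agl_dim', type=int, default=16)
    parser.add_argument('--eye', type=str, default='None')
    parser.add_argument('--is_cfw_only', action='store_true')
    # parse_known_args returns (namespace, leftover_argv), matching `conf, _ = get_config()`
    return parser.parse_known_args()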

In [3]:
''' set the run name (used in checkpoint filenames) '''
ckp_fn = 'Adam_0.001_v1'

In [4]:
''' set visible gpu IDs '''
# conf.gpus is a comma-separated string of device IDs, e.g. "1,2"
gpus = conf.gpus.split(',')
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpus)
n_gpus = len(gpus)
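
# example: with conf.gpus == "1,2"
#   gpus -> ['1', '2'], CUDA_VISIBLE_DEVICES -> "1,2", n_gpus -> 2
# note: after masking, TensorFlow renumbers the visible devices as /gpu:0 and /gpu:1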

In [5]:
''' define input setting '''
# case dependent
validation_portion = 0.05
conf.eye = "L"
data_dir = conf.dataset
dirs = np.asarray([d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))])

n_val = int(dirs.shape[0] * validation_portion)
training_dirs = dirs[:dirs.shape[0] - n_val]
validation_dirs = dirs[dirs.shape[0] - n_val:]
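
# worked example: with 100 subject directories and validation_portion = 0.05,
# n_val = int(100 * 0.05) = 5, so the first 95 directories become training_dirs
# and the last 5 become validation_dirs; note that os.listdir order is
# arbitrary, so sort or shuffle first if you need a reproducible split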

In [6]:
# adapted from the TensorFlow CIFAR-10 multi-GPU example
# you should not need to change this function
def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.
    Note that this function provides a synchronization point across all towers.
    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over individual gradients. The inner list is over the gradient
        calculation for each tower.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been averaged
       across all towers.
    """
    average_grads = []
    
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
    
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads
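
# optional sanity check (hypothetical values, safe to delete): averaging one
# shared variable's gradients across two towers should give the element-wise mean
# with tf.Graph().as_default(), tf.Session() as _sess:
#     _v = tf.Variable([0.0, 0.0])
#     _towers = [[(tf.constant([1.0, 2.0]), _v)], [(tf.constant([3.0, 4.0]), _v)]]
#     print(_sess.run(average_gradients(_towers)[0][0]))  # -> [2. 3.]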

In [7]:
# define the inputs and outputs of the inference and loss functions here
# the inference model and loss function are defined in *deepwarp.py*
def tower_loss(scope, input_img, input_fp, input_ang, img_, phase_train):
    """
    Args:
      scope: unique prefix string identifying the tower, e.g. 'tower_0'
      input_img, input_fp, input_ang: batch inputs (image, feature points, gaze angle)
      img_: ground-truth image batch
      phase_train: bool placeholder that toggles batch normalization
    Returns:
      Tensor of shape [] containing the total loss for a batch of data
    """

    ''' define the inputs and outputs of your inference function '''
    img_pred = deepwarp.inference(input_img, input_fp, input_ang, phase_train, conf)
    
    ''' define the inputs and outputs of your loss function '''
    with tf.name_scope('l2_loss'):
        loss = deepwarp.loss(img_pred, img_)
   
    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    return total_loss
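
# deepwarp.loss is assumed to register its result in the 'losses' collection
# (as in the CIFAR-10 example), which is what tf.get_collection('losses', scope)
# picks up above; a minimal sketch of that contract:
# def loss(img_pred, img_gt):
#     l2 = tf.reduce_mean(tf.square(img_pred - img_gt), name='l2')
#     tf.add_to_collection('losses', l2)
#     return l2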

In [ ]:
# the main function of training process, you have to design an input method here

def train():
    # calculate in cpu
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        
        # define the counter of global step
        global_step = tf.get_variable('global_step',
                                      [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        
        # define the phase train constant (bool) for batch normalization
        with tf.name_scope('phase_train'):
            phase_train = tf.placeholder(tf.bool, name='phase_train')
  
        ''' there are two methods to define the inputs of the model '''
        # 1. input by queue
        # initialize the coordinator for the data-generator threads
        coord = tf.train.Coordinator()

        # define your queue in *load_dataset.py*
        with tf.name_scope('create_inputs'):
            reader = load_dataset.DataGenerator(coord,
                                                pack_size=conf.batch_size*n_gpus,
                                                buffer_ratio=3,
                                                data_dir=data_dir,
                                                input_dirs=training_dirs,
                                                eye=conf.eye)
        
        # 2. input by data generator
        # define your generator in *load_dataset.py
        # with tf.name_scope('create_inputs'):
        #     reader = load_dataset.load_data(data_dir, training_dirs, conf.batch_size*n_gpus, conf.eye)        
        # define the placeholders for the inputs
        # with tf.name_scope('inputs'):
        #     input_img = tf.placeholder(tf.float32, [None, conf.height, conf.width, conf.channel], name="input_img") # [None, 41, 51, 3]
        #     input_fp = tf.placeholder(tf.float32, [None, conf.height, conf.width,conf.ef_dim], name="input_fp") # [None, 41, 51, 14]
        #     input_ang = tf.placeholder(tf.float32, [None, conf.agl_dim], name="input_ang") # [None, 2]
        #     img_ = tf.placeholder(tf.float32, [None, conf.height, conf.width, conf.channel], name ="Ground_Truth")

        
        ''' define the optimizer '''
        opt = tf.train.AdamOptimizer(conf.lr, beta1=0.5, beta2=0.999)
             
        # initialize the list of tower gradients
        tower_grads = []
        
        # define the multi-GPU towers
        # you only need to adapt the input handling here
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(n_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                                                
                        ''' there are two methods to feed the training inputs '''
                        # 1. calculate loss (feed input by queue)
                        # get training batch from queue
                        input_batch = reader.dequeue(conf.batch_size)
                        
                        # calculate the loss for each tower
                        loss = tower_loss(scope,
                                          input_batch[0],
                                          input_batch[1],
                                          input_batch[2],
                                          input_batch[3],
                                          phase_train)
                        # 2. calculate loss (feed input by feed_dict)
                        # loss = tower_loss(scope,
                        #                   input_img[i*conf.batch_size:(i+1)*conf.batch_size],
                        #                   input_fp[i*conf.batch_size:(i+1)*conf.batch_size],
                        #                   input_ang[i*conf.batch_size:(i+1)*conf.batch_size],
                        #                   img_[i*conf.batch_size:(i+1)*conf.batch_size],
                        #                   phase_train)
                        
                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()
                                                
                        # Calculate the gradients for the batch of data on this tower.
                        grads = opt.compute_gradients(loss)
                        
                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)
        
        # calculate mean gradients of towers
        grads = average_gradients(tower_grads)
               
        # apply gradients
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        
        # define sess
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))

        # initialize all variables before launching any input threads
        init = tf.global_variables_initializer()
        sess.run(init)

        ''' start queue runners (only needed when you feed the dataset with a queue) '''
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        reader.start_threads(sess, n_threads=2)

        # create the saver once, outside the training loop
        saver = tf.train.Saver()

        for s in range(conf.steps):
            start_time = time.time()
                        
            ''' define your sess.run call '''
            # 1. feed input by queue
            _, loss_train = sess.run([apply_gradient_op, loss], feed_dict={phase_train: True})

            # 2. feed input by feed_dict
            # inputs_batch = next(reader)
            # _, loss_train = sess.run([apply_gradient_op, loss], feed_dict={phase_train: True,
            #                                                                input_img: inputs_batch[0],
            #                                                                input_fp:inputs_batch[1],
            #                                                                input_ang:inputs_batch[2],
            #                                                                img_:inputs_batch[3]})
            
            # print the loss and the time cost of the current step
            if s % 100 == 0:
                duration = time.time() - start_time
                print('Step %d, loss = %.4f (%.2f s)' % (s, np.mean(loss_train), duration))
                
            # save a checkpoint every 1000 steps
            if s % 1000 == 0:
                saver.save(sess, "checkpoints/" + ckp_fn + "_" + conf.dataset + '_' + conf.eye, global_step)
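
# the queue-based reader used above is assumed to expose roughly this interface
# (a sketch, not the actual load_dataset.py): a FIFOQueue fed by Python threads
# class DataGenerator:
#     def __init__(self, coord, pack_size, buffer_ratio, data_dir, input_dirs, eye):
#         ...  # builds a tf.FIFOQueue sized roughly pack_size * buffer_ratio
#     def dequeue(self, batch_size):
#         return self.queue.dequeue_many(batch_size)  # (input_img, input_fp, input_ang, img_)
#     def start_threads(self, sess, n_threads=2):
#         ...  # spawns daemon threads that keep enqueueing packed examples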

In [ ]:
def main(argv=None):
    train()

if __name__ == '__main__':
    tf.app.run()


Step 0, loss = 0.3101 (2.21 s)
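
To launch the same training run from the shell instead of the notebook (the script name is hypothetical; the flag names follow the Namespace printed above):

python train_multi_gpu.py --gpus "0,1" --dataset test_set --lr 0.0001 --batch_size 128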