In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os
import sys
from six.moves import cPickle as pickle
%matplotlib inline

In [2]:
pickle_file = 'mini_train.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    mini_X = save['data']
    mini_outcome = save['outcome']
    del save  # hint to help gc free up memory

In [3]:
batch_size=40
num_unrollings=5

class BatchGenerator(object):
    
    def __init__(self, x_image, y_labels, batch_size, num_unrollings):
        self._x_image = x_image
        self._y_labels = y_labels
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        self._y_digits = self._extract_digits()
        
        
    def _extract_digits(self):
        end_digit = 10.0
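        # class 10 acts as an <END>/padding symbol for label sequences shorter
        # than num_unrollings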
        
        digits = np.ndarray(shape=(
                self._num_unrollings, len(self._y_labels), int(end_digit + 1)), 
                            dtype=np.float32)
        
        for i in range(self._num_unrollings):
            digit_coding = np.asarray( [x[i] if len(x)>i else end_digit 
                                        for x in self._y_labels])
            digit_coding = (
                np.arange(end_digit+1) == digit_coding[:,None]).astype(np.float32)
            digits[i,:,:] = digit_coding
        
        return digits
    
    def next_batch(self):
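        # sample batch_size image indices uniformly at random (with replacement)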
        idx = np.random.choice(self._x_image.shape[0],self._batch_size)
        batch_x = self._x_image[idx,:,:,:]
        batch_y = self._y_digits[:,idx,:]
        
        return batch_x, batch_y

In [4]:
mini_train_batches = BatchGenerator(mini_X[:100], 
                                    mini_outcome['label'][:100],
                                    batch_size, num_unrollings)

batch_x, batch_y = mini_train_batches.next_batch()
print(batch_y.shape)
print(batch_x.shape)


(5, 40, 11)
(40, 64, 64, 3)
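
As a quick sanity check on the encoding (a minimal sketch using the batch just drawn): each time step is a one-hot row over 11 classes, with class 10 as the <END> padding, so a label can be decoded back like this:

sample = 0  # first image in the batch
decoded = [int(np.argmax(batch_y[t, sample])) for t in range(num_unrollings)]
print([d for d in decoded if d != 10])  # drop the <END> padding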

In [5]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

In [32]:
image_size = mini_X.shape[1]
num_channels = mini_X.shape[3]
CNN_num_nodes = 1024


# number of hidden units in the LSTM; this should be large enough
RNN_num_nodes = 1024


# 11 classes for each digit position: 0, 1, ..., 9, plus an ending symbol <END>
vocabulary_size = 11

graph = tf.Graph()
with graph.as_default():
    
    x_image = tf.placeholder(tf.float32, shape=(batch_size, 
                                                image_size, 
                                                image_size, num_channels))
    
    W_conv1 = weight_variable([5, 5, num_channels, 32])
    b_conv1 = bias_variable([32])

    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)
    
    
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])

    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
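
    # two 2x2 max-pools have halved the 64x64 input to 16x16,
    # so the flattened feature vector below has 16 * 16 * 64 entries per image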
    
    W_fc1 = weight_variable([16 * 16 * 64, CNN_num_nodes])
    b_fc1 = bias_variable([CNN_num_nodes])

    h_pool2_flat = tf.reshape(h_pool2, [-1, 16*16*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    
    
    # Input gate: input, previous output, and bias.
    ix = weight_variable([vocabulary_size, RNN_num_nodes])
    im = weight_variable([RNN_num_nodes, RNN_num_nodes])
    ib = bias_variable([RNN_num_nodes])

    # Forget gate: input, previous output, and bias.
    fx = weight_variable([vocabulary_size, RNN_num_nodes])
    fm = weight_variable([RNN_num_nodes, RNN_num_nodes])
    fb = bias_variable([RNN_num_nodes])

    # Memory cell: input, state and bias.                             
    cx = weight_variable([vocabulary_size, RNN_num_nodes])
    cm = weight_variable([RNN_num_nodes, RNN_num_nodes])
    cb = bias_variable([RNN_num_nodes])

    # Output gate: input, previous output, and bias.
    ox = weight_variable([vocabulary_size, RNN_num_nodes])
    om = weight_variable([RNN_num_nodes, RNN_num_nodes])
    ob = bias_variable([RNN_num_nodes])
    
    # Definition of the cell computation.
    # state is cell state, o is hidden state, i is input
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
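        # Gate equations implemented below:
        #   i_t = sigmoid(x_t @ ix + h_{t-1} @ im + ib)        (input gate)
        #   f_t = sigmoid(x_t @ fx + h_{t-1} @ fm + fb)        (forget gate)
        #   c_t = f_t * c_{t-1} + i_t * tanh(x_t @ cx + h_{t-1} @ cm + cb)
        #   o_t = sigmoid(x_t @ ox + h_{t-1} @ om + ob)        (output gate)
        #   h_t = o_t * tanh(c_t)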
        input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
        update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
        return output_gate * tf.tanh(state), state
    
    
    # placeholder for digit input and digit labels
    digits_data = []
    for _ in range(num_unrollings + 1):
        digits_data.append(
            tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    digits_inputs = digits_data[:num_unrollings]
    digits_labels = digits_data[1:]  # labels are inputs shifted by one time step.
    
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, RNN_num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, RNN_num_nodes]), trainable=False)

    # connect the CNN: the image feature vector seeds the LSTM output and cell state

    W_CNN = weight_variable([CNN_num_nodes, RNN_num_nodes])
    b_CNN = bias_variable([RNN_num_nodes])

    CNN_output = tf.matmul(h_fc1, W_CNN) + b_CNN

    output = saved_output + CNN_output
    state = saved_state + CNN_output

    # Unrolled LSTM loop.
    outputs = list()
    
    for i in digits_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)
        
    # Classifier weights and biases.
    w_fc_rnn = weight_variable([RNN_num_nodes, vocabulary_size])
    b_fc_rnn = bias_variable([vocabulary_size])

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w_fc_rnn, b_fc_rnn)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits, tf.concat(0, digits_labels)))
        
    
    # Optimizer.
    #optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)
    # learning rate decay (halved every 100 steps) and gradient clipping
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    learning_rate = tf.train.exponential_decay(starter_learning_rate, 
                                               global_step, 100, 0.5, staircase=True)
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))  # unzip the (gradient, variable) pairs
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    
    
    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # check the prediction accuracy for the 1st digit only
    correct_prediction = tf.equal(tf.argmax(
            tf.matmul(outputs[0], w_fc_rnn) + b_fc_rnn
            ,1), 
                                  tf.argmax(
            digits_labels[0]
            ,1))

    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

It seems that gradient clipping does not really help, at least in this setting. One way to check is to watch the pre-clip global gradient norm, as sketched below.
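
A minimal sketch of that check, assuming a hypothetical grad_norm = tf.global_norm(gradients) node is added inside the graph cell above, right before the tf.clip_by_global_norm call; if the norm rarely exceeds the 1.25 threshold, clipping is effectively a no-op:

# inside the training loop, fetch the extra (assumed) grad_norm node as well
_, l, lr, gn = session.run(
    [optimizer, loss, learning_rate, grad_norm], feed_dict=feed_dict)
if step % summary_frequency == 0:
    print('global gradient norm before clipping: %f' % gn)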


In [33]:
num_steps = 1000
summary_frequency = 20

with tf.Session(graph=graph) as session:

    tf.initialize_all_variables().run()
    print('Initialized')

    mean_loss = 0
    mean_accuracy = 0

    for step in range(num_steps):
        batch_x, batch_y = mini_train_batches.next_batch()
    
        feed_dict = dict()
        feed_dict[x_image] = batch_x
    
        # the first RNN input is an all-zeros vector, acting as a <GO>-style start token
        feed_dict[digits_data[0]] = np.zeros([batch_y.shape[1], batch_y.shape[2]])
    
        for i in range(num_unrollings):
            feed_dict[digits_data[i+1]] = batch_y[i]
            
        _, l, lr = session.run(
            [optimizer, loss, learning_rate], feed_dict=feed_dict)
        mean_loss += l
    
        train_accuracy = accuracy.eval(feed_dict=feed_dict)
        mean_accuracy += train_accuracy
    
        # report the averaged loss and accuracy every summary_frequency steps
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
                mean_accuracy = mean_accuracy/ summary_frequency
            
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
        
        
            print("step %d, training accuracy %g"%(step, mean_accuracy))
            mean_accuracy = 0


Initialized
Average loss at step 0: 3.531158 learning rate: 0.100000
step 0, training accuracy 0.2
Average loss at step 20: 1.294838 learning rate: 0.100000
step 20, training accuracy 0.30375
Average loss at step 40: 1.137932 learning rate: 0.100000
step 40, training accuracy 0.38875
Average loss at step 60: 1.099598 learning rate: 0.100000
step 60, training accuracy 0.355
Average loss at step 80: 1.089548 learning rate: 0.100000
step 80, training accuracy 0.39
Average loss at step 100: 1.061030 learning rate: 0.050000
step 100, training accuracy 0.425
Average loss at step 120: 0.945924 learning rate: 0.050000
step 120, training accuracy 0.5125
Average loss at step 140: 0.863832 learning rate: 0.050000
step 140, training accuracy 0.59375
Average loss at step 160: 0.823998 learning rate: 0.050000
step 160, training accuracy 0.62
Average loss at step 180: 0.774189 learning rate: 0.050000
step 180, training accuracy 0.64
Average loss at step 200: 0.737642 learning rate: 0.025000
step 200, training accuracy 0.705
Average loss at step 220: 0.645075 learning rate: 0.025000
step 220, training accuracy 0.80875
Average loss at step 240: 0.644279 learning rate: 0.025000
step 240, training accuracy 0.7975
Average loss at step 260: 0.608758 learning rate: 0.025000
step 260, training accuracy 0.8125
Average loss at step 280: 0.604276 learning rate: 0.025000
step 280, training accuracy 0.84625
Average loss at step 300: 0.575329 learning rate: 0.012500
step 300, training accuracy 0.8425
Average loss at step 320: 0.529598 learning rate: 0.012500
step 320, training accuracy 0.8825
Average loss at step 340: 0.532718 learning rate: 0.012500
step 340, training accuracy 0.87125
Average loss at step 360: 0.515809 learning rate: 0.012500
step 360, training accuracy 0.895
Average loss at step 380: 0.511643 learning rate: 0.012500
step 380, training accuracy 0.88125
Average loss at step 400: 0.498578 learning rate: 0.006250
step 400, training accuracy 0.92625
Average loss at step 420: 0.479310 learning rate: 0.006250
step 420, training accuracy 0.91
Average loss at step 440: 0.502095 learning rate: 0.006250
step 440, training accuracy 0.895
Average loss at step 460: 0.488675 learning rate: 0.006250
step 460, training accuracy 0.90375
Average loss at step 480: 0.475908 learning rate: 0.006250
step 480, training accuracy 0.915
Average loss at step 500: 0.466610 learning rate: 0.003125
step 500, training accuracy 0.925
Average loss at step 520: 0.466694 learning rate: 0.003125
step 520, training accuracy 0.925
Average loss at step 540: 0.450436 learning rate: 0.003125
step 540, training accuracy 0.9125
Average loss at step 560: 0.460300 learning rate: 0.003125
step 560, training accuracy 0.9225
Average loss at step 580: 0.443997 learning rate: 0.003125
step 580, training accuracy 0.9375
Average loss at step 600: 0.442042 learning rate: 0.001563
step 600, training accuracy 0.9225
Average loss at step 620: 0.446174 learning rate: 0.001563
step 620, training accuracy 0.92875
Average loss at step 640: 0.452038 learning rate: 0.001563
step 640, training accuracy 0.93125
Average loss at step 660: 0.444264 learning rate: 0.001563
step 660, training accuracy 0.92875
Average loss at step 680: 0.434963 learning rate: 0.001563
step 680, training accuracy 0.95125
Average loss at step 700: 0.455665 learning rate: 0.000781
step 700, training accuracy 0.93625
Average loss at step 720: 0.457733 learning rate: 0.000781
step 720, training accuracy 0.935
Average loss at step 740: 0.451910 learning rate: 0.000781
step 740, training accuracy 0.935
Average loss at step 760: 0.443841 learning rate: 0.000781
step 760, training accuracy 0.945
Average loss at step 780: 0.436770 learning rate: 0.000781
step 780, training accuracy 0.92
Average loss at step 800: 0.421478 learning rate: 0.000391
step 800, training accuracy 0.935
Average loss at step 820: 0.426666 learning rate: 0.000391
step 820, training accuracy 0.94
Average loss at step 840: 0.435431 learning rate: 0.000391
step 840, training accuracy 0.9325
Average loss at step 860: 0.437289 learning rate: 0.000391
step 860, training accuracy 0.9375
Average loss at step 880: 0.438290 learning rate: 0.000391
step 880, training accuracy 0.9325
Average loss at step 900: 0.451918 learning rate: 0.000195
step 900, training accuracy 0.93625
Average loss at step 920: 0.431810 learning rate: 0.000195
step 920, training accuracy 0.93
Average loss at step 940: 0.438990 learning rate: 0.000195
step 940, training accuracy 0.93875
Average loss at step 960: 0.444538 learning rate: 0.000195
step 960, training accuracy 0.92375
Average loss at step 980: 0.416328 learning rate: 0.000195
step 980, training accuracy 0.945
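
The accuracy above only scores the first digit. A rough sequence-level check can be done in NumPy from train_prediction (a minimal sketch; it assumes it is run inside the training session above, right after feed_dict is built for a batch):

preds = train_prediction.eval(feed_dict=feed_dict)
# logits were built from tf.concat(0, outputs), so the rows are time-major
preds = preds.reshape(num_unrollings, batch_size, vocabulary_size)
pred_digits = np.argmax(preds, axis=2)    # shape (num_unrollings, batch_size)
true_digits = np.argmax(batch_y, axis=2)  # same layout as the labels fed above
print('per-digit accuracy %.3f, full-sequence accuracy %.3f'
      % (np.mean(pred_digits == true_digits),
         np.mean(np.all(pred_digits == true_digits, axis=0))))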
