Training: Extended Sequence-to-Sequence Model - Word Classification

Training an extended sequence-to-sequence model for word recognition. This is my personal experimental model, which should improve on the accuracy of the regular sequence-to-sequence model.

TODO

Test image_standardization on whole images (removed from the TF model)
Remove the random border

In [2]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.python.layers import core as layers_core
from tensorflow.python.ops import math_ops
import time
import math
import unidecode
import cv2

sys.path.append('src')
from ocr.datahelpers import load_words_data, corresponding_shuffle, char2idx
from ocr.helpers import implt, img_extend, resize
from ocr.mlhelpers import TrainingPlot
from ocr.normalization import letter_normalization, image_standardization
from ocr.tfhelpers import create_cell


%matplotlib notebook
plt.rcParams['figure.figsize'] = (9.0, 5.0)

Loading images


In [3]:
LANG = 'en'

In [5]:
images, labels, gaplines = load_words_data(
    ['data/processed/breta/words_gaplines/'], load_gaplines=True)


Loading words...
 |████████████████████████████████████████| 100.0% 
-> Number of words: 5069

Settings


In [5]:
CHARS = 82 if LANG == 'cz' else 52    # 52 = 26 lowercase + 26 uppercase letters

PAD = 0   # Padding
EOS = 1   # End of sequence
LETTER_PAD = -1.0

# Testing partially separated dataset
sep_words = 500
is_sep_words = False

num_buckets = 20
slider_size = (60, 1)
step_size = 1
N_INPUT = slider_size[0]*slider_size[1]
char_size = CHARS + 2
letter_size = 64*64

encoder_layers = 2
encoder_residual_layers = 1        # Must be smaller than encoder_layers
encoder_units = 128                # 256

decoder_layers = 2*encoder_layers  # 2* is due to the bidirectional encoder
decoder_residual_layers = 2*encoder_residual_layers
decoder_units = encoder_units

wordRNN_layers = 2
wordRNN_residual_layers = 1
wordRNN_units = 128

attention_size = 256

add_output_length = 0 # 4

learning_rate = 1e-3               # 1e-4
max_gradient_norm = 5.0            # For gradient clipping
dropout = 0.4
train_per = 0.8                    # Percentage of training data

TRAIN_STEPS = 1000000               # Number of training steps!
TEST_ITER = 150
LOSS_ITER = 50
SAVE_ITER = 1000
BATCH_SIZE = 25                    # 20
EPOCH = 1000                       # Approximate number of batches per epoch
save_location = 'models/word-clas/' + LANG + '/SeqRNN/Classifier4'

Dataset


In [6]:
# Shuffle data for later splitting
images, labels, gaplines = corresponding_shuffle([images, labels, gaplines])

# TODO: for testing, can be removed
if slider_size[1] > 2:
    for i in range(len(images)):
        images[i] = cv2.copyMakeBorder(
            images[i],
            0, 0, slider_size[1]//2, slider_size[1]//2,
            cv2.BORDER_CONSTANT,
            value=[0, 0, 0])

labels_idx = np.empty(len(labels), dtype=object)
for i, label in enumerate(labels):
    labels_idx[i] = [char2idx(c, True) for c in label]
    labels_idx[i].append(EOS)    
    
# Split data on train and test dataset
div = int(train_per * len(images))

trainImages = images[0:div]
testImages = images[div:]

trainGaplines = gaplines[0:div]
testGaplines = gaplines[div:]

trainLabels_idx = labels_idx[0:div]
testLabels_idx = labels_idx[div:]

print("Training images:", div)
print("Testing images:", len(images) - div)


Training images: 4055
Testing images: 1014
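
A note on the encoding above: each label becomes a sequence of character indices terminated by EOS (= 1). Below is a minimal sketch of the scheme with a toy mapping; toy_char2idx is a hypothetical stand-in, the real indices come from char2idx.

# Toy illustration of the label encoding above; `toy_char2idx` is a
# hypothetical stand-in for the real char2idx mapping.
def encode_word(word, char2idx_fn):
    return [char2idx_fn(c) for c in word] + [EOS]

toy_char2idx = lambda c: ord(c) - ord('a') + 2   # a=2, b=3, ... (hypothetical)
print(encode_word('cat', toy_char2idx))          # [4, 2, 21, 1]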

In [7]:
def stackImage(img, a, b):
    """ Add blank columns (lenght a, b) at start and end of image """
    return np.concatenate(
        (np.zeros((img.shape[0], a)),
         np.concatenate((img, np.zeros((img.shape[0], b))), axis=1)),
        axis=1)
    
# Don't mix train and test images
num_new_images = 0 # 2
trainImagesF = np.empty(len(trainImages) * (num_new_images+1), dtype=object)
trainGaplinesF = np.empty(len(trainImages) * (num_new_images+1), dtype=object)
trainLabelsF_idx = np.empty(len(trainImages)*(num_new_images+1), dtype=object)
for idx, img in enumerate(trainImages):
    add_idx = idx*(num_new_images+1)
    trainImagesF[add_idx] = img
    trainGaplinesF[add_idx] = trainGaplines[idx]
    trainLabelsF_idx[add_idx] = trainLabels_idx[idx]
    for i in range(num_new_images):
        a, b = np.random.randint(1, 16, size=2)
        trainImagesF[add_idx + (i+1)] = stackImage(img, a, b)
        trainGaplinesF[add_idx + (i+1)] = trainGaplines[idx] + a
        trainLabelsF_idx[add_idx + (i+1)] = trainLabels_idx[idx]
        
print("Total train images", len(trainImagesF))


Total train images 4055
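
A quick sanity check of the augmentation: stackImage pads a blank columns on the left and b on the right, so the gapline column positions shift right by a (hence the `+ a` above). A minimal sketch on a toy array:

# Toy check of stackImage on a 2x3 all-ones "image"
demo = np.ones((2, 3))
out = stackImage(demo, 2, 1)
print(out.shape)          # (2, 6): 2 blank + 3 original + 1 blank columns
print(out[:, :2].sum())   # 0.0: the left padding is blank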

In [8]:
class BucketDataIterator():
    """ Iterator for feeding seq2seq model during training """
    def __init__(self,
                 images,
                 targets,
                 gaplines,
                 num_buckets=5,
                 slider=(60, 30),
                 slider_step=2,
                 train=True):
        
        self.train = train
        self.slider = slider
        images4letters = images.copy()   # Just for 1px slider testing
        
        # PADDING of images to slider size; -(-a // b) == ceil(a / b)
        for i in range(len(images)):
            images[i] = image_standardization(
                img_extend(
                    images[i],
                    (images[i].shape[0],
                     max(-(-images[i].shape[1] // slider_step) * slider_step, 60))))
        in_length = [(image.shape[1] + 1 - slider[1])//slider_step for image in images]
        
        # Split images to sequence of vectors
        img_seq = np.empty(len(images), dtype=object)
        for i, img in enumerate(images):
            img_seq[i] = [img[:, loc * slider_step: loc*slider_step + slider[1]].flatten()
                          for loc in range(in_length[i])]
            
        end_letter = np.ones(letter_size)    # End-of-sequence letter is a fully white image
        letter_seq = np.empty(len(images), dtype=object)
        for i, img in enumerate(images4letters):
            # Standardize each segmented letter image
            letter_seq[i] = [image_standardization(
                letter_normalization(img[:, gaplines[i][x]:gaplines[i][x+1]])).flatten()
                             for x in range(len(gaplines[i])-1)]
#             letter_seq[i] = [letter_normalization(img[:, gaplines[i][x]:gaplines[i][x+1]]).flatten()
#                              for x in range(len(gaplines[i])-1)]
            letter_seq[i].append(end_letter)

        # Create pandas DataFrame and sort it by image width (length)
        # letters_length is num_letter + EOS
        self.dataFrame = pd.DataFrame({'in_length': in_length,
                                       'letters_length': [len(g) for g in gaplines],
                                       'words_length': [len(t) for t in targets],
                                       'in_images': img_seq,
                                       'letters': letter_seq,
                                       'words': targets
                                      }).sort_values('in_length').reset_index(drop=True)

        bsize = int(len(images) / num_buckets)
        self.num_buckets = num_buckets
        
        # Create buckets by slicing parts by indexes
        self.buckets = []
        for bucket in range(num_buckets-1):
            self.buckets.append(self.dataFrame.iloc[bucket * bsize: (bucket+1) * bsize])
        self.buckets.append(self.dataFrame.iloc[(num_buckets-1) * bsize:])        
        
        self.buckets_size = [len(bucket) for bucket in self.buckets]

        # cursor[i] will be the cursor for the ith bucket
        self.cursor = np.array([0] * num_buckets)
        self.bucket_order = np.random.permutation(num_buckets)
        self.bucket_cursor = 0
        self.shuffle()
        print("Iterator created.")

        
    def shuffle(self, idx=None):
        """ Shuffle idx bucket or each bucket separately """
        for i in [idx] if idx is not None else range(self.num_buckets):
            self.buckets[i] = self.buckets[i].sample(frac=1).reset_index(drop=True)
            self.cursor[i] = 0


    def next_batch(self, batch_size):
        """
        Creates next training batch of size: batch_size
        Returns: (in_images, letters, words,
                  in_length, letter_length, word_length)
        """
        i_bucket = self.bucket_order[self.bucket_cursor]
        # Increment cursor and shuffle in case of new round
        self.bucket_cursor = (self.bucket_cursor + 1) % self.num_buckets
        if self.bucket_cursor == 0:
            self.bucket_order = np.random.permutation(self.num_buckets)
            
        if self.cursor[i_bucket] + batch_size > self.buckets_size[i_bucket]:
            self.shuffle(i_bucket)

        # Cap batch size at the bucket size
        if (batch_size > self.buckets_size[i_bucket]):
            batch_size = self.buckets_size[i_bucket]

        res = self.buckets[i_bucket].iloc[self.cursor[i_bucket]:
                                          self.cursor[i_bucket]+batch_size]
        self.cursor[i_bucket] += batch_size
        
        # Check that input sequences are long enough for the predicted output sequences
        assert np.all(res['in_length'] + add_output_length >= res['letters_length'])

        input_max = max(res['in_length'])
        letters_max = max(res['letters_length'])
        words_max = max(res['words_length'])        
        
        input_seq = np.ones((batch_size, input_max, N_INPUT), dtype=np.float32) * LETTER_PAD
        for i, img in enumerate(res['in_images']):
            input_seq[i][:res['in_length'].values[i]] = img
        input_seq = input_seq.swapaxes(0, 1)    # Time major
        
        letters = np.ones((batch_size, letters_max, letter_size), dtype=np.float32) * LETTER_PAD
        for i, img in enumerate(res['letters']):
            letters[i][:res['letters_length'].values[i]] = img
        
        # Need to pad according to the maximum length output sequence
        words = np.zeros([batch_size, words_max], dtype=np.int32)
        for i, word in enumerate(res['words']):
            words[i][:res['words_length'].values[i]] = word
        
        return (input_seq, letters, words,
                res['in_length'].values, res['letters_length'].values, res['words_length'].values)


    def next_feed(self, size, words=True, train=None):
        """ Create feed directly for model training """
        if train is None:
            train = self.train
        (encoder_inputs_,
         letter_targets_,
         word_targets_,
         encoder_inputs_length_,
         letter_targets_length_,
         word_targets_length_) = self.next_batch(size)
        return {
            encoder_inputs: encoder_inputs_,
            encoder_inputs_length: encoder_inputs_length_,
            letter_targets: letter_targets_,
            letter_targets_length: letter_targets_length_,
            word_targets: word_targets_,
            word_targets_length: word_targets_length_,
            keep_prob: (1.0 - dropout) if self.train else 1.0,
            is_training: train,
            is_words: words
        }
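
Two pieces of arithmetic above are worth unpacking: -(-a // b) is integer ceiling division, and (width + 1 - slider_width) // step counts the slider windows that fit in an image (the buckets then group images of similar width so batches need little padding). A minimal sketch in plain Python:

# Sanity checks for the iterator arithmetic (no TF needed)
ceil_div = lambda a, b: -(-a // b)      # equals math.ceil(a / b) for ints
assert ceil_div(7, 2) == 4 and ceil_div(8, 2) == 4

def num_windows(width, slider_width, step):
    """Number of slider positions over an image of the given width."""
    return (width + 1 - slider_width) // step

# With slider_size = (60, 1) and step_size = 1, every column is one window
assert num_windows(100, 1, 1) == 100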

In [9]:
# Create iterator for feeding RNN
# Create only once; it modifies the input images in place
train_iterator = BucketDataIterator(
    trainImagesF,
    trainLabelsF_idx,
    trainGaplinesF,
    num_buckets,
    slider_size,
    step_size,
    train=True)

if is_sep_words:
    train_letters_iterator = BucketDataIterator(
        trainImagesF[:sep_words],
        trainLabelsF_idx[:sep_words],
        trainGaplinesF[:sep_words],
        1,
        slider_size,
        step_size,
        train=True)

test_iterator = BucketDataIterator(
    testImages,
    testLabels_idx,
    testGaplines,
    num_buckets,
    slider_size,
    step_size,
    train=False)

del images
del labels
del gaplines


Iterator created.
Iterator created.

Placeholders


In [10]:
# Only encoder inputs are time major
# Encoder inputs shape (max_seq_length, batch_size, vec_size)
encoder_inputs = tf.placeholder(shape=(None, None, N_INPUT),
                                dtype=tf.float32,
                                name='encoder_inputs')
encoder_inputs_length = tf.placeholder(shape=(None,),
                                       dtype=tf.int32,
                                       name='encoder_inputs_length')

# Required for letter separation training
# Contains EOS symbol
letter_targets = tf.placeholder(shape=(None, None, letter_size),
                                dtype=tf.float32,
                                name='letter_targets')
letter_targets_length = tf.placeholder(shape=(None,),
                                       dtype=tf.int32,
                                       name='letter_targets_length')

# Required for word training
word_targets = tf.placeholder(shape=(None, None),
                              dtype=tf.int32,
                              name='word_targets')
word_targets_length = tf.placeholder(shape=(None,),
                                     dtype=tf.int32,
                                     name='word_targets_length')
# Dropout value
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
# Testing control
is_training = tf.placeholder(tf.bool, shape=None, name="is_training")
is_words = tf.placeholder(tf.bool, shape=None, name="is_words")

Decoder Train Feeds


In [11]:
sequence_size, batch_size, _ = tf.unstack(tf.shape(encoder_inputs)) # letter_targets

EOS_SLICE = tf.cast(tf.fill([batch_size, 1, letter_size], EOS), tf.float32)
PAD_SLICE = tf.cast(tf.fill([batch_size, 1, letter_size], LETTER_PAD), tf.float32) # PAD

# Train inputs with PAD symbol at the start of the sequence
letter_train_inputs = tf.concat([PAD_SLICE, letter_targets], axis=1)    # EOS_SLICE
letter_train_length = letter_targets_length

# Length of infer (test) letter output
# TODO: will have to make this shorter...
output_length = tf.minimum(
    tf.reduce_max(encoder_inputs_length) * step_size // 15 + add_output_length,
    23)
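
The inference output length is a heuristic: with step_size = 1 the encoder length is roughly the image width in pixels, dividing by 15 assumes a letter spans about 15 columns, and the result is capped at 23 letters. A worked example in plain Python (max_letters is just an illustration, not part of the model):

# Worked example of the output-length heuristic above
def max_letters(img_width, px_per_letter=15, cap=23):
    return min(img_width * step_size // px_per_letter + add_output_length, cap)

print(max_letters(150))   # 10 letters allowed for a 150 px wide word
print(max_letters(600))   # 23: capped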

Encoder


In [12]:
enc_cell_fw = create_cell(encoder_units,
                          encoder_layers,
                          encoder_residual_layers,
                          is_dropout=True,
                          keep_prob=keep_prob)
enc_cell_bw = create_cell(encoder_units,
                          encoder_layers,
                          encoder_residual_layers,
                          is_dropout=True,
                          keep_prob=keep_prob)

In [13]:
### CNN ###
SCALE = 0.001 # 0.1
# Functions for initializing convolution and pool layers
def weights(name, shape):
    return tf.get_variable(name, shape=shape,
                           initializer=tf.contrib.layers.xavier_initializer(),
                           regularizer=tf.contrib.layers.l2_regularizer(scale=SCALE))

def bias(const, shape, name=None):
    return tf.Variable(tf.constant(const, shape=shape), name=name)

def conv2d2(x, W, name=None):
    return tf.nn.conv2d(x, W, strides=[1, 2, 1, 1], padding='SAME', name=name)


# W_conv1 = weights('W_conv1', shape=[2, 1, 1, 1])
# b_conv1 = bias(0.1, shape=[1], name='b_conv1')


# def CNN_1(x):
#     x = tf.reshape(x, [slider_size[0], slider_size[1], 1])
#     x = tf.image.per_image_standardization(x)
#     img = tf.reshape(x, [1, slider_size[0], slider_size[1], 1])  
#     h_conv1 = tf.nn.relu(conv2d2(img, W_conv1) + b_conv1, name='h_conv1')    
#     return h_conv1

def CNN_1(x):
    x = tf.image.per_image_standardization(x)
    return x
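
For reference, tf.image.per_image_standardization shifts each image to zero mean and scales it to unit variance, with a lower bound on the stddev so constant images do not divide by zero. A NumPy equivalent following the documented TF behavior (standardize is an illustration only):

# NumPy equivalent of tf.image.per_image_standardization (single image)
def standardize(img):
    adjusted_std = max(img.std(), 1.0 / np.sqrt(img.size))  # guards div-by-zero
    return (img - img.mean()) / adjusted_std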

In [14]:
# inputs = tf.map_fn(
#     lambda seq: tf.map_fn(
#         lambda img:
#             tf.reshape(
#                 CNN_1(tf.reshape(img, [slider_size[0], slider_size[1], 1])), [-1]),
#         seq),
#     encoder_inputs,
#     dtype=tf.float32)
inputs = encoder_inputs

# Bidirectional RNN gives fw and bw outputs separately
enc_outputs, enc_state = tf.nn.bidirectional_dynamic_rnn(
    cell_fw = enc_cell_fw,
    cell_bw = enc_cell_bw,
    inputs = inputs,
    sequence_length = encoder_inputs_length,
    dtype = tf.float32,
    time_major = True)

encoder_outputs = tf.concat(enc_outputs, -1)

if encoder_layers == 1:
    encoder_state = enc_state
else:
    encoder_state = []
    for layer_id in range(encoder_layers):
        encoder_state.append(enc_state[0][layer_id])  # forward
        encoder_state.append(enc_state[1][layer_id])  # backward
    encoder_state = tuple(encoder_state)
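
The decoder's initial state interleaves the per-layer forward and backward states, producing 2*encoder_layers states in total; this is why decoder_layers is set to 2*encoder_layers in the settings. A toy illustration of the ordering (plain Python, no TF):

# Toy illustration of the fw/bw state interleaving above
fw = ['fw0', 'fw1']   # one state per forward encoder layer
bw = ['bw0', 'bw1']   # one state per backward encoder layer
interleaved = []
for layer_id in range(2):
    interleaved.extend([fw[layer_id], bw[layer_id]])
print(interleaved)    # ['fw0', 'bw0', 'fw1', 'bw1'] -> feeds 4 decoder layers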

Decoder


In [15]:
# attention_states: size [batch_size, max_time, num_units]
attention_states = tf.transpose(encoder_outputs, [1, 0, 2])

# Create an attention mechanism
attention_mechanism = seq2seq.BahdanauAttention(
    attention_size, attention_states,              # decoder_units instead of attention_size
    memory_sequence_length=encoder_inputs_length)

decoder_cell = create_cell(decoder_units,
                           decoder_layers,
                           decoder_residual_layers,
                           is_dropout=True,
                           keep_prob=keep_prob)

decoder_cell = seq2seq.AttentionWrapper(
    decoder_cell, attention_mechanism,
    attention_layer_size=attention_size)

decoder_initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(
    cell_state=encoder_state)

TRAIN DECODER


In [16]:
# Helper
helper = seq2seq.TrainingHelper(
    letter_train_inputs, letter_targets_length)

# Decoder
projection_layer = layers_core.Dense(
    letter_size, activation=tf.tanh, use_bias=True)

decoder = seq2seq.BasicDecoder(
    decoder_cell, helper, decoder_initial_state,
    output_layer=projection_layer)

# Dynamic decoding
outputs, final_context_state, _ = seq2seq.dynamic_decode(
    decoder)

letter_logits_train = outputs.rnn_output
letter_prediction_train = outputs.sample_id

INFERENCE DECODER


In [17]:
### CNN ###
def conv2d(x, W, name=None):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME', name=name)

def max_pool_2x2(x, name=None):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)

W_conv1_ = weights('W_conv1_', shape=[16, 16, 1, 4])
b_conv1_ = bias(0.1, shape=[4], name='b_conv1_')
W_conv2_ = weights('W_conv2_', shape=[5, 5, 4, 12])
b_conv2_ = bias(0.1, shape=[12], name='b_conv2_')
W_conv3_ = weights('W_conv3_', shape=[3, 3, 12, 20])
b_conv3_ = bias(0.1, shape=[20], name='b_conv3_')
W_fc1_ = weights('W_fc2_', shape=[8*8*20, char_size])
b_fc1_ = bias(0.1, shape=[char_size], name='b_fc2_')


def CNN_2(x, clas=True):
    if clas:
        b_size, seq_size, _ = tf.unstack(tf.shape(x))
    else:
        b_size, _ = tf.unstack(tf.shape(x))
        
    imgs = tf.reshape(x, [-1, 64, 64, 1])
    x_imgs = tf.map_fn(
        lambda img: tf.image.per_image_standardization(img), imgs)    
    # 1. Layer - Convolution
    h_conv1 = tf.nn.relu(conv2d(x_imgs, W_conv1_) + b_conv1_, name='h_conv1_')
    # 2. Layer - Max Pool
    h_pool1 = max_pool_2x2(h_conv1, name='h_pool1_')
    # 3. Layer - Convolution
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2_) + b_conv2_, name='h_conv2_')
    # 4. Layer - Max Pool
    h_pool2 = max_pool_2x2(h_conv2, name='h_pool2_')
    # 5. Layer - Convolution + Max Pool
    h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3_) + b_conv3_, name='h_conv3_')
    h_pool3 = max_pool_2x2(h_conv3, name='h_pool3_')
    # Reshape filters into a flat array (three 2x2 pools: 64 -> 32 -> 16 -> 8)
    h_flat = tf.reshape(h_pool3, [-1, 8*8*20])
    # 6. Dropout
    h_flat_drop = tf.nn.dropout(h_flat, keep_prob)
    # 7. Output layer
    out = tf.matmul(h_flat_drop, W_fc1_) + b_fc1_
    if clas:
        return tf.reshape(out, [b_size, seq_size, -1])
    else:
        return tf.reshape(out, [b_size, -1])

In [18]:
# Helper without embedding, can add param: 'next_inputs_fn'
helper_infer = seq2seq.InferenceHelper(
    sample_fn=(lambda x: x),
    sample_shape=[letter_size],
    sample_dtype=tf.float32,
    start_inputs=tf.cast(tf.fill([batch_size, letter_size], LETTER_PAD), tf.float32), # PAD <- EOS, needs float32
    end_fn=(lambda sample_ids:
            tf.equal(tf.argmax(CNN_2(sample_ids, False), axis=-1, output_type=tf.int32), 1)))
            

decoder_infer = seq2seq.BasicDecoder(
    decoder_cell, helper_infer, decoder_initial_state,
    output_layer=projection_layer)

# Dynamic decoding
outputs_infer, final_context_state, final_seq_lengths = seq2seq.dynamic_decode(
    decoder_infer,
    impute_finished=True,
    maximum_iterations=output_length)

letter_prediction_infer = tf.identity(outputs_infer.rnn_output, # sample_id
                                      name='letter_prediction_infer')
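
The stopping criterion above is worth spelling out: at each decoding step the generated 64x64 letter image is classified by CNN_2, and a sequence is finished once the argmax equals the EOS class (index 1). A NumPy sketch of the same logic, with a hypothetical classify standing in for CNN_2:

# NumPy sketch of the end_fn logic; `classify` is a hypothetical stand-in
# for CNN_2 that returns (batch, char_size) logits.
def end_fn_sketch(sample_ids, classify):
    logits = classify(sample_ids)               # (batch, char_size)
    return np.argmax(logits, axis=-1) == EOS    # True -> that sequence stops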

RNN


In [19]:
word_inputs = tf.cond(is_training,
                      lambda: letter_targets,
                      lambda: letter_prediction_infer)
word_inputs_length_ = tf.cond(is_training,
                              lambda: letter_targets_length,
                              lambda: final_seq_lengths)

word_inputs_length = word_inputs_length_    # tf.subtract(word_inputs_length_, 1)

# Input images CNN
word_outputs = CNN_2(word_inputs)

word_logits = word_outputs
word_prediction = tf.argmax(
    word_logits, axis=-1, output_type=tf.int32,    # word_logits = tf.layers.dense(
    name='word_prediction')

Optimizer

Weights + Paddings


In [20]:
# Pad test accuracy
letter_test_targets = tf.pad(
    letter_targets,
    [[0, 0],
     [0, output_length - tf.reduce_max(letter_targets_length)],
     [0, 0]],
    constant_values=LETTER_PAD,
    mode='CONSTANT')

# Pad prediction to match lengths
letter_pred_infer_pad = tf.pad(
    letter_prediction_infer,
    [[0, 0],
     [0, output_length - tf.reduce_max(word_inputs_length_)],
     [0, 0]],
    constant_values=LETTER_PAD,
    mode='CONSTANT')


word_pad_length = tf.maximum(
    tf.reduce_max(word_inputs_length_),
    tf.reduce_max(word_targets_length))

word_logits_pad = tf.pad(
    word_logits,
    [[0, 0],
     [0, word_pad_length - tf.reduce_max(word_inputs_length_)],
     [0, 0]],
    constant_values=PAD,
    mode='CONSTANT')
word_pred_pad = tf.pad(
    word_prediction,
    [[0, 0],
     [0, word_pad_length - tf.reduce_max(word_inputs_length_)]],
    constant_values=PAD,
    mode='CONSTANT')
word_targets_pad = tf.pad(
    word_targets,
    [[0, 0],
     [0, word_pad_length - tf.reduce_max(word_targets_length)]],
    constant_values=PAD,
    mode='CONSTANT')


# Weights
letter_loss_weights = tf.sequence_mask(
    letter_train_length,
    tf.reduce_max(letter_train_length),
    dtype=tf.float32)

letter_test_weights = tf.sequence_mask(
    letter_train_length,
    output_length,
    dtype=tf.float32)

word_loss_weights = tf.sequence_mask(
    word_targets_length,    # word_inputs_length, try max(targets, inputs)
    word_pad_length,
    dtype=tf.float32)

word_acc_weights = tf.sequence_mask(
    tf.subtract(final_seq_lengths, 1),    # word_inputs_length, try max(targets, inputs)
    word_pad_length,
    dtype=tf.float32)

In [21]:
## Loss
letter_loss = tf.losses.mean_squared_error(
    predictions=letter_logits_train,
    labels=letter_targets,
    weights=tf.stack([letter_loss_weights for i in range(letter_size)], axis=-1))

word_seq_loss = seq2seq.sequence_loss(
    logits=word_logits_pad,
    targets=word_targets_pad,
    weights=word_loss_weights,
    name='word_loss')
regularization = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
word_loss = word_seq_loss + sum(regularization)


loss = tf.cond(is_words,
               lambda: word_loss,
               lambda: letter_loss)

# learning_rate_ = learning_rate
learning_rate_ = tf.cond(tf.logical_and(is_words, tf.logical_not(is_training)),
                         lambda: learning_rate, # * 0.1,
                         lambda: learning_rate)

## Calculate and clip gradients
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(
    gradients, max_gradient_norm)

### Optimization
optimizer = tf.train.AdamOptimizer(learning_rate_)
train_step = optimizer.apply_gradients(
    zip(clipped_gradients, params),
    name='train_step')

### Evaluate model
correct_prediction = tf.equal(
    word_pred_pad,
    word_targets_pad)
## Advanced accuracy: count only sequence elements up to and including the EOS symbol
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
accuracy = (tf.reduce_sum(tf.cast(correct_prediction, tf.float32) * word_acc_weights) \
            / tf.reduce_sum(word_acc_weights))
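
The accuracy above is a masked average: element-wise correctness is weighted so positions past the predicted sequence length do not count. A NumPy sketch of the same computation on a toy batch (the values below are made up for illustration):

# NumPy sketch of the masked accuracy above (toy batch of 2 sequences)
pred    = np.array([[5, 7, 1, 0], [3, 1, 0, 0]])
target  = np.array([[5, 8, 1, 0], [3, 1, 0, 0]])
weights = np.array([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=np.float32)  # masks padding
acc = ((pred == target).astype(np.float32) * weights).sum() / weights.sum()
print(acc)   # 0.8: four of the five unmasked elements match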

Training


In [23]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

# Create plot for live stats plotting
trainPlot = TrainingPlot(TRAIN_STEPS, TEST_ITER, LOSS_ITER)

try:
    for i_batch in range(TRAIN_STEPS):
        # Three interleaved phases: 1. train letters, 2. train words on known letters, 3. combine
        is_words_, is_train_ = (True, True)
        if is_sep_words:
            is_words_, is_train_ = (True, False)   # (True, True) - without partial sep letters
        if i_batch < 1000 or i_batch % 2 == 0:    # 2000 # 5000
            is_words_, is_train_ = (False, True)
#         elif i_batch < 3000:                      # 12000
#             is_words_, is_train_ = (True, True)
        elif i_batch % 4 == 3:
            is_words_, is_train_ = (True, False)

        fd = train_iterator.next_feed(BATCH_SIZE, words=is_words_, train=is_train_)
        
        if is_train_ and is_sep_words:
            train_step.run(train_letters_iterator.next_feed(BATCH_SIZE, words=is_words_, train=is_train_))
        else:
            train_step.run(fd)
        
        if i_batch % LOSS_ITER == 0:
            # Plotting loss
            tmpLoss = loss.eval(fd)
            trainPlot.updateCost(tmpLoss, i_batch // LOSS_ITER)
    
        if i_batch % TEST_ITER == 0:
            # Plotting accuracy
            fd_test = test_iterator.next_feed(BATCH_SIZE)
            fd = train_iterator.next_feed(BATCH_SIZE, words=True, train=False)
            accTest = accuracy.eval(fd_test)
            accTrain = accuracy.eval(fd)
            trainPlot.updateAcc(accTest, accTrain, i_batch // TEST_ITER)

        if i_batch % SAVE_ITER == 0:
            saver.save(sess, save_location)
        
        if i_batch % EPOCH == 0:
            fd_test = test_iterator.next_feed(BATCH_SIZE)
            print('batch %r - loss: %r' % (i_batch, sess.run(loss, fd_test)))
            predict_, target_ = sess.run([word_prediction, word_targets], fd_test)
            for i, (inp, pred) in enumerate(zip(target_, predict_)):
                print('    expected  > {}'.format(inp))
                print('    predicted > {}'.format(pred))
                if i >= 1:
                    break
            print()

except KeyboardInterrupt:
    print('Training interrupted, model saved.')
    
saver.save(sess, save_location)


batch 0 - loss: 4.1433916
    expected  > [22 46 28 34 32  1  0  0]
    predicted > [25 25 25 25 25 25 25]
    expected  > [31 32 28 39  1  0  0  0]
    predicted > [25 25 25 25 25 25 39]

batch 1000 - loss: 3.9913728
    expected  > [17 45 49 32 38  1  0  0]
    predicted > [25 39 25 25 25 25 25 25]
    expected  > [30 28 46 47  1  0  0  0]
    predicted > [25 25 25 25 25 25 25 25]

batch 2000 - loss: 2.3057177
    expected  > [10  1  0  0  0]
    predicted > [11  1  1  1]
    expected  > [52  1  0  0  0]
    predicted > [11  1  1  1]

batch 3000 - loss: 1.8378663
    expected  > [37 36  1  0  0]
    predicted > [37 36 47  1]
    expected  > [19  1  0  0  0]
    predicted > [33  1  1  1]

batch 4000 - loss: 1.9805311
    expected  > [45 42 39 39  1  0  0  0]
    predicted > [45 32 39 39  1  1]
    expected  > [27 42 41 32  1  0  0  0]
    predicted > [21 28 40 32  1  1]

batch 5000 - loss: 2.1887887
    expected  > [49 28 46 36  1  0  0  0  0]
    predicted > [41 42 46 28  1  1  1  1]
    expected  > [40  1  0  0  0  0  0  0  0]
    predicted > [41  1  1  1  1  1  1  1]

batch 6000 - loss: 1.872894
    expected  > [17 45 36 41 28 46 32 37 36  1]
    predicted > [20 36 41 36 36 41 41  1]
    expected  > [43 39 28 41 32 47  1  0  0  0]
    predicted > [43 39 42 40 42 28 47  1]

batch 7000 - loss: 1.7666719
    expected  > [46 47 28 47  1  0  0  0]
    predicted > [46 47 28 47  1  1]
    expected  > [32 28 30 35  1  0  0  0]
    predicted > [28 28 28  1  1  1]

batch 8000 - loss: 1.7898357
    expected  > [45 28 31  1  0  0  0]
    predicted > [45 28 31  1  1  1]
    expected  > [38 52 47 38 28  1  0]
    predicted > [47 52 29 39 39 39]

batch 9000 - loss: 2.1676068
    expected  > [46 36 47 48 28 30 32  1  0  0  0  0]
    predicted > [46 36 47 45 42 32  1  1  1  1  1]
    expected  > [49 30 32 45 28  1  0  0  0  0  0  0]
    predicted > [41 32 32 42  1  1  1  1  1  1  1]

batch 10000 - loss: 1.7424697
    expected  > [49 52 38 45 52 46 47 28 39 36 53 42 49 28 39 28  1]
    predicted > [49 52 39 32 42 42 47 28 42  1]
    expected  > [53 32 41 28  1  0  0  0  0  0  0  0  0  0  0  0  0]
    predicted > [45 32 41 28  1  1  1  1  1  1]

batch 11000 - loss: 1.4472994
    expected  > [53 42 39 36 38  1  0  0  0]
    predicted > [53 30 39 36 38  1  1  1]
    expected  > [43 45 42 30 32 46  1  0  0]
    predicted > [43 45 45 46  1  1  1  1]

batch 12000 - loss: 1.5876625
    expected  > [42 35 32 41  1  0  0  0]
    predicted > [42 35 32 36  1  1  1]
    expected  > [43 42 47 45 32 39 36  1]
    predicted > [43 42 47 36 32 39  1]

batch 13000 - loss: 1.1057456
    expected  > [40  1  0  0  0  0  0  0]
    predicted > [41  1  1  1  1  1  1]
    expected  > [29 52 49 28 47  1  0  0]
    predicted > [29 52 49 28 47  1  1]

batch 14000 - loss: 1.254143
    expected  > [30 32 46 47 28  1  0  0  0]
    predicted > [42 32 47 47 28  1  1  1  1]
    expected  > [29 45 42 48 34 35 47  1  0]
    predicted > [29 42 40 42 34 34 47 47  1]

batch 15000 - loss: 1.5632484
    expected  > [35 32 28 49 52  1  0]
    predicted > [38 45 28 41 52]
    expected  > [10 41 33 42  1  0  0]
    predicted > [17 28 33 32  1]

batch 16000 - loss: 1.454107
    expected  > [49  1  0  0  0  0  0  0  0  0]
    predicted > [49  1  1  1  1  1  1  1  1]
    expected  > [24 32 45 41 32 45  1  0  0  0]
    predicted > [23 28 45 41 45  1  1  1  1]

batch 17000 - loss: 1.2110069
    expected  > [37 36 41 52  1  0]
    predicted > [37 36 28  1  1]
    expected  > [19 36 43  1  0  0]
    predicted > [19 36 43  1  1]

batch 18000 - loss: 1.3091056
    expected  > [21 36 43  1  0  0]
    predicted > [27 28 43  1]
    expected  > [23  1  0  0  0  0]
    predicted > [23  1  1  1]

batch 19000 - loss: 1.0000377
    expected  > [45  1  0  0  0]
    predicted > [27  1  1  1]
    expected  > [ 8 42  1  0  0]
    predicted > [ 8 28  1  1]

batch 20000 - loss: 1.087924
    expected  > [ 8 32 41  1  0  0]
    predicted > [26 48 41  1  1]
    expected  > [53 36 43  1  0  0]
    predicted > [46 45 43  1  1]

batch 21000 - loss: 2.0892587
    expected  > [ 5 28 29  1  0  0  0]
    predicted > [ 5 28 29  1  1]
    expected  > [29 42 45  1  0  0  0]
    predicted > [29 42 45  1  1]

batch 22000 - loss: 1.8513888
    expected  > [49 30 32 45 28  1  0  0  0  0  0]
    predicted > [49 30 45 28  1  1  1  1  1  1  1]
    expected  > [36 41 30 39 48 31 32  1  0  0  0]
    predicted > [36 41 30 39 28 31 32  1  1  1  1]

batch 23000 - loss: 1.4094399
    expected  > [52 32 39 39 42 50  1  0  0]
    predicted > [52 32 39 42 50  1  1  1  1]
    expected  > [52 42 48 45  1  0  0  0  0]
    predicted > [52 42 46  1  1  1  1  1  1]

batch 24000 - loss: 1.4057717
    expected  > [33 36 28 39 42 49 28  1  0]
    predicted > [33 36 28 39 42 49  1  1]
    expected  > [28 46  1  0  0  0  0  0  0]
    predicted > [28 45  1  1  1  1  1  1]

batch 25000 - loss: 2.255497
    expected  > [15 36 38 31 42  1  0  0  0  0]
    predicted > [15 36 38 31 42  1  1  1  1]
    expected  > [16 47 28 38 28 45  1  0  0  0]
    predicted > [42 31 38 28 45  1  1  1  1]

batch 26000 - loss: 2.1663573
    expected  > [14 42 41 36 38 28  1  0  0  0  0  0  0]
    predicted > [14 42 40 38 42  1  1  1  1  1]
    expected  > [40 28 47 47 32 45  1  0  0  0  0  0  0]
    predicted > [40 42 47 47 32 45  1  1  1  1]

batch 27000 - loss: 1.1713476
    expected  > [47 35 32 45 32  1  0  0  0]
    predicted > [47 35 32 45 32  1  1  1]
    expected  > [50 36 46 35  1  0  0  0  0]
    predicted > [50 36 46 35  1  1  1  1]

batch 28000 - loss: 1.4209536
    expected  > [33 28 30 32  1  0  0]
    predicted > [33 42 30 32  1  1]
    expected  > [46 48 43 28  1  0  0]
    predicted > [46 42 43 42  1  1]

batch 29000 - loss: 1.3666291
    expected  > [45 52 47  1  0  0]
    predicted > [45 52 47  1]
    expected  > [4 1 0 0 0 0]
    predicted > [4 1 1 1]

batch 30000 - loss: 1.3133948
    expected  > [ 7 48 32 39  1  0  0  0]
    predicted > [ 7 32 48 39  1  1  1]
    expected  > [19 52 30 35 39 32  1  0]
    predicted > [ 3 52 30 35 39 32  1]

batch 31000 - loss: 1.6442363
    expected  > [ 8 39 42 45 52  1]
    predicted > [ 8 32 41 52  1]
    expected  > [33 32 32 39  1  0]
    predicted > [33 32 39  1  1]

batch 32000 - loss: 1.2510978
    expected  > [42 29 30 28 46  1  0  0  0]
    predicted > [42 38 32 28  1  1  1  1]
    expected  > [43 45 32 46 47 42  1  0  0]
    predicted > [43 45 32 46 46 47 42  1]

batch 33000 - loss: 1.1826903
    expected  > [20 43 45 36 41 34  1  0]
    predicted > [20 42 45 36 41 34  1]
    expected  > [41 28 46 47 45 42 37  1]
    predicted > [41 28 46 47 45 34 37]

Training interrupted, model saved.
Out[23]:
'models/word-clas/en/SeqRNN/Classifier4'
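
For reference, the phase selection inside the training loop above can be read as a pure function of the step index; a minimal sketch under the settings used here (is_sep_words = False):

# Sketch of the training schedule implemented above (is_sep_words = False):
# returns (is_words, is_training) for a given step.
def phase(i_batch):
    if i_batch < 1000 or i_batch % 2 == 0:
        return (False, True)    # train the letter decoder on ground truth
    if i_batch % 4 == 3:
        return (True, False)    # train word classification on inferred letters
    return (True, True)         # train word classification on known letters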

In [24]:
%matplotlib inline

def evalLetters(feed):
    predict_, target_, predict_lengths_, target_lengths_ = sess.run(
        [letter_prediction_infer,
         letter_targets,
         final_seq_lengths,
         letter_targets_length],
        feed)
    for i, (inp, pred) in enumerate(zip(target_, predict_)):
        print("Expected images:", target_lengths_[i])
        for x in range(len(inp)):
            implt(inp[x].reshape((64, 64)), 'gray')
        print("Predicted images:", predict_lengths_[i])
        for x in range(len(pred)):
            implt(pred[x].reshape((64, 64)), 'gray')
        if i >= 0:
            break
            
fd_test = test_iterator.next_feed(BATCH_SIZE)
fd = train_iterator.next_feed(BATCH_SIZE, words=False, train=False)

evalLetters(fd_test)
evalLetters(fd)


Expected images: 3
Predicted images: 3
Expected images: 2
Predicted images: 2