Training: CNN with Bidirctional RNN - Character Segmentation

Training model combining CNN with bidirectional RNN for character segmentation


In [2]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import tensorflow as tf
import cv2

sys.path.append('src')
from ocr.helpers import implt
from ocr.mlhelpers import TrainingPlot
from ocr.datahelpers import load_words_data, corresponding_shuffle
from ocr.tfhelpers import create_cell


%matplotlib notebook
plt.rcParams['figure.figsize'] = (9.0, 5.0)

Settings


In [3]:
PAD = 0                            # Value for PADding images
POS = 1                            # Values of positive and negative label 0/-1
NEG = 0

POS_SPAN = 1                             # Number of positive values around true position (5 is too high)
POS_WEIGHT = 3                           # Weighting possitive values in loss counting

slider_size = (60, 60)                   # Height is set to 60 by data and width should be even
slider_step = 2
N_INPUT = slider_size[0]*slider_size[1]  # Size of sequence input vector will depend on CNN
num_buckets = 5
n_classes = 2                            # Number of different outputs

rnn_layers = 4
rnn_residual_layers = 2                  # HAVE TO be smaller than encoder_layers
rnn_units = 256

learning_rate = 1e-4
dropout = 0.4                            # Percentage of dopped out data
train_set = 0.8                          # Percentage of training data

TRAIN_STEPS = 500000                     # Number of training steps!
TEST_ITER = 150
LOSS_ITER = 50
SAVE_ITER = 2000
BATCH_SIZE = 10
# EPOCH = 2000                           # "Number" of batches in epoch

save_loc = 'models/gap-clas/RNN/Bi-RNN-new'

Loading Images


In [6]:
images, _, gaplines = load_words_data(
    ['data/processed/breta/words_gaplines/'],
    load_gaplines=True)


Loading words...
 |████████████████████████████████████████| 100.0% 
-> Number of words: 5069

Dataset


In [4]:
# Shuffle data
images, gaplines = corresponding_shuffle([images, gaplines])

for i in range(len(images)):
    # Add border and offset gaplines - RUN ONLY ONCE
    images[i] = cv2.copyMakeBorder(images[i],
                                   0, 0, int(slider_size[1]/2), int(slider_size[1]/2),
                                   cv2.BORDER_CONSTANT,
                                   value=0)
    gaplines[i] += int(slider_size[1] / 2)
    
# Split data on train and test dataset
div = int(train_set * len(images))

trainImages = images[0:div]
testImages = images[div:]

trainGaplines = gaplines[0:div]
testGaplines = gaplines[div:]

print("Training images:", div)
print("Testing images:", len(images) - div)


Training images: 4055
Testing images: 1014

In [5]:
class BucketDataIterator():
    """ Iterator for feeding seq2seq model during training """
    def __init__(self,
                 images,
                 gaplines,
                 gap_span,
                 num_buckets=5,
                 slider=(60, 30),
                 slider_step=2,
                 imgprocess=lambda x: x,
                 train=True):
        
        self.train = train
        length = [(image.shape[1]-slider[1])//slider_step for image in images]
    
        # Creating indices from gaplines
        indices = gaplines - int(slider[1]/2)
        indices = indices // slider_step
        
        # Split images to sequence of vectors
        # + targets seq of labels per image in images seq
        images_seq = np.empty(len(images), dtype=object)
        targets_seq = np.empty(len(images), dtype=object)
        for i, img in enumerate(images):
            images_seq[i] = [imgprocess(img[:, loc * slider_step: loc * slider_step + slider[1]].flatten())
                             for loc in range(length[i])]
            
            targets_seq[i] = np.ones((length[i])) * NEG
            for offset in range(gap_span):
                ind = indices[i] + (-(offset % 2) * offset // 2) + ((1 - offset%2) * offset // 2) 
                
                if ind[0] < 0:
                    ind[0] = 0
                if ind[-1] >= length[i]:
                    ind[-1] = length[i] - 1
                    
                targets_seq[i][ind] = POS  

        # Create pandas dataFrame and sort it by images seq lenght (length) 
        # in_length == out_length
        self.dataFrame = pd.DataFrame({'length': length,
                                       'images': images_seq,
                                       'targets': targets_seq
                                      }).sort_values('length').reset_index(drop=True)

        bsize = int(len(images) / num_buckets)
        self.num_buckets = num_buckets
        
        # Create buckets by slicing parts by indexes
        self.buckets = []
        for bucket in range(num_buckets-1):
            self.buckets.append(self.dataFrame.iloc[bucket * bsize: (bucket+1) * bsize])
        self.buckets.append(self.dataFrame.iloc[(num_buckets-1) * bsize:])        
        
        self.buckets_size = [len(bucket) for bucket in self.buckets]

        # cursor[i] will be the cursor for the ith bucket
        self.cursor = np.array([0] * num_buckets)
        self.bucket_order = np.random.permutation(num_buckets)
        self.bucket_cursor = 0
        self.shuffle()
        print("Iterator created.")


    def shuffle(self, idx=None):
        """ Shuffle idx bucket or each bucket separately """
        for i in [idx] if idx is not None else range(self.num_buckets):
            self.buckets[i] = self.buckets[i].sample(frac=1).reset_index(drop=True)
            self.cursor[i] = 0


    def next_batch(self, batch_size):
        """
        Creates next training batch of size: batch_size
        Retruns: image seq, letter seq, seq lengths
        """
        i_bucket = self.bucket_order[self.bucket_cursor]
        # Increment cursor and shuffle in case of new round
        self.bucket_cursor = (self.bucket_cursor + 1) % self.num_buckets
        if self.bucket_cursor == 0:
            self.bucket_order = np.random.permutation(self.num_buckets)
            
        if self.cursor[i_bucket] + batch_size > self.buckets_size[i_bucket]:
            self.shuffle(i_bucket)

        # Handle too big batch sizes
        if (batch_size > self.buckets_size[i_bucket]):
            batch_size = self.buckets_size[i_bucket]

        res = self.buckets[i_bucket].iloc[self.cursor[i_bucket]:
                                          self.cursor[i_bucket]+batch_size]
        self.cursor[i_bucket] += batch_size

        # PAD input sequence and output
        # Pad sequences with <PAD> to same length
        max_length = max(res['length'])
        
        input_seq = np.zeros((batch_size, max_length, N_INPUT), dtype=np.float32)
        for i, img in enumerate(res['images']):
            input_seq[i][:res['length'].values[i]] = img
        
        # Need to pad according to the maximum length output sequence
        targets = np.ones([batch_size, max_length], dtype=np.float32) * PAD
        for i, target in enumerate(targets):
            target[:res['length'].values[i]] = res['targets'].values[i]
        
        return input_seq, targets, res['length'].values


    def next_feed(self, size):
        """ Create feed directly for model training """
        (inputs_,
         targets_,
         length_) = self.next_batch(size)
        return {
            inputs: inputs_,            
            targets: targets_,
            length: length_,
            keep_prob: (1.0 - dropout) if self.train else 1.0
        }

In [6]:
# Create iterator for feeding BiRNN
train_iterator = BucketDataIterator(trainImages,
                                    trainGaplines,
                                    POS_SPAN,
                                    num_buckets,
                                    slider_size,
                                    slider_step,
                                    train=True)
test_iterator = BucketDataIterator(testImages,
                                   testGaplines,
                                   POS_SPAN,
                                   1,
                                   slider_size,
                                   slider_step,
                                   train=False)


Iterator created.
Iterator created.

Create classifier

Inputs


In [7]:
# Input placehodlers
# N_INPUT -> size of vector representing one image in sequence
# Inputs shape (batch_size, max_seq_length, vec_size) - time major
inputs = tf.placeholder(shape=(None, None, N_INPUT),
                                dtype=tf.float32,
                                name='inputs')
length = tf.placeholder(shape=(None,),
                        dtype=tf.int32,
                        name='length')
# Required for training, not required for application
targets = tf.placeholder(shape=(None, None),
                         dtype=tf.int64,
                         name='targets')
# Dropout value
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

Standardization + CNN


In [8]:
# Help functions for standard layers
def conv2d(x, W, name=None):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME', name=name)

def max_pool_2x2(x, name=None):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)

# 1. Layer - Convulation variables
W_conv1 = tf.get_variable('W_conv1', shape=[5, 5, 1, 2],
                          initializer=tf.contrib.layers.xavier_initializer())
b_conv1 = tf.Variable(tf.constant(0.1, shape=[2]), name='b_conv1')
# 3. Layer - Convulation variables
W_conv2 = tf.get_variable('W_conv2', shape=[5, 5, 2, 4],
                          initializer=tf.contrib.layers.xavier_initializer())
b_conv2 = tf.Variable(tf.constant(0.1, shape=[4]), name='b_conv2')

def CNN(x):
    x = tf.image.per_image_standardization(x)
    x = tf.reshape(x, [1, slider_size[0], slider_size[1], 1])
    # 1. Layer - Convulation
    h_conv1 = tf.nn.relu(conv2d(x, W_conv1) + b_conv1, name='h_conv1')
    # 2. Layer - Max Pool
    h_pool1 = max_pool_2x2(h_conv1, name='h_pool1')
    # 3. Layer - Convulation
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2, name='h_conv2')
    # 4. Layer - Max Pool
    return max_pool_2x2(h_conv2, name='h_pool2')

# Input images CNN
inpts = tf.map_fn(
    lambda seq: tf.map_fn(
        lambda img:
            tf.reshape(
                CNN(tf.reshape(img, [slider_size[0], slider_size[1], 1])),
#                 CNN(tf.reshape(img, [1, slider_size[0], slider_size[1], 1])),
                [-1]),
        seq),
    inputs,
    dtype=tf.float32)

Bi-RNN


In [9]:
# Cells
cell_fw = create_cell(rnn_units,
                      rnn_layers,
                      rnn_residual_layers,
                      is_dropout=True,
                      keep_prob=keep_prob)
cell_bw = create_cell(rnn_units,
                      rnn_layers,
                      rnn_residual_layers,
                      is_dropout=True,
                      keep_prob=keep_prob)

In [10]:
# Bidirectional RNN
bi_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw = cell_fw,
    cell_bw = cell_bw,
    inputs = inpts,
    sequence_length = length,
    dtype = tf.float32)

outputs = tf.concat(bi_outputs, -1, name='outputs')

# pred = tf.matmul(outputs, W)
# pred = tf.scan(lambda a, x: tf.matmul(x, W), outputs, infer_shape=False)
pred = tf.layers.dense(inputs=outputs,
                       units=n_classes,
                       name='pred')
prediction = tf.argmax(pred, axis=-1, name='prediction')

Optimizer


In [11]:
# Define loss and optimizer
weights = tf.multiply(targets, POS_WEIGHT) + 1
loss = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(
    logits=pred,
    labels=targets,
    weights=weights), name='loss')
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss, name='train_step')

# Evaluate model
correct_pred = tf.equal(prediction, targets)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

Training


In [12]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

# Creat plot for live stats ploting
trainPlot = TrainingPlot(TRAIN_STEPS, TEST_ITER, LOSS_ITER)

try:
    for i_batch in range(TRAIN_STEPS):
        fd = train_iterator.next_feed(BATCH_SIZE)
        train_step.run(fd)
        
        if i_batch % LOSS_ITER == 0:
            # Plotting loss
            tmpLoss = loss.eval(fd)
            trainPlot.updateCost(tmpLoss, i_batch // LOSS_ITER)
    
        if i_batch % TEST_ITER == 0:
            # Plotting accuracy
            fd_test = test_iterator.next_feed(BATCH_SIZE)
            accTest = accuracy.eval(fd_test)
            accTrain = accuracy.eval(fd)
            trainPlot.updateAcc(accTest, accTrain, i_batch // TEST_ITER)

        if i_batch % SAVE_ITER == 0:
            saver.save(sess, save_loc)
        
except KeyboardInterrupt:
    saver.save(sess, save_loc)
    print('Training interrupted, model saved.')


fd_test = test_iterator.next_feed(2*BATCH_SIZE)
accTest = accuracy.eval(fd_test)
print("Training finished with accuracy:", accTest)


Training interrupted, model saved.
Training finished with accuracy: 0.975385