In [7]:
import sys
import numpy as np
import pandas as pd
import cv2
import matplotlib as plt
import tensorflow as tf
import os
from imgaug import augmenters as iaa
sys.path.append('src')
from ocr.datahelpers import load_words_data, char2idx, CHAR_SIZE
from ocr.dataiterator import BucketDataIterator
from ocr.helpers import img_extend, resize
from ocr.mlhelpers import TrainingPlot
from ocr.tfhelpers import create_cell
In [3]:
%matplotlib notebook
plt.rcParams['figure.figsize'] = (9.0, 5.0)
In [5]:
# Each call returns an (images, labels) pair for one CSV split.
train_set = load_words_data('data/sets/train.csv', is_csv=True)
dev_set = load_words_data('data/sets/dev.csv', is_csv=True)

train_images, train_labels = train_set
dev_images, dev_labels = dev_set
In [8]:
# --- Model geometry --------------------------------------------------------
slider_size = (64, 64)  # Second parameter can be edited
layers = 4              # passed to create_cell for both RNN directions
residual_layers = 0     # passed to create_cell for both RNN directions
units = 512             # passed to create_cell for both RNN directions
num_buckets = 10        # bucket count handed to the training DataIterator
N_INPUT = slider_size[0] * slider_size[1]
vocab_size = CHAR_SIZE + 2  # Number of different chars + <PAD> and <EOS>

# --- Optimisation ----------------------------------------------------------
learning_rate = 1e-4
dropout = 0.4           # training feeds keep_prob = 1 - dropout

# --- Training schedule (intervals are counted in training batches) ---------
TRAIN_STEPS = 20000
# Interval at which the training loop samples the loss for TrainingPlot.
# The training cell requires this name; 50 is a reviewer-chosen default,
# adjust as needed.
LOSS_ITER = 50
TEST_ITER = 150         # evaluate on a dev batch + write TensorBoard summaries
SAVE_ITER = 500         # write a checkpoint
EPOCH = 500             # print loss / label error rate
BATCH_SIZE = 64

# --- Output locations ------------------------------------------------------
model_name = 'Classifier1'
save_loc = 'models/word-clas/CTC/'
summaries_dir = 'logs/word-clas/CTC/' + model_name

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(save_loc, exist_ok=True)
# From here on save_loc is the checkpoint path prefix, not the directory.
save_loc += model_name
In [10]:
# Per split: (images, text labels, labels encoded as lists of char indices).
# The third slot starts as an empty object array and is filled in below.
data = {
    'train': (train_images, train_labels, np.empty(len(train_labels), dtype=object)),
    'dev': (dev_images, dev_labels, np.empty(len(dev_labels), dtype=object)),
}

for split_images, split_labels, split_encoded in (data['train'], data['dev']):
    for idx in range(len(split_images)):
        # Images are replaced in place, so train_images / dev_images change too.
        # NOTE(review): resize is given slider_size[1] while the `inputs`
        # placeholder fixes its height to slider_size[0]; both are 64 today --
        # confirm which one is intended before changing slider_size.
        split_images[idx] = resize(split_images[idx], slider_size[1], True)
        split_encoded[idx] = [char2idx(ch) for ch in split_labels[idx]]

print("Training images:", len(train_images))
print("Testing images:", len(dev_images))
In [11]:
# Augmentation stages, built in the order they are applied.

# Elastic deformation, applied to 30 % of the images.
elastic_warp = iaa.Sometimes(
    0.3,
    iaa.ElasticTransformation(alpha=(0.5, 10.0), sigma=5.0))

# Exactly one of three blur flavours per image.
random_blur = iaa.OneOf([
    iaa.GaussianBlur((0, 0.5)),
    iaa.AverageBlur(k=(1, 3)),
    iaa.MedianBlur(k=(1, 3)),
])

# Additive Gaussian noise, applied to 30 % of the images.
pixel_noise = iaa.Sometimes(
    0.3,
    iaa.AdditiveGaussianNoise(scale=0.01*255))

seq = iaa.Sequential([elastic_warp, random_blur, pixel_noise])
In [12]:
class DataIterator(BucketDataIterator):
    """BucketDataIterator that packages batches as ready-to-use feed dicts.

    The feed keys are the notebook-level placeholders `inputs`,
    `inputs_length`, `targets` and `keep_prob`; they are looked up when
    `next_feed` is called, so they must exist by then.
    """

    def next_feed(self, batch_size):
        """Create feed directly for model training."""
        batch_images, batch_targets, batch_lengths = self.next_batch(batch_size)

        # Dropout is only active for iterators created with train=True.
        if self.train:
            keep = 1.0 - self.dropout
        else:
            keep = 1.0

        return {
            inputs: batch_images,
            inputs_length: batch_lengths,
            targets: batch_targets,
            keep_prob: keep,
        }
In [13]:
# Slot 0 of each split holds the images, slot 2 the index-encoded labels.
train_x, train_y = data['train'][0], data['train'][2]
dev_x, dev_y = data['dev'][0], data['dev'][2]

# Training batches: bucketed, augmented, and fed with dropout enabled.
train_iterator = DataIterator(
    train_x, train_y, num_buckets, slider_size,
    augmentation=seq, dropout=dropout, train=True)

# Evaluation batches: a single bucket, no augmentation, no dropout.
test_iterator = DataIterator(
    dev_x, dev_y, 1, slider_size,
    train=False)
In [14]:
# Input placeholders
# inputs: 4-D image batch with a fixed second dimension of slider_size[0] and
#   a single channel; the first and third dimensions are left dynamic. The
#   conv layers consume it with their default channels-last layout, i.e.
#   (batch, height, width, channels).
inputs = tf.placeholder(tf.float32,
                        shape=(None, slider_size[0], None, 1),
                        name='inputs')
# inputs_length: one int32 per sample in the batch.
inputs_length = tf.placeholder(tf.int32,
                               shape=(None,),
                               name='inputs_length')
# targets: sparse int32 label sequences, as consumed by the CTC loss.
targets = tf.sparse_placeholder(tf.int32, name='targets')
# keep_prob: dropout keep probability forwarded to the RNN cells.
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
In [ ]:
SCALE = 0.0003  # L2 regularisation scale shared by all conv kernels below
# Down-sampling factors; they count every stride-2 step (conv1 plus the pools).
h_n_pool = 16   # height: conv1, pool1, pool2, pool3
w_n_pool = 8    # width:  conv1, pool1, pool3 (pool2 leaves the width alone)


def _conv_relu(x, filters, kernel_size, strides=(1, 1)):
    """'same'-padded ReLU conv2d with Xavier init and L2 kernel regularisation.

    Layers must be created in the same order as before so that the
    auto-generated variable names (conv2d, conv2d_1, ...) do not change.
    """
    return tf.layers.conv2d(
        inputs=x,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding="same",
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=SCALE),
        activation=tf.nn.relu)


# 1. Convolution (the first layer halves both height and width)
conv1 = _conv_relu(inputs, 64, [7, 7], strides=(2, 2))
conv12 = _conv_relu(conv1, 64, [5, 5])

# 2. Max pool (height and width)
pool1 = tf.layers.max_pooling2d(conv12, pool_size=[2, 2], strides=2)

# 3. Second pair of convolutions
conv2 = _conv_relu(pool1, 64, [5, 5])
conv22 = _conv_relu(conv2, 128, [5, 5])

# 4. Max pool (height only)
pool2 = tf.layers.max_pooling2d(conv22, pool_size=[2, 1], strides=[2, 1])

# 5. Third pair of convolutions
conv3 = _conv_relu(pool2, 128, [5, 5])
conv32 = _conv_relu(conv3, 256, [5, 5])

# 6. Max pool (height and width)
pool3 = tf.layers.max_pooling2d(conv32, pool_size=[2, 2], strides=2)

# Image patches for RNN
patch_h = slider_size[0] // h_n_pool
patch_w = slider_size[1] // w_n_pool
image_patches1 = _conv_relu(pool3, 256, [patch_h, patch_w])

# No padding argument here, so the layer default applies; the kernel is
# patch_h tall, and the squeeze on axis 1 below requires the resulting
# height to be exactly 1.
image_patches = tf.layers.separable_conv2d(
    inputs=image_patches1,
    filters=1280,
    kernel_size=[patch_h, 1],
    strides=(1, 1),
    depth_multiplier=5,
    name='image_patches')

# Drop axis 1, then swap the first two axes so the sequence axis comes first
# (the RNN below is run with time_major=True).
processed_inputs = tf.transpose(
    tf.squeeze(image_patches, [1]),
    [1, 0, 2],
    name='squeeze_transpose')

# Sequence lengths after the width down-sampling.
lengths = inputs_length // w_n_pool
In [ ]:
# Forward and backward cells share the same configuration but are separate
# objects; forward is created first, as before.
cell_options = dict(is_dropout=True, keep_prob=keep_prob)
enc_cell_fw = create_cell(units, layers, residual_layers, **cell_options)
enc_cell_bw = create_cell(units, layers, residual_layers, **cell_options)

# Bidirectional pass over the time-major CNN features; final states are unused.
bi_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=enc_cell_fw,
    cell_bw=enc_cell_bw,
    inputs=processed_inputs,
    sequence_length=lengths,
    dtype=tf.float32,
    time_major=True)

# Join both directions on the feature axis and project to vocab_size logits.
con_outputs = tf.concat(bi_outputs, axis=-1)
logits = tf.layers.dense(con_outputs, vocab_size)

# Final outputs
decoded, log_prob = tf.nn.ctc_beam_search_decoder(
    inputs=logits,
    sequence_length=lengths,
    merge_repeated=False)
# Dense view of the best decoded path.
word_prediction = tf.sparse_tensor_to_dense(decoded[0], name='word_prediction')
In [17]:
# Per-sample CTC loss on the time-major logits.
ctc_loss = tf.nn.ctc_loss(
    labels=targets,
    inputs=logits,
    sequence_length=lengths,
    time_major=True,
    ctc_merge_repeated=True,
    ignore_longer_outputs_than_inputs=True)

# Total loss = batch-mean CTC loss + every registered regularisation term.
regularization = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
mean_ctc_loss = tf.reduce_mean(ctc_loss)
regularization_penalty = sum(regularization)
loss = tf.identity(mean_ctc_loss + regularization_penalty, name='loss')

optimizer = tf.train.AdamOptimizer(learning_rate)
train_step = optimizer.minimize(loss, name='train_step')

# Label error rate: batch mean of the edit distance between the best decoded
# path and the targets.
best_path = tf.cast(decoded[0], tf.int32)
edit_distances = tf.edit_distance(best_path, targets)
label_err_rate = tf.reduce_mean(edit_distances)
In [18]:
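# NOTE: everything in this cell is commented out (kept for reference only), so
# running it defines nothing -- in particular no `accuracy` tensor exists.
# pred_length = decoded[0].get_shape().as_list()[1]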
# targets_length = targets.get_shape().as_list()[1]
# pad_lenght = tf.maximum(pred_length, targets_length)
# pred_pad = tf.pad(
# word_prediction,
# [[0, 0],
# [0, pad_lenght - pred_length]],
# constant_values=PAD,
# mode='CONSTANT')
# targets_pad = tf.pad(
# test_targets,
# [[0, 0],
# [0, pad_lenght - targets_lenght]],
# constant_values=PAD,
# mode='CONSTANT')
# acc_weights = tf.cast(tf.divide(pred_pad, pred_pad), tf.float32)
# # acc_weights = tf.sequence_mask(
# # tf.subtract(final_seq_lengths, 1), # word_inputs_length, try max(targets, inputs)
# # word_pad_lenght,
# # dtype=tf.float32)
# correct_prediction = tf.equal(pred_pad, targets_pad)
# accuracy = (tf.reduce_sum(tf.cast(correct_prediction, tf.float32) * acc_weights) \
# / tf.reduce_sum(acc_weights))
In [ ]:
## Training
# Fresh session with freshly initialised variables. Note that `sess` is NOT
# installed as the default session, so tensors must be evaluated through
# sess.run(...) rather than Tensor.eval().
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

# TensorBoard stats
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Label_err_rate', label_err_rate)
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(summaries_dir + '/train',
                                     sess.graph)
test_writer = tf.summary.FileWriter(summaries_dir + '/test')

# I recommend using tensorboard, comment this out if you want
trainPlot = TrainingPlot(TRAIN_STEPS, TEST_ITER, LOSS_ITER)

try:
    for i_batch in range(TRAIN_STEPS):
        fd = train_iterator.next_feed(BATCH_SIZE)
        sess.run(train_step, feed_dict=fd)

        if i_batch % LOSS_ITER == 0:
            # Plotting loss (explicit sess.run: there is no default session)
            tmpLoss = sess.run(loss, feed_dict=fd)
            trainPlot.updateCost(tmpLoss, i_batch // LOSS_ITER)

        if i_batch % TEST_ITER == 0:
            # One forward pass per feed yields both the TensorBoard summary
            # and the label error rate, so the two always agree.
            fd_test = test_iterator.next_feed(BATCH_SIZE)
            testSummary, errTest = sess.run([merged, label_err_rate],
                                            feed_dict=fd_test)
            trainSummary, errTrain = sess.run([merged, label_err_rate],
                                              feed_dict=fd)

            # Plotting accuracy. No `accuracy` tensor is defined in this
            # notebook, so 1 - label error rate is used as a proxy.
            accTest = 1.0 - errTest
            accTrain = 1.0 - errTrain
            trainPlot.updateAcc(accTest, accTrain, i_batch // TEST_ITER)

            test_writer.add_summary(testSummary, i_batch)
            train_writer.add_summary(trainSummary, i_batch)

        if i_batch % SAVE_ITER == 0:
            saver.save(sess, save_loc, global_step=i_batch)

        if i_batch % EPOCH == 0:
            fd_test = test_iterator.next_feed(BATCH_SIZE)
            print('Batch %r - Loss: %r' % (i_batch, sess.run(loss, fd)))
            print(' Train Label Err Rate: %r' % sess.run(label_err_rate,
                                                         feed_dict=fd))
            print(' Eval Label Err Rate: %r' % sess.run(label_err_rate,
                                                        feed_dict=fd_test))
            print()
except KeyboardInterrupt:
    # Manual stop: keep whatever has been learned so far.
    print('Stopped on batch:', i_batch)
    saver.save(sess, save_loc)
    print('Training interrupted, model saved.')
finally:
    # Push buffered summaries to disk whether training finished, was
    # interrupted, or failed.
    train_writer.flush()
    test_writer.flush()
In [ ]:
# `targets` is a sparse placeholder, so it cannot be iterated sample by sample
# once fetched. Build a dense view once (outside the loop, so that only one op
# is added to the graph) and fetch that next to the dense predictions.
dense_targets = tf.sparse_tensor_to_dense(targets)

for batch_no in range(5):
    fd_test = test_iterator.next_feed(BATCH_SIZE)
    predict_, target_ = sess.run([word_prediction, dense_targets], fd_test)
    # Print only the first two samples of every batch.
    for inp, pred in zip(target_[:2], predict_[:2]):
        print(' expected > {}'.format(inp))
        print(' predicted > {}'.format(pred))
    print()