In [1]:
import os
import sys
import argparse
import math
import logging
import random
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline
# Default size (in inches) of every matplotlib figure rendered in this notebook.
plt.rcParams['figure.figsize'] = (10, 10)
# Root directory of the dataset: the annotation_*.txt files are read from here
# and the image paths they list are joined onto it.
data_path = '/home/ubuntu/data/oxford_syntetic_text/mnt/ramdisk/max/90kDICT32px'
# Directory that receives this experiment's TensorBoard summaries and checkpoints.
experiment_dir ='/home/ubuntu/data/oxford_syntetic_text/models/test03'
In [2]:
def dense_to_sparse(dense_tensor, out_type):
    """Build a ``tf.SparseTensor`` from a dense tensor that uses -1 as filler.

    Every element different from -1 is kept as an explicit sparse value; the
    elements equal to -1 are dropped.

    Args:
        dense_tensor: Dense tensor whose filler entries are -1.
        out_type: Integer dtype used for the ``dense_shape`` of the result.

    Returns:
        A ``tf.SparseTensor`` with the kept values and the shape of
        ``dense_tensor``.
    """
    filler = tf.constant(-1, dense_tensor.dtype)
    keep_mask = tf.not_equal(dense_tensor, filler)
    kept_indices = tf.where(keep_mask)
    return tf.SparseTensor(
        kept_indices,
        tf.gather_nd(dense_tensor, kept_indices),
        tf.shape(dense_tensor, out_type=out_type),
    )
def decode_word(l, decoder_dict, blank_code=-1):
    """Turn a sequence of class codes into the word it spells.

    Args:
        l: Iterable of integer codes.
        decoder_dict: Mapping from code to character.
        blank_code: Code that is skipped instead of decoded (padding marker).

    Returns:
        The decoded characters concatenated into one string.

    Raises:
        KeyError: If a non-blank code is missing from ``decoder_dict``.
    """
    chars = []
    for code in l:
        if code != blank_code:
            chars.append(decoder_dict[code])
    return ''.join(chars)
def evaluate_wer(pred, real):
    """Compute the word error rate between predicted and reference words.

    A pair counts as an error when the two words differ. The count is divided
    by ``len(pred)``.

    Args:
        pred: Sequence of predicted words.
        real: Sequence of reference words, aligned with ``pred``.

    Returns:
        Fraction of mismatching pairs, or ``0.0`` when ``pred`` is empty
        (previously this raised ``ZeroDivisionError``).
    """
    if len(pred) == 0:
        return 0.0

    errors = 0
    for p, r in zip(pred, real):
        try:
            if p != r:
                errors += 1
        except (TypeError, ValueError):
            # Best effort: report pairs that cannot be compared (e.g. arrays
            # whose comparison has no single truth value) and keep going.
            # The former bare ``except`` also swallowed KeyboardInterrupt.
            print(p, r)
    return errors / len(pred)
In [3]:
# Annotation files: space-separated rows read into the columns 'file' and 'n'.
df_train = pd.read_csv(
    os.path.join(data_path, 'annotation_train.txt'),
    delimiter=' ',
    names=['file', 'n'],
)
df_val = pd.read_csv(
    os.path.join(data_path, 'annotation_val.txt'),
    delimiter=' ',
    names=['file', 'n'],
)
df_test = pd.read_csv(
    os.path.join(data_path, 'annotation_test.txt'),
    delimiter=' ',
    names=['file', 'n'],
)
print('Images train: ', len(df_train))
print('Images valid: ', len(df_val))
print('Images test: ', len(df_test))

# Cuda devices: order GPUs by PCI bus id and expose only device 0.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# Let TensorFlow grow its GPU memory allocation on demand.
gpu_options = tf.GPUOptions(allow_growth=True)
# Alphabet known to the model: digits first, then upper-case, then lower-case letters.
char_list = list(
    '0123456789'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
)
# character -> class index (position in char_list)
encoder_dict = {char: code for code, char in enumerate(char_list)}
# class index -> character (inverse of encoder_dict)
decoder_dict = {code: char for char, code in encoder_dict.items()}
# One class more than the alphabet size (the extra class is used as the CTC blank
# by the model defined later in the notebook).
num_classes = len(decoder_dict) + 1
In [4]:
def adjust_image(img_file, x_size=192, y_size=48, x_blanks_ini=0):
    """Load an image, scale it to ``y_size`` rows and zero-pad it to ``x_size`` columns.

    The image is resized so its height becomes ``y_size`` while keeping the
    aspect ratio; the resulting width is capped at ``x_size - x_blanks_ini``.
    ``x_blanks_ini`` zero columns are put in front of it and zero columns are
    appended until the width is ``x_size``. For multi-channel images only the
    first channel is kept.

    Args:
        img_file: Path (or file object) accepted by ``PIL.Image.open``.
        x_size: Width of the returned array.
        y_size: Height of the returned array.
        x_blanks_ini: Number of zero columns inserted before the image.

    Returns:
        A float array of shape ``(y_size, x_size)`` holding the un-normalised
        pixel values, or an empty list ``[]`` when the source image or the
        adjusted result has no pixel above zero.
    """
    # The context manager releases the file handle, which a bare Image.open()
    # leaves open until the image object is garbage collected.
    with Image.open(img_file) as im:
        if not np.max(im) > 0:
            return []
        x, y = im.size
        factor = y_size / y
        new_x = min(max(1, int(factor * x)), x_size - x_blanks_ini)
        # Resize once and reuse the pixels (previously resized up to three times).
        resized = np.array(im.resize((new_x, y_size)))

    # Keep only the first channel of multi-channel images.
    img = resized[:, :, 0] if resized.ndim == 3 else resized

    img_adjusted = np.concatenate([np.zeros((y_size, x_blanks_ini)), img], axis=1)
    missing_cols = x_size - img_adjusted.shape[1]
    if missing_cols > 0:
        img_adjusted = np.concatenate([img_adjusted, np.zeros((y_size, missing_cols))], axis=1)

    if np.max(img_adjusted) > 0:
        return img_adjusted
    return []
def read_word_image(f, target_c, x_size=192, y_size=48, target_max_size=19):
    '''Load one word image and encode its transcription.

    Args:
        f: Image path handed to ``adjust_image``.
        target_c: Transcription of the word; every character must be a key of
            ``encoder_dict``.
        x_size: Width of the adjusted image.
        y_size: Height of the adjusted image.
        target_max_size: Fixed length of the encoded target; longer
            transcriptions are truncated, shorter ones padded with -1.

    Returns:
        ``(image, target, image_len, target_len)`` where ``image`` is the
        adjusted image divided by 255, ``target`` is a list of
        ``target_max_size`` codes padded with -1, ``image_len`` is the index of
        the right-most column containing a non-zero pixel and ``target_len`` is
        the number of real codes in ``target``.
        ``([], None, None, None)`` when the image could not be adjusted.

    Raises:
        KeyError: If ``target_c`` contains a character missing from
            ``encoder_dict``.
    '''
    # adjust and resize to the final size
    img_adjusted = adjust_image(f, x_size=x_size, y_size=y_size)

    # adjust_image signals failure with an empty list. `ndarray != []` is a
    # non-broadcastable elementwise comparison (an error on current NumPy), so
    # test the length instead; this works for both a list and an array.
    if len(img_adjusted) == 0:
        return [], None, None, None

    # Index of the last column that holds any non-zero pixel.
    image_len = np.max(np.nonzero(np.max(img_adjusted, axis=0)))

    # Encode the transcription.
    # TODO: handle over-long targets better than plain truncation.
    target_ini = [encoder_dict[k] for k in target_c][:target_max_size]
    target_len = len(target_ini)

    # A signed dtype is required to store the -1 padding: multiplying a uint8
    # array by -1 relied on legacy type promotion and fails on NumPy 2.
    target = np.full([target_max_size], -1, dtype=np.int32)
    target[:target_len] = target_ini

    return img_adjusted / 255, list(target), image_len, target_len
In [ ]:
In [5]:
def data_generator(file_list, path_files=data_path, batch_size=16, max_files=0):
    """Yield shuffled batches of word images with their encoded targets.

    The transcription of each image is taken from its file name: the text
    between the first and second underscore of the base name.

    Args:
        file_list: Image paths relative to ``path_files``. The list is not
            modified; a shuffled copy is used.
        path_files: Root directory the relative paths are joined onto.
        batch_size: Number of samples per yielded batch.
        max_files: Upper bound on the number of files consumed; 0 means all.

    Yields:
        ``(images, image_lens, targets, target_lens)`` where ``images`` is an
        array of shape ``(batch_size, 48, 192, 1)`` and the other three are
        lists of length ``batch_size``. Only full batches are yielded; images
        that ``read_word_image`` rejects are skipped, so fewer batches than
        ``len(file_list) // batch_size`` may be produced.
    """
    num_batches = len(file_list) // batch_size
    if max_files != 0:
        num_batches = min(max_files // batch_size, num_batches)

    # Shuffle a copy so the caller's list is not reordered as a side effect.
    files = list(file_list)
    np.random.shuffle(files)

    images_batch, images_len_batch, target_batch, target_len_batch = [], [], [], []
    for rel_path in files[:num_batches * batch_size]:
        # Join the root exactly once (it used to be joined twice, which only
        # worked because an absolute second component discards the first).
        f = os.path.join(path_files, rel_path)
        target_c = os.path.basename(f).split('.')[0].split('_')[1]
        img, t, img_l, t_l = read_word_image(f, target_c, x_size=192, y_size=48, target_max_size=19)
        if t is None:
            # Blank/unusable image: skipping it avoids the reshape error an
            # empty image would trigger below.
            continue

        images_batch.append(np.reshape(img, (48, 192, 1)))
        images_len_batch.append(img_l)
        target_batch.append(t)
        target_len_batch.append(t_l)

        if len(images_batch) == batch_size:
            yield np.array(images_batch), images_len_batch, target_batch, target_len_batch
            images_batch, images_len_batch, target_batch, target_len_batch = [], [], [], []
# Smoke test: pull a single batch from the test split and show the shape of
# the image array.
trn_generator = data_generator(
    list(df_test.file),
    path_files=data_path,
    batch_size=16,
    max_files=100,
)
(images_b, images_len_b, target_b, target_len_b) = next(trn_generator)
print(images_b.shape)
In [6]:
# Hyper-parameters of the stacked 1-D convolutions: one (filters, kernel,
# dilation) triple per layer.
filters_list = [512,512,512,512,512]
kernels_list = [3,3,3,3,3]
# NOTE(review): six dilations for five layers; zip() below stops at the
# shortest list, so the trailing 16 is never used.
dilations_list = [1,1,2,4,8,16]

# Model
graph = tf.Graph()
with graph.as_default():
    # Placeholders
    with tf.name_scope('inputs') as scope:
        images_batch_ph = tf.placeholder(tf.float32, shape=[None, 48, 192, 1], name='images_batch_ph')
        image_len_ph = tf.placeholder(tf.int32, shape=[None], name='image_len_ph')
        labels_batch_ph = tf.placeholder(tf.int32, shape=[None, 19], name='labels_batch_ph')
        labels_len_batch_ph = tf.placeholder(tf.int32, shape=[None], name='labels_len_batch_ph')

        # Convert the -1 padded dense targets to the sparse form CTC expects.
        target = tf.cast(dense_to_sparse(labels_batch_ph, tf.int64), tf.int32)
        target_len = tf.cast(labels_len_batch_ph, tf.int32)

    with tf.name_scope('model') as scope:
        # Two blocks of (5x5 conv, 5x5 conv, 2x2 max-pool) with 20 and 50 filters.
        # Each pooling halves both spatial dimensions, so a 48x192 input ends up
        # as 12x48 feature maps.
        conv2d = tf.layers.conv2d(images_batch_ph, 20, 5, padding='SAME')
        conv2d = tf.layers.conv2d(conv2d, 20, 5, padding='SAME')
        conv2d = tf.layers.max_pooling2d(conv2d, 2, 2)

        conv2d = tf.layers.conv2d(conv2d, 50, 5, padding='SAME')
        conv2d = tf.layers.conv2d(conv2d, 50, 5, padding='SAME')
        conv2d = tf.layers.max_pooling2d(conv2d, 2, 2)

        # Split by filter and swap height/width so the image width becomes the
        # sequence axis: each piece goes from [batch, height, width] to
        # [batch, width, height].
        conv2d_unstack = tf.unstack(conv2d, axis=-1)
        conv1_list = []
        for conv2_filter in conv2d_unstack:
            conv2_filter_transpose = tf.transpose(conv2_filter, (0, 2, 1))
            conv1_list += [conv2_filter_transpose]

        # Concatenate the per-filter columns: [batch, width, height * filters].
        convf = tf.concat(conv1_list, axis=-1)

        # Stacked dilated 1-D convolutions; every layer's output is concatenated
        # with its input, so the feature dimension grows layer by layer.
        for filters, kernel, dilation in zip(filters_list, kernels_list, dilations_list):
            convf_out = tf.layers.conv1d(convf, filters=filters, kernel_size=[kernel], activation=tf.nn.relu, padding='SAME', dilation_rate=[dilation])
            convf = tf.concat([convf_out, convf], axis=-1)

        # Project every time step onto the num_classes outputs.
        # NOTE(review): the ReLU keeps these pre-softmax scores non-negative;
        # confirm this is intended rather than a linear projection.
        logits_input = tf.layers.conv1d(convf, filters=num_classes, kernel_size=[1], activation=tf.nn.relu, padding='SAME')

    # Create logits
    with tf.name_scope("Logit") as scope:
        # Time major [t, b, NClasses+1], as required by the CTC ops.
        logits = tf.transpose(logits_input, (1, 0, 2), name='logits')

    # Create CTC loss
    with tf.name_scope("loss") as scope:
        # Every example uses all time steps of the logits. The length was
        # hard-coded to 35 (the width/4 of an older 140-px input); with the
        # current 192-px input there are 48 steps, so the right-most frames
        # were silently dropped by both the loss and the decoder.
        max_time = tf.shape(logits)[0]
        sequence_len = tf.ones_like(image_len_ph) * max_time
        sequence_len = tf.cast(sequence_len, tf.int32)
        loss = tf.nn.ctc_loss(target, logits, sequence_len, ignore_longer_outputs_than_inputs=True)
        cost = tf.reduce_mean(loss, name='cost')
        cost_summary = tf.summary.scalar("cost", cost)

    # Optimizer
    with tf.name_scope("train") as scope:
        global_step = tf.Variable(0, trainable=False)
        optimizer = tf.train.MomentumOptimizer(learning_rate=0.0001, momentum=0.97)
        gvs = optimizer.compute_gradients(cost)
        # Clip every gradient element to [-1, 1] before applying it.
        capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
        train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

    # decode CTC
    with tf.name_scope("predict") as scope:
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, sequence_len, merge_repeated=False)
        prediction = tf.cast(decoded[0], tf.int32, name='prediction')
        # Dense [batch, max_decoded_len] predictions padded with -1.
        dense_prediction = tf.sparse_to_dense(prediction.indices, prediction.dense_shape,
                                              prediction.values, default_value=-1)

    # "accuracy" is the mean edit distance between prediction and target
    # (a character error rate: lower is better).
    with tf.name_scope("accuracy") as scope:
        accuracy = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), target), name='acuracy')
        accuracy_summary = tf.summary.scalar("accuracy", accuracy)

    # Summaries
    summaries_dir = os.path.join(experiment_dir)
    merged = tf.summary.merge_all()

    # Register the tensors needed to feed and query a restored model.
    tf.add_to_collection('images_batch_ph', images_batch_ph)
    tf.add_to_collection('image_len_ph', image_len_ph)
    tf.add_to_collection('labels_batch_ph', labels_batch_ph)
    tf.add_to_collection('labels_len_batch_ph', labels_len_batch_ph)
    tf.add_to_collection('logits', logits)
    tf.add_to_collection('dense_prediction', dense_prediction)

    # Saver
    saver = tf.train.Saver(max_to_keep=5)

print('Model created!')
In [7]:
def train_step(epoch, decoder_dict):
    '''Run one training pass over the first 150000 training files.

    Uses the notebook-level ``sess``, graph tensors and ``train_writer``.
    Every 500 steps a summary is written (tagged with ``epoch``), the running
    mean cost / CER since the last report is printed together with the WER of
    the current batch, and seven predicted-vs-real examples are shown.

    Args:
        epoch: Epoch number, used for logging and as the summary step.
        decoder_dict: Mapping from class code to character.
    '''
    step = 1
    running_cost = []
    running_cer = []
    train_generator = data_generator(list(df_train.file)[:150000])
    for img, img_len, t, t_len in train_generator:
        feed = {images_batch_ph: img, image_len_ph: img_len,
                labels_batch_ph: t, labels_len_batch_ph: t_len}

        _, ce, acc = sess.run([train_op, cost, accuracy], feed_dict=feed)
        running_cost.append(ce)
        running_cer.append(acc)
        step += 1

        if step % 500 != 0:
            continue

        # Periodic report: summaries and predictions for the batch just trained on.
        summary_str, pred = sess.run([merged, dense_prediction], feed_dict=feed)
        train_writer.add_summary(summary_str, epoch)

        pred = [decode_word(codes, decoder_dict) for codes in pred]
        real = [decode_word(codes, decoder_dict) for codes in t]
        wer = evaluate_wer(pred, real)
        print('TRAIN - Epoch:', epoch,
              ' - Step:', step,
              ' - Cost:', np.mean(running_cost),
              ' - CER:', np.mean(running_cer),
              ' - WER (step):', wer)

        # Start a fresh averaging window.
        running_cost = []
        running_cer = []

        print('\nTrain examples pred vs real:')
        for k in range(7):
            print(pred[k], ' - ', real[k])
In [8]:
def eval_step(decoder_dict):
    '''Evaluate the model on the first 2048 validation files.

    Uses the notebook-level ``sess``, graph tensors and ``test_writer``, and
    reads the notebook-level ``epoch`` as the step of the written summary.
    The summary is computed from the last evaluated batch only.

    Args:
        decoder_dict: Mapping from class code to character.

    Returns:
        ``(cost_l, cer_l, wer_l)``: per-batch costs, per-batch character error
        rates, and a one-element list with the word error rate over the whole
        evaluation set (all three are empty if no batch was produced).
    '''
    cost_l = []
    cer_l = []
    wer_l = []
    pred_l = []
    real_l = []
    feed = None
    val_generator = data_generator(list(df_val.file)[:2048])
    for img, img_len, t, t_len in val_generator:
        feed = {images_batch_ph: img, image_len_ph: img_len,
                labels_batch_ph: t, labels_len_batch_ph: t_len}
        ce, acc, pred = sess.run([cost, accuracy, dense_prediction], feed_dict=feed)
        cost_l.append(ce)
        cer_l.append(acc)
        pred_l += [decode_word(w, decoder_dict) for w in pred]
        real_l += [decode_word(w, decoder_dict) for w in t]

    if feed is None:
        # No batch was produced: nothing to summarise or report.
        return cost_l, cer_l, wer_l

    # Summaries eval (last batch).
    summary_str = sess.run(merged, feed_dict=feed)
    test_writer.add_summary(summary_str, epoch)

    wer = evaluate_wer(pred_l, real_l)
    # Record the WER so the returned list is no longer always empty.
    wer_l.append(wer)
    print('TEST - Cost:', np.mean(cost_l),
          ' - CER:', np.mean(cer_l),
          ' - WER:', wer)

    print('\nTest examples pred vs real:')
    for i in range(min(10, len(pred_l))):
        print(pred_l[i], ' - ', real_l[i])
    return cost_l, cer_l, wer_l
In [9]:
num_epochs = 8

# Train the model
# NOTE(review): this replaces the allow_growth=True options created earlier in
# the notebook; confirm which setting is intended.
gpu_options = tf.GPUOptions(allow_growth = False)
with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=True)) as sess:
    train_writer = tf.summary.FileWriter(os.path.join(experiment_dir, 'train'), graph=graph)
    test_writer = tf.summary.FileWriter(os.path.join(experiment_dir, 'test'))
    try:
        # Initialize vars if there is no previous checkpoint, otherwise resume.
        ckpt = tf.train.get_checkpoint_state(experiment_dir)
        if ckpt is None:
            sess.run(tf.global_variables_initializer())
            print('vars initialized!')
            epoch_ini = 1
        else:
            # Load last model
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Checkpoints are named 'model-<epoch>'; resume at the next epoch.
            ckpt_suffix = os.path.basename(ckpt.model_checkpoint_path).split('-')[-1]
            if ckpt_suffix == 'best_model':
                epoch_ini = 1
            else:
                epoch_ini = int(ckpt_suffix) + 1
            print('model loaded: %s' % ckpt.model_checkpoint_path)

        # Compute for num_epochs.
        cost_val_l = []
        cer_val_l = []
        wer_val_l = []
        continue_training = True
        epoch = epoch_ini
        # Epochs are numbered from 1, so the last one is num_epochs itself
        # (the former `<` stopped one epoch early).
        while (epoch <= num_epochs and continue_training):
            # Train phase
            print('Training epoch:', epoch)
            train_step(epoch, decoder_dict)

            # Test phase
            print('Testing epoch:', epoch)
            cost_val, cer_val, wer_val = eval_step(decoder_dict)
            # Keep one number per epoch: the mean of the per-batch costs.
            cost_val_l += [np.mean(cost_val)]

            # Early stopping: stop when the best validation cost was reached
            # before the last 10 epochs, i.e. none of them improved on it.
            if len(cost_val_l) > 10:
                print('cost val %s' % cost_val_l)
                if np.min(cost_val_l) < np.min(cost_val_l[-10:]):
                    continue_training = False
                    print('STOPING TRAINING')

            # Save model
            save_path = saver.save(sess, os.path.join(experiment_dir, 'model'), global_step=epoch)
            print("Model saved in file: %s" % save_path)

            epoch += 1
    finally:
        # Flush pending events and release the summary files.
        train_writer.close()
        test_writer.close()
In [ ]: