In [1]:
#reset python environment
%reset -f
from pathlib import Path
import numpy as np
import tensorflow as tf
import time
import os
current_dir = os.getcwd()
home_directory = Path(os.getcwd())
dataset_directory = home_directory / "datasets" / "dogs-vs-cats-redux-kernels-edition"
training_dataset_dir = dataset_directory / "train"
validation_dataset_dir = dataset_directory / "valid"
test_dataset_dir = dataset_directory / "test1"
sample_dataset_directory = home_directory / "datasets" / "dogs-vs-cats-redux-kernels-edition" / "sample"
sample_training_dataset_dir = sample_dataset_directory / "train"
sample_validation_dataset_dir = sample_dataset_directory / "valid"
sample_test_dataset_dir = sample_dataset_directory / "test1"
dogs_dir = "dog"
cats_dir = "cat"
default_device = "/gpu:0"
# default_device = v/cpu:0"
In [7]:
from zipfile import ZipFile

# Create base directory; exist_ok makes the cell safe to re-run
dataset_directory.mkdir(parents=True, exist_ok=True)

# Kaggle's train.zip and test1.zip have to be present in ./zips/dogs-vs-cats-redux-kernels-edition/
zips_directory = Path("zips") / "dogs-vs-cats-redux-kernels-edition"
with ZipFile(str(zips_directory / "train.zip")) as train_zip:
    train_zip.extractall(dataset_directory)
with ZipFile(str(zips_directory / "test1.zip")) as test_zip:
    test_zip.extractall(dataset_directory)
In [8]:
import os
import shutil
from glob import glob
# Fraction of the training images held out as the validation set
valid_percentage = 0.1
# Fraction of images copied into the small "sample" dataset for fast iteration
sample_percentage = 0.1
def pick_random(files, percentage, target_dir, move=False):
    """Randomly select a fraction of `files` and move or copy them into `target_dir`.

    Args:
        files: sequence of pathlib.Path objects to choose from.
        percentage: fraction (0..1) of the files to select; the count is truncated to int.
        target_dir: destination directory (pathlib.Path); must already exist.
        move: when True, rename the files into place; otherwise copy them.
    """
    # Shuffle the whole list, then keep the leading slice -- sampling
    # without replacement via numpy's global RNG.
    picked = np.random.permutation(files)[:int(len(files) * percentage)]
    for src in picked:
        if move:
            src.rename(target_dir / src.name)
        else:
            shutil.copy(str(src), str(target_dir / src.name))
try:
    # Create per-class directories for training and validation images
    cats_training_dataset_dir = training_dataset_dir / cats_dir
    dogs_training_dataset_dir = training_dataset_dir / dogs_dir
    cats_training_dataset_dir.mkdir()
    dogs_training_dataset_dir.mkdir()
    cats_validation_dataset_dir = validation_dataset_dir / cats_dir
    dogs_validation_dataset_dir = validation_dataset_dir / dogs_dir
    cats_validation_dataset_dir.mkdir(parents=True)
    dogs_validation_dataset_dir.mkdir(parents=True)

    # Move each class into its own directory
    # (Kaggle ships the images flat, named cat.*.jpg / dog.*.jpg)
    for f in training_dataset_dir.glob("cat.*.jpg"):
        f.rename(cats_training_dataset_dir / f.name)
    for f in training_dataset_dir.glob("dog.*.jpg"):
        f.rename(dogs_training_dataset_dir / f.name)

    # Move randomly picked validation files out of the training set
    pick_random(
        list(cats_training_dataset_dir.glob("*.jpg")), valid_percentage,
        cats_validation_dataset_dir, move=True)
    pick_random(
        list(dogs_training_dataset_dir.glob("*.jpg")), valid_percentage,
        dogs_validation_dataset_dir, move=True)

    # Create directories for sample data
    cats_sample_training_dataset_dir = sample_training_dataset_dir / cats_dir
    dogs_sample_training_dataset_dir = sample_training_dataset_dir / dogs_dir
    cats_sample_training_dataset_dir.mkdir(parents=True)
    dogs_sample_training_dataset_dir.mkdir(parents=True)
    cats_sample_validation_dataset_dir = sample_validation_dataset_dir / cats_dir
    dogs_sample_validation_dataset_dir = sample_validation_dataset_dir / dogs_dir
    cats_sample_validation_dataset_dir.mkdir(parents=True)
    dogs_sample_validation_dataset_dir.mkdir(parents=True)
    sample_test_dataset_dir.mkdir(parents=True)

    # Copy (not move -- the full set stays intact) randomly picked
    # training and test files into the sample dataset
    pick_random(
        list(cats_training_dataset_dir.glob("*.jpg")), sample_percentage,
        cats_sample_training_dataset_dir, move=False)
    pick_random(
        list(dogs_training_dataset_dir.glob("*.jpg")), sample_percentage,
        dogs_sample_training_dataset_dir, move=False)
    pick_random(
        list(test_dataset_dir.glob("*.jpg")), sample_percentage,
        sample_test_dataset_dir, move=False)

    # Copy randomly picked sample-validation files
    pick_random(
        list(cats_sample_training_dataset_dir.glob("*.jpg")), valid_percentage,
        cats_sample_validation_dataset_dir, move=False)
    pick_random(
        list(dogs_sample_training_dataset_dir.glob("*.jpg")), valid_percentage,
        dogs_sample_validation_dataset_dir, move=False)

    print("Done. Validation and sample sets created.")
except FileExistsError:
    # mkdir() without exist_ok doubles as the "already prepared" detector
    print("Error: Looks like data has already been prepared. Delete everything except the zip files to recreate.")
In [16]:
from glob import glob
def filenames_and_labels(path):
    """Collect cat and dog image paths under `path` along with one-hot labels.

    Args:
        path: dataset directory containing cat/ and dog/ subdirectories of jpgs.

    Returns:
        (filenames, labels): numpy arrays with cats first (label [1, 0])
        followed by dogs (label [0, 1]).
    """
    def class_files(class_name, one_hot):
        # One row of the one-hot vector per matching file
        files = np.array(glob("{}/{}/*.jpg".format(path, class_name)))
        return files, np.tile(one_hot, (len(files), 1))

    cat_files, cat_labels = class_files("cat", [1, 0])
    dog_files, dog_labels = class_files("dog", [0, 1])
    return np.concatenate([cat_files, dog_files]), np.concatenate([cat_labels, dog_labels])
In [150]:
import time
import tensorflow as tf
import tensorflow_image_utils as tiu
from vgg16 import Vgg16Model
def extract_features(*, sess, directory, output_filename, batch_size=32, augment=False, input_epochs=1):
    """Run every image under `directory` through VGG16 and save pooled features to disk.

    Args:
        sess: tf.Session whose (fresh) graph the input pipeline and model are built into.
        directory: dataset directory with cat/ and dog/ subdirectories of jpgs.
        output_filename: .npy file receiving an object array of (label, filename, features) tuples.
        batch_size: number of images per forward pass.
        augment: apply random distortions before preprocessing (useful for training data).
        input_epochs: number of passes over the file list the input queue produces.
    """
    filenames, labels = filenames_and_labels(directory)

    # TF1 queue-based input pipeline; shuffle=False keeps filenames and labels aligned,
    # and the producer raises OutOfRangeError after input_epochs passes.
    filename_queue, label_queue = tf.train.slice_input_producer(
        [
            tf.convert_to_tensor(filenames, dtype=tf.string),
            tf.convert_to_tensor(labels, dtype=tf.float32)
        ], num_epochs=input_epochs, shuffle=False)

    image = tiu.load_image(filename_queue, size=(224, 224))
    if augment:
        image = tiu.distort_image(image)
    image = tiu.vgg16_preprocess(image, shape=(224, 224, 3))

    batched_data = tf.train.batch(
        [image, label_queue, filename_queue],
        batch_size=batch_size,
        num_threads=4,
        enqueue_many=False,
        allow_smaller_final_batch=True,
        capacity=3 * batch_size)

    inputs = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name="input")
    model = Vgg16Model()
    model.build(inputs)

    # Build the flatten op ONCE: maxpool5 (7, 7, 512) -> 7 * 7 * 512.
    # Previously this tf.reshape was created inside the fetch loop, adding a new
    # node to the graph on every iteration and progressively bloating the session.
    flattened = tf.reshape(model.max_pool5, shape=(-1, 7 * 7 * 512))

    sess.run([
        tf.local_variables_initializer(),
        tf.global_variables_initializer()
    ])

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    codes = []
    num_unique_files = len(filenames)
    num_files_to_process = num_unique_files * input_epochs
    # Ceiling division for the progress display
    num_iterations = num_files_to_process // batch_size
    if num_files_to_process % batch_size != 0:
        num_iterations = num_iterations + 1

    current_iteration = 0
    tstart = time.perf_counter()
    try:
        while not coord.should_stop():
            t0 = time.perf_counter()
            batch_images, batch_labels, batch_filenames = sess.run(batched_data)
            t1 = time.perf_counter()
            print("\nIteration {}/{}:".format(current_iteration + 1, num_iterations))
            print("\tFetching batch took {:.3f} seconds".format(t1-t0))

            features = sess.run(flattened, feed_dict={inputs: batch_images})
            t2 = time.perf_counter()
            print("\tExtracting features took {:.3f} seconds".format(t2-t1))

            for i, batch_filename in enumerate(batch_filenames):
                codes.append((batch_labels[i], batch_filename, features[i]))
            t3 = time.perf_counter()
            current_iteration = current_iteration + 1
            print("\tProcessing {} images took {:.3f} seconds".format(len(batch_filenames), t3-t0))
    except tf.errors.OutOfRangeError:
        # Raised by the queue once the epoch limit is exhausted -- normal termination.
        print("\nDone -- epoch limit reached")
    finally:
        coord.request_stop()
        coord.join(threads)
    np.save(output_filename, np.array(codes, dtype="object"))
    print("Extracted to '{}' in {:.3f} seconds\n\n".format(output_filename, time.perf_counter() - tstart))
In [151]:
# Extract VGG16 bottleneck features for the sample validation set.
# A fresh graph per call keeps the queue/model ops from accumulating between runs.
with tf.Session(graph=tf.Graph()) as sess:
    extract_features(sess=sess, directory=sample_validation_dataset_dir,
                     output_filename="sample_validation_codes.npy", input_epochs=2)
# The training set gets augmentation and more epochs so distorted variants are captured.
with tf.Session(graph=tf.Graph()) as sess:
    extract_features(sess=sess, directory=sample_training_dataset_dir,
                     output_filename="sample_training_codes.npy",
                     augment=True, input_epochs=4)
In [5]:
from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model.signature_def_utils import predict_signature_def
from tensorflow.python.saved_model.tag_constants import SERVING
from tensorflow.python.saved_model.signature_constants import DEFAULT_SERVING_SIGNATURE_DEF_KEY
from tensorflow.python.saved_model.signature_constants import PREDICT_INPUTS
from tensorflow.python.saved_model.signature_constants import PREDICT_OUTPUTS
class TransferModel:
    """Small dense classifier trained on pre-extracted VGG16 bottleneck features."""

    def build(self, *, input_size, num_hidden=1, hidden_layer_size=256, use_batchnorm=True, use_dropout=True):
        """Construct the graph: `num_hidden` dense layers feeding a 2-way softmax.

        Args:
            input_size: length of the flattened feature vector fed to the network.
            num_hidden: number of fully-connected hidden layers.
            hidden_layer_size: units per hidden layer.
            use_batchnorm: insert batch normalization before each ReLU.
            use_dropout: apply dropout controlled by the keep_prob placeholder.
        """
        with tf.name_scope("inputs"):
            self.input = tf.placeholder(tf.float32, shape=(None, input_size), name="input")
            self.is_training = tf.placeholder(tf.bool, name="is_training")
            self.keep_prob = tf.placeholder(tf.float32, name="keep_probability")
            self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
        with tf.name_scope("targets"):
            # (batch, 2) one-hot class labels
            self.labels = tf.placeholder(tf.float32, shape=(None, 2), name="labels")

        prev_size = input_size
        next_input = self.input
        for i in range(num_hidden):
            with tf.variable_scope("hidden_layer_{}".format(i)):
                hidden_weights = tf.Variable(
                    initial_value=tf.truncated_normal([prev_size, hidden_layer_size], mean=0.0, stddev=0.01),
                    dtype=tf.float32, name="hidden_weights"
                )
                hidden_bias = tf.Variable(
                    initial_value=tf.zeros(hidden_layer_size),
                    dtype=tf.float32,
                    name="hidden_bias"
                )
                hidden = tf.matmul(next_input, hidden_weights) + hidden_bias
                if use_batchnorm:
                    hidden = tf.layers.batch_normalization(hidden, training=self.is_training)
                hidden = tf.nn.relu(hidden, name="hidden_relu")
                if use_dropout:
                    hidden = tf.nn.dropout(hidden, keep_prob=self.keep_prob, name="hidden_dropout")
                tf.summary.histogram("hidden_weights_{}".format(i), hidden_weights)
                tf.summary.histogram("hidden_bias_{}".format(i), hidden_bias)
                next_input = hidden
                prev_size = hidden_layer_size

        with tf.name_scope("outputs"):
            output_weights = tf.Variable(
                initial_value=tf.truncated_normal(shape=(hidden_layer_size, 2), mean=0.0, stddev=0.01),
                dtype=tf.float32, name="output_weights"
            )
            output_bias = tf.Variable(initial_value=tf.zeros(2), dtype=tf.float32, name="output_bias")
            self.logits = tf.matmul(next_input, output_weights) + output_bias
            self.predictions = tf.nn.softmax(self.logits, name="predictions")
            tf.summary.histogram("output_weights", output_weights)
            tf.summary.histogram("output_bias", output_bias)
            tf.summary.histogram("predictions", self.predictions)

        with tf.name_scope("cost"):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.labels, name="cross_entropy")
            self.cost = tf.reduce_mean(cross_entropy, name="cost")
            tf.summary.scalar("cost", self.cost)

        with tf.name_scope("train"):
            # Batchnorm's moving-average updates live in UPDATE_OPS and must run
            # with the train step, hence the control dependency.
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                correct_predictions = tf.equal(tf.argmax(self.predictions, 1), tf.argmax(self.labels, 1), name="correct_predictions")
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)

        self.merged_summaries = tf.summary.merge_all()

    def run_training(self, *, sess, fn_get_batches, num_epochs,
                     validation_images, validation_labels,
                     writer=None, keep_prob=0.5, batch_size=64,
                     learning_rate=0.01, accuracy_print_steps=100):
        """Train the model, periodically logging summaries and validation accuracy.

        Args:
            sess: session holding the graph built by build().
            fn_get_batches: callable(batch_size) yielding (images, labels) batches.
            num_epochs: full passes over the training data.
            validation_images / validation_labels: held-out set for accuracy reports.
            writer: optional tf.summary.FileWriter for TensorBoard summaries.
            keep_prob: dropout keep probability during training.
            batch_size: passed through to fn_get_batches.
            learning_rate: Adam learning rate.
            accuracy_print_steps: iterations between validation reports.
        """
        sess.run(tf.global_variables_initializer())
        iteration = 0
        for epoch in range(num_epochs):
            for batch_train_images, batch_train_labels in fn_get_batches(batch_size):
                train_acc, train_loss, _, summary = sess.run(
                    [self.accuracy, self.cost, self.optimizer, self.merged_summaries],
                    feed_dict={
                        self.input: batch_train_images,
                        self.labels: batch_train_labels,
                        self.keep_prob: keep_prob,
                        self.learning_rate: learning_rate,
                        self.is_training: True})
                iteration = iteration + 1
                # Single periodic check (the original duplicated this condition)
                if iteration % accuracy_print_steps == 0:
                    if writer is not None:
                        writer.add_summary(summary, iteration)
                    # Dropout off (keep_prob=1) and batchnorm in inference mode
                    val_acc = sess.run(self.accuracy, feed_dict={
                        self.input: validation_images,
                        self.labels: validation_labels,
                        self.keep_prob: 1.,
                        self.is_training: False})
                    print("\tEpoch {}/{} Iteration {}, trainacc: {}, valacc: {}, loss: {}".format(epoch + 1, num_epochs, iteration, train_acc, val_acc, train_loss))

    def save_model(self, *, sess, saved_model_path):
        """Export the trained graph as a TF SavedModel for serving."""
        builder = saved_model_builder.SavedModelBuilder(saved_model_path)
        builder.add_meta_graph_and_variables(
            sess, [SERVING],
            signature_def_map={
                DEFAULT_SERVING_SIGNATURE_DEF_KEY: predict_signature_def(
                    # Bug fix: the input placeholder is self.input; the original
                    # referenced self.images, which is never defined on this class
                    # and raised AttributeError on export.
                    inputs={PREDICT_INPUTS: self.input},
                    outputs={PREDICT_OUTPUTS: self.predictions}
                )
            }
        )
        builder.save()
In [6]:
# The feature files were saved as object arrays of (label, filename, features)
# tuples; NumPy >= 1.16.3 refuses to load object arrays unless allow_pickle=True.
training_features = np.load("sample_training_codes.npy", allow_pickle=True)
validation_features = np.load("sample_validation_codes.npy", allow_pickle=True)

# Shuffle training rows in place so batches are not ordered cats-then-dogs
np.random.shuffle(training_features)

# Column 2 holds the VGG16 feature vector, column 0 the one-hot label
training_x = np.array([row[2] for row in training_features])
training_y = np.array([row[0] for row in training_features])
validation_x = np.array([row[2] for row in validation_features])
validation_y = np.array([row[0] for row in validation_features])
In [7]:
def get_batches(x, y, batch_size=32):
    """Yield consecutive (x, y) mini-batches; the final batch may be smaller.

    Args:
        x: feature array, sliced along axis 0.
        y: label array; its first dimension sets the number of rows.
        batch_size: maximum rows per batch.
    """
    num_rows = y.shape[0]
    # Ceiling division so a partial final batch is still emitted
    num_batches = -(-num_rows // batch_size)
    for start in range(0, num_batches * batch_size, batch_size):
        yield x[start:start + batch_size], y[start:start + batch_size]
In [12]:
tf.reset_default_graph()
with tf.Session() as sess:
    m = TransferModel()
    m.build(input_size=7 * 7 * 512, num_hidden=1, hidden_layer_size=256, use_batchnorm=True, use_dropout=True)
    # Forward batch_size into get_batches -- previously the lambda ignored it,
    # so the requested batch_size=64 never took effect (the default 32 was used).
    m.run_training(
        sess=sess, num_epochs=5, learning_rate=0.01, keep_prob=0.8, batch_size=64,
        fn_get_batches=lambda batch_size: get_batches(training_x, training_y, batch_size),
        validation_images=validation_x, validation_labels=validation_y)
Looks like we can stop after 2 epochs. Doesn't get much better afterwards.
In [13]:
tf.reset_default_graph()
with tf.Session() as sess:
    m = TransferModel()
    m.build(input_size=7 * 7 * 512, num_hidden=1, hidden_layer_size=256, use_batchnorm=False, use_dropout=True)
    # Forward batch_size into get_batches -- previously the lambda ignored it,
    # so the requested batch_size=64 never took effect (the default 32 was used).
    m.run_training(
        sess=sess, num_epochs=5, learning_rate=0.01, keep_prob=0.8, batch_size=64,
        fn_get_batches=lambda batch_size: get_batches(training_x, training_y, batch_size),
        validation_images=validation_x, validation_labels=validation_y)
Training accuracy jumps around a lot and is way lower than validation accuracy.
Either the learning rate is too low, or we're underfitting — either because of regularization (dropout) or because our model is not complex enough.
Yes your assumption is true - although if you're underfitting due to reasons other than dropout (or other regularization techniques), you won't see this.
The key technique to avoiding underfitting is using a model with plenty of layers and parameters, and picking an appropriate architecture (e.g. CNN with batchnorm for images). Also picking appropriate learning rates.
Picking the output with the highest validation accuracy is generally a good approach.
Lower Learning Rate
In [15]:
tf.reset_default_graph()
with tf.Session() as sess:
    m = TransferModel()
    m.build(input_size=7 * 7 * 512, num_hidden=1, hidden_layer_size=256, use_batchnorm=False, use_dropout=True)
    # Forward batch_size into get_batches -- previously the lambda ignored it,
    # so the requested batch_size=64 never took effect (the default 32 was used).
    m.run_training(
        sess=sess, num_epochs=5, learning_rate=0.001, keep_prob=0.8, batch_size=64,
        fn_get_batches=lambda batch_size: get_batches(training_x, training_y, batch_size),
        validation_images=validation_x, validation_labels=validation_y)
Lowering the learning rate works wonders, looks like the model wasn't underfitting before.
Results look better than with batchnorm.
In [14]:
tf.reset_default_graph()
with tf.Session() as sess:
    m = TransferModel()
    m.build(input_size=7 * 7 * 512, num_hidden=1, hidden_layer_size=256, use_batchnorm=True, use_dropout=False)
    # Forward batch_size into get_batches -- previously the lambda ignored it,
    # so the requested batch_size=64 never took effect (the default 32 was used).
    m.run_training(
        sess=sess, num_epochs=5, learning_rate=0.01, keep_prob=1, batch_size=64,
        fn_get_batches=lambda batch_size: get_batches(training_x, training_y, batch_size),
        validation_images=validation_x, validation_labels=validation_y)
Works nearly as well as batchnorm + dropout.