Convolutional Autoencoder

Sticking with the MNIST dataset, let's improve our autoencoder's performance using convolutional layers. Again, loading modules and the data.


In [3]:
%matplotlib inline

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [4]:
from tensorflow.examples.tutorials.mnist import input_data

# Read MNIST from the local 'MNIST_data' directory, requesting a validation
# split of size 0.
mnist = input_data.read_data_sets(
    train_dir='MNIST_data',
    validation_size=0,
)


---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\genericpath.py in exists(path)
     18     try:
---> 19         os.stat(path)
     20     except OSError:

OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: '<frozen importlib._bootstrap>'

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-4-637ee478dcdd> in <module>()
----> 1 from tensorflow.examples.tutorials.mnist import input_data
      2 mnist = input_data.read_data_sets('MNIST_data', validation_size=0)

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\examples\tutorials\mnist\__init__.py in <module>()
     19 from __future__ import print_function
     20 
---> 21 from tensorflow.examples.tutorials.mnist import input_data
     22 from tensorflow.examples.tutorials.mnist import mnist

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\examples\tutorials\mnist\input_data.py in <module>()
     27 from six.moves import xrange  # pylint: disable=redefined-builtin
     28 import tensorflow as tf
---> 29 from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\__init__.py in <module>()
     29 from tensorflow.contrib import deprecated
     30 from tensorflow.contrib import distributions
---> 31 from tensorflow.contrib import factorization
     32 from tensorflow.contrib import framework
     33 from tensorflow.contrib import graph_editor

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\factorization\__init__.py in <module>()
     22 from tensorflow.contrib.factorization.python.ops.clustering_ops import *
     23 from tensorflow.contrib.factorization.python.ops.factorization_ops import *
---> 24 from tensorflow.contrib.factorization.python.ops.gmm import *
     25 from tensorflow.contrib.factorization.python.ops.gmm_ops import *
     26 from tensorflow.contrib.factorization.python.ops.wals import *

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\factorization\python\ops\gmm.py in <module>()
     25 from tensorflow.contrib.framework.python.framework import checkpoint_utils
     26 from tensorflow.contrib.framework.python.ops import variables
---> 27 from tensorflow.contrib.learn.python.learn.estimators import estimator
     28 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
     29 from tensorflow.python.framework import constant_op

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\learn\__init__.py in <module>()
     86 
     87 # pylint: disable=wildcard-import
---> 88 from tensorflow.contrib.learn.python.learn import *
     89 # pylint: enable=wildcard-import
     90 

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\learn\python\__init__.py in <module>()
     21 
     22 # pylint: disable=wildcard-import
---> 23 from tensorflow.contrib.learn.python.learn import *
     24 # pylint: enable=wildcard-import

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\learn\python\learn\__init__.py in <module>()
     23 from tensorflow.contrib.learn.python.learn import basic_session_run_hooks
     24 from tensorflow.contrib.learn.python.learn import datasets
---> 25 from tensorflow.contrib.learn.python.learn import estimators
     26 from tensorflow.contrib.learn.python.learn import graph_actions
     27 from tensorflow.contrib.learn.python.learn import learn_io as io

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\__init__.py in <module>()
    295 from tensorflow.contrib.learn.python.learn.estimators._sklearn import NotFittedError
    296 from tensorflow.contrib.learn.python.learn.estimators.constants import ProblemType
--> 297 from tensorflow.contrib.learn.python.learn.estimators.dnn import DNNClassifier
    298 from tensorflow.contrib.learn.python.learn.estimators.dnn import DNNEstimator
    299 from tensorflow.contrib.learn.python.learn.estimators.dnn import DNNRegressor

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\dnn.py in <module>()
     21 import six
     22 
---> 23 from tensorflow.contrib import layers
     24 from tensorflow.contrib.framework import deprecated
     25 from tensorflow.contrib.framework import deprecated_arg_values

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\layers\__init__.py in <module>()
     94 
     95 # pylint: disable=unused-import,wildcard-import
---> 96 from tensorflow.contrib.layers.python.layers import *
     97 # pylint: enable=unused-import,wildcard-import
     98 

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\layers\python\layers\__init__.py in <module>()
     22 from tensorflow.contrib.layers.python.layers.embedding_ops import *
     23 from tensorflow.contrib.layers.python.layers.encoders import *
---> 24 from tensorflow.contrib.layers.python.layers.feature_column import *
     25 from tensorflow.contrib.layers.python.layers.feature_column_ops import *
     26 from tensorflow.contrib.layers.python.layers.initializers import *

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\layers\python\layers\feature_column.py in <module>()
    136 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
    137 from tensorflow.contrib.layers.python.layers import embedding_ops
--> 138 from tensorflow.contrib.layers.python.layers import layers
    139 from tensorflow.contrib.layers.python.ops import bucketization_op
    140 from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\layers\python\layers\layers.py in <module>()
   1086     outputs_collections=None,
   1087     trainable=True,
-> 1088     scope=None):
   1089   """Adds a convolution2d_transpose with an optional batch normalization layer.
   1090 

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\contrib\framework\python\ops\arg_scope.py in add_arg_scope(func)
    182   _add_op(func)
    183   setattr(func_with_args, '_key_op', _key_op(func))
--> 184   return tf_decorator.make_decorator(func, func_with_args)
    185 
    186 

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\site-packages\tensorflow\python\util\tf_decorator.py in make_decorator(target, decorator_func, decorator_name, decorator_doc, decorator_argspec)
     84   """
     85   if decorator_name is None:
---> 86     decorator_name = _inspect.stack()[1][3]  # Caller's name.
     87   decorator = TFDecorator(decorator_name, target, decorator_doc,
     88                           decorator_argspec)

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\inspect.py in stack(context)
   1462 def stack(context=1):
   1463     """Return a list of records for the stack above the caller's frame."""
-> 1464     return getouterframes(sys._getframe(1), context)
   1465 
   1466 def trace(context=1):

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\inspect.py in getouterframes(frame, context)
   1439     framelist = []
   1440     while frame:
-> 1441         frameinfo = (frame,) + getframeinfo(frame, context)
   1442         framelist.append(FrameInfo(*frameinfo))
   1443         frame = frame.f_back

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\inspect.py in getframeinfo(frame, context)
   1408         raise TypeError('{!r} is not a frame or traceback object'.format(frame))
   1409 
-> 1410     filename = getsourcefile(frame) or getfile(frame)
   1411     if context > 0:
   1412         start = lineno - 1 - context//2

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\inspect.py in getsourcefile(object)
    667                  importlib.machinery.EXTENSION_SUFFIXES):
    668         return None
--> 669     if os.path.exists(filename):
    670         return filename
    671     # only return a non-existent filename if the module has a PEP 302 loader

C:\Users\Abdul\Anaconda3\envs\dlnd-tf-lab\lib\genericpath.py in exists(path)
     17     """Test whether a path exists.  Returns False for broken symbolic links"""
     18     try:
---> 19         os.stat(path)
     20     except OSError:
     21         return False

KeyboardInterrupt: 

In [3]:
# Preview a single training example: reshape its pixel array to 28x28 and
# draw it with the reversed grey colour map.
img = mnist.train.images[2]
img_2d = img.reshape((28, 28))
plt.imshow(img_2d, cmap='Greys_r')


Out[3]:
<matplotlib.image.AxesImage at 0x7f4631f1a4e0>

Network Architecture

The encoder part of the network will be a typical convolutional pyramid. Each convolutional layer will be followed by a max-pooling layer to reduce the dimensions of the layers. The decoder though might be something new to you. The decoder needs to convert from a narrow representation to a wide reconstructed image. For example, the representation could be a 4x4x8 max-pool layer. This is the output of the encoder, but also the input to the decoder. We want to get a 28x28x1 image out from the decoder so we need to work our way back up from the narrow decoder input layer. A schematic of the network is shown below.

Here our final encoder layer has size 4x4x8 = 128. The original images have size 28x28 = 784, so the encoded vector is roughly 16% the size of the original image. These are just suggested sizes for each of the layers. Feel free to change the depths and sizes, but remember our goal here is to find a small representation of the input data.

What's going on with the decoder

Okay, so the decoder has these "Upsample" layers that you might not have seen before. First off, I'll discuss a bit what these layers aren't. Usually, you'll see deconvolutional layers used to increase the width and height of the layers. They work almost exactly the same as convolutional layers, but in reverse. A stride in the input layer results in a larger stride in the deconvolutional layer. For example, if you have a 3x3 kernel, a 3x3 patch in the input layer will be reduced to one unit in a convolutional layer. Comparatively, one unit in the input layer will be expanded to a 3x3 patch in a deconvolutional layer. Deconvolution is often called "transpose convolution", which is what you'll find in the TensorFlow API, with tf.nn.conv2d_transpose.

However, deconvolutional layers can lead to artifacts in the final images, such as checkerboard patterns. This is due to overlap in the kernels which can be avoided by setting the stride and kernel size equal. In this Distill article from Augustus Odena, et al, the authors show that these checkerboard artifacts can be avoided by resizing the layers using nearest neighbor or bilinear interpolation (upsampling) followed by a convolutional layer. In TensorFlow, this is easily done with tf.image.resize_images, followed by a convolution. Be sure to read the Distill article to get a better understanding of deconvolutional layers and why we're using upsampling.

Exercise: Build the network shown above. Remember that a convolutional layer with strides of 1 and 'same' padding won't reduce the height and width. That is, if the input is 28x28 and the convolution layer has stride = 1 and 'same' padding, the convolutional layer will also be 28x28. The max-pool layers are used to reduce the width and height. A stride of 2 will reduce the size by a factor of 2. Odena et al claim that nearest neighbor interpolation works best for the upsampling, so make sure to include that as a parameter in tf.image.resize_images or use tf.image.resize_nearest_neighbor.


In [9]:
# Settings shared by the layers below: every hidden convolution is 3x3 with
# 'same' padding and ReLU, and every pooling step is 2x2 with stride 2, so
# only the input tensor and the number of filters change from line to line.
conv_args = dict(kernel_size=(3, 3), padding='same', activation=tf.nn.relu)
pool_args = dict(pool_size=(2, 2), strides=(2, 2), padding='same')

# Network input and reconstruction target: batches of 28x28x1 images.
inputs_ = tf.placeholder(tf.float32, shape=(None, 28, 28, 1), name='inputs')
targets_ = tf.placeholder(tf.float32, shape=(None, 28, 28, 1), name='targets')

### Encoder
conv1 = tf.layers.conv2d(inputs_, filters=16, **conv_args)      # 28x28x16
maxpool1 = tf.layers.max_pooling2d(conv1, **pool_args)          # 14x14x16
conv2 = tf.layers.conv2d(maxpool1, filters=8, **conv_args)      # 14x14x8
maxpool2 = tf.layers.max_pooling2d(conv2, **pool_args)          # 7x7x8
conv3 = tf.layers.conv2d(maxpool2, filters=8, **conv_args)      # 7x7x8
encoded = tf.layers.max_pooling2d(conv3, **pool_args)           # 4x4x8

### Decoder
# Each stage resizes with nearest-neighbour interpolation and then applies
# a convolution.
upsample1 = tf.image.resize_nearest_neighbor(encoded, size=(7, 7))    # 7x7x8
conv4 = tf.layers.conv2d(upsample1, filters=8, **conv_args)           # 7x7x8
upsample2 = tf.image.resize_nearest_neighbor(conv4, size=(14, 14))    # 14x14x8
conv5 = tf.layers.conv2d(upsample2, filters=8, **conv_args)           # 14x14x8
upsample3 = tf.image.resize_nearest_neighbor(conv5, size=(28, 28))    # 28x28x8
conv6 = tf.layers.conv2d(upsample3, filters=16, **conv_args)          # 28x28x16

# Final convolution has no activation; its raw output feeds both the sigmoid
# below and the cross-entropy loss.
logits = tf.layers.conv2d(
    conv6,
    filters=1,
    kernel_size=(3, 3),
    padding='same',
    activation=None,
)  # 28x28x1

# Reconstructed image: sigmoid of the logits.
decoded = tf.nn.sigmoid(logits, name='decoded')

# Element-wise sigmoid cross-entropy against the targets, averaged to a scalar.
loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets_, logits=logits)
cost = tf.reduce_mean(loss)
opt = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

Training

As before, here we'll train the network. Instead of flattening the images though, we can pass them in as 28x28x1 arrays.


In [1]:
# Open the TensorFlow session; the training and plotting cells below run
# graph ops through `sess`, and a later cell closes it with sess.close().
sess = tf.Session()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-61056e0b8e9b> in <module>()
----> 1 sess = tf.Session()

NameError: name 'tf' is not defined

In [2]:
epochs = 20
batch_size = 200
batches_per_epoch = mnist.train.num_examples // batch_size

# Initialise every variable in the graph before the first training step.
sess.run(tf.global_variables_initializer())

for epoch in range(1, epochs + 1):
    for step in range(batches_per_epoch):
        # Only element 0 of the batch is used; reshape it to 28x28x1 arrays.
        flat_imgs = mnist.train.next_batch(batch_size)[0]
        imgs = flat_imgs.reshape((-1, 28, 28, 1))

        # The same images serve as both input and target.
        feed = {inputs_: imgs, targets_: imgs}
        batch_cost, _ = sess.run([cost, opt], feed_dict=feed)

        # Reported after every batch.
        print("Epoch: {}/{}...".format(epoch, epochs),
              "Training loss: {:.4f}".format(batch_cost))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-ee1b2a582e4e> in <module>()
      1 epochs = 20
      2 batch_size = 200
----> 3 sess.run(tf.global_variables_initializer())
      4 for e in range(epochs):
      5     for ii in range(mnist.train.num_examples//batch_size):

NameError: name 'sess' is not defined

In [13]:
fig, axes = plt.subplots(nrows=2, ncols=10, sharex=True, sharey=True, figsize=(20,4))

# Run the first ten test images through the autoencoder.
in_imgs = mnist.test.images[:10]
model_input = in_imgs.reshape((10, 28, 28, 1))
reconstructed = sess.run(decoded, feed_dict={inputs_: model_input})

# Top row: inputs. Bottom row: reconstructions.
panels = [in_imgs, reconstructed]
for row_idx, images in enumerate(panels):
    for col_idx, img in enumerate(images):
        ax = axes[row_idx, col_idx]
        ax.imshow(img.reshape((28, 28)), cmap='Greys_r')
        # Hide both axes so only the digits are shown.
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

fig.tight_layout(pad=0.1)



In [19]:
# Close the session that was used to train and evaluate the first autoencoder.
sess.close()

Denoising

As I've mentioned before, autoencoders like the ones you've built so far aren't too useful in practice. However, they can be used to denoise images quite successfully just by training the network on noisy images. We can create the noisy images ourselves by adding Gaussian noise to the training images, then clipping the values to be between 0 and 1. We'll use noisy images as input and the original, clean images as targets. Here's an example of the noisy images I generated and the denoised images.

Since this is a harder problem for the network, we'll want to use deeper convolutional layers here, more feature maps. I suggest something like 32-32-16 for the depths of the convolutional layers in the encoder, and the same depths going backward through the decoder. Otherwise the architecture is the same as before.

Exercise: Build the network for the denoising autoencoder. It's the same as before, but with deeper layers. I suggest 32-32-16 for the depths, but you can play with these numbers, or add more layers.


In [21]:
def _conv_relu(x, depth):
    """Apply a 3x3 'same'-padded convolution with `depth` filters and ReLU."""
    return tf.layers.conv2d(x, depth, (3,3), padding='same', activation=tf.nn.relu)


def _pool_2x2(x):
    """Apply 2x2 max-pooling with stride 2 and 'same' padding."""
    return tf.layers.max_pooling2d(x, (2,2), (2,2), padding='same')


def _resize_nn(x, side):
    """Resize `x` to side x side using nearest-neighbour interpolation."""
    return tf.image.resize_nearest_neighbor(x, (side, side))


# Noisy images go in through inputs_; the images to reproduce go in through targets_.
inputs_ = tf.placeholder(tf.float32, (None, 28, 28, 1), name='inputs')
targets_ = tf.placeholder(tf.float32, (None, 28, 28, 1), name='targets')

### Encoder (filter depths 32-32-16)
conv1 = _conv_relu(inputs_, 32)       # 28x28x32
maxpool1 = _pool_2x2(conv1)           # 14x14x32
conv2 = _conv_relu(maxpool1, 32)      # 14x14x32
maxpool2 = _pool_2x2(conv2)           # 7x7x32
conv3 = _conv_relu(maxpool2, 16)      # 7x7x16
encoded = _pool_2x2(conv3)            # 4x4x16

### Decoder (filter depths 16-32-32)
upsample1 = _resize_nn(encoded, 7)    # 7x7x16
conv4 = _conv_relu(upsample1, 16)     # 7x7x16
upsample2 = _resize_nn(conv4, 14)     # 14x14x16
conv5 = _conv_relu(upsample2, 32)     # 14x14x32
upsample3 = _resize_nn(conv5, 28)     # 28x28x32
conv6 = _conv_relu(upsample3, 32)     # 28x28x32

# Output layer has a single filter and no activation, so it yields raw logits.
logits = tf.layers.conv2d(conv6, 1, (3,3), padding='same', activation=None)  # 28x28x1

# Sigmoid of the logits gives the reconstructed image.
decoded = tf.nn.sigmoid(logits, name='decoded')

# Mean sigmoid cross-entropy between the logits and the targets.
loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets_, logits=logits)
cost = tf.reduce_mean(loss)
opt = tf.train.AdamOptimizer(0.001).minimize(cost)

In [22]:
# Open a session for the denoising network; the training and evaluation
# cells below run graph ops through `sess`.
sess = tf.Session()

In [ ]:
epochs = 100
batch_size = 200
# Sets how much noise we're adding to the MNIST images
noise_factor = 0.5
batches_per_epoch = mnist.train.num_examples // batch_size

# Initialise every variable in the graph before the first training step.
sess.run(tf.global_variables_initializer())

for epoch in range(1, epochs + 1):
    for step in range(batches_per_epoch):
        # Only element 0 of the batch is used; reshape it to 28x28x1 arrays.
        clean_flat = mnist.train.next_batch(batch_size)[0]
        imgs = clean_flat.reshape((-1, 28, 28, 1))

        # Add scaled standard-normal noise, then clip back into [0, 1].
        noise = noise_factor * np.random.randn(*imgs.shape)
        noisy_imgs = np.clip(imgs + noise, 0., 1.)

        # Noisy images as inputs, original images as targets.
        feed = {inputs_: noisy_imgs, targets_: imgs}
        batch_cost, _ = sess.run([cost, opt], feed_dict=feed)

        # Reported after every batch.
        print("Epoch: {}/{}...".format(epoch, epochs),
              "Training loss: {:.4f}".format(batch_cost))

Checking out the performance

Here I'm adding noise to the test images and passing them through the autoencoder. It does a surprisingly great job of removing the noise, even though it's sometimes difficult to tell what the original number is.


In [29]:
fig, axes = plt.subplots(nrows=2, ncols=10, sharex=True, sharey=True, figsize=(20,4))

# Corrupt the first ten test images with scaled standard-normal noise and
# clip the result back into [0, 1].
in_imgs = mnist.test.images[:10]
noise = noise_factor * np.random.randn(*in_imgs.shape)
noisy_imgs = np.clip(in_imgs + noise, 0., 1.)

# Feed the noisy images through the network.
model_input = noisy_imgs.reshape((10, 28, 28, 1))
reconstructed = sess.run(decoded, feed_dict={inputs_: model_input})

# Top row: noisy inputs. Bottom row: network outputs.
panels = [noisy_imgs, reconstructed]
for row_idx, images in enumerate(panels):
    for col_idx, img in enumerate(images):
        ax = axes[row_idx, col_idx]
        ax.imshow(img.reshape((28, 28)), cmap='Greys_r')
        # Hide both axes so only the digits are shown.
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

fig.tight_layout(pad=0.1)