Stacked Denoising AutoEncoder for MNIST

Papers

  • Vincent et al., "Stacked Denoising Autoencoders: Learning Useful Representations in a Deep Network with a Local Denoising Criterion", JMLR 2010

0 Setup notebook

0.1 Notebook width

In [ ]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

0.2 Imports


In [ ]:
import tensorflow as tf
assert tf.__version__=="1.3.0" # the version we used
from tensorflow.contrib.tensorboard.plugins import projector # for visualizing embeddings

import numpy as np # computation

import os # create dirs
from os.path import join as jp # join paths
import logging # print info

import sklearn # datasets, clustering
from sklearn.datasets import fetch_mldata

from IPython.display import Image # displaying images inline

import matplotlib # plotting stuff
matplotlib.use('Agg') # non-interactive backend, so plots can be rendered without a display
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import time # estimating ETA

0.3 Configurations


In [ ]:
# fix training
RANDOM_SEED = 0 
# configure numpy 
np.set_printoptions(precision=3, suppress=True)
np.random.seed(RANDOM_SEED)

# configure tensorflow
tf.set_random_seed(RANDOM_SEED)

# configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

0.4 Helper functions


In [ ]:
def create_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)
        logger.info("Created directory: %s"%path)

def plot_reconstruction(samples, epoch=20, size_x=3, size_y=3, name="reconstruction"):
    if size_x * size_y != len(samples): # fall back to a square grid
        l = min(len(samples), 200)
        size_x = int(np.sqrt(l))
        size_y = int(np.sqrt(l))
        samples = samples[0:(size_x*size_y)]
    
    fig = plt.figure(figsize=(size_x, size_y))
    gs = gridspec.GridSpec(size_x, size_y)
    gs.update(wspace=0.05, hspace=0.05)
    

    for i, sample in enumerate(samples):
        
        ax = plt.subplot(gs[i])
        plt.axis('off')
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_aspect('equal')
        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
    
    outfile= "visualizations/%s_%0.4d.png"%(name, epoch)
    plt.savefig(outfile)
    plt.close(fig)
    try: # only in ipython notebook
        display(Image(filename=outfile)) 
    except: pass

def link_embedding_to_metadata(embedding_var, metadata_file, graph_dir):
    from tensorflow.contrib.tensorboard.plugins import projector # for visualizing embeddings
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name.replace(":0","")
    embedding.metadata_path = metadata_file
    summary_writer = tf.summary.FileWriter(graph_dir)
    projector.visualize_embeddings(summary_writer, config)

0.5 Create directories


In [ ]:
MODEL_NAME = "mlp-sdae"
DATA_DIR = "data"
VIZUALIZATIONS_DIR = "visualizations"

create_dir(DATA_DIR)  # data is stored here
create_dir(VIZUALIZATIONS_DIR) # plots we generate
create_dir("graphs") 

TAG = "%0.3d"%(len(os.listdir("graphs"))+1)
GRAPH_DIR = jp("graphs", "%s-%s"%(MODEL_NAME, TAG))
create_dir(GRAPH_DIR) # store tensorflow graph here

1 Hyper Parameters


In [ ]:
BATCH_SIZE = 200 
NUM_FEATURES = 784 # pixels in the mnist
NUM_ENCODING_DIMENSIONS = 10 # encoded space dimensions

NUM_PRETRAIN_STEPS = 700    # training steps per layer stack
NUM_FINETUNE_STEPS = 1400   # finetuning steps for the whole SDAE
DECREASE_LR_STEPS =  350  # after how many steps the lr is decreased (here: once per epoch)
NUM_DISPLAY_LOSS_STEPS = max(int( NUM_PRETRAIN_STEPS / 50),1) # every two percent of the steps
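# sanity check on the schedule: 350 steps * 200 images per batch = 70,000 images,
# i.e. the learning rate drops exactly once per full pass over MNIST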

2 Data


In [ ]:
mnist = fetch_mldata('MNIST original', data_home=DATA_DIR)
p = np.random.permutation(mnist.data.shape[0]) # shuffle data because it is ordered
X = mnist.data[p].astype(np.float32)*0.02 # scale raw pixel values from [0, 255] down to [0, 5.1]
Y = mnist.target[p]
logger.info(p.shape)
NUM_IMAGES = mnist.data.shape[0]
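# quick sanity check: fetch_mldata returns 70,000 flattened 28x28 digits
assert X.shape == (NUM_IMAGES, NUM_FEATURES)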

3 Model

The model is built in two parts: first the four layer stacks defined in Section 3.2, then the full Stacked Denoising AutoEncoder in Section 3.3.

The weights of the layers defined in Section 3.2 are reused in the full SDAE in Section 3.3.

In this model, $x$ is the input image, $\hat{x}$ the reconstructed image, and $h$ the encoding. E1..E4 are the fully connected encoder layers, D1..D4 the fully connected decoder layers. The architecture of the whole stacked denoising autoencoder looks like this:

$x$ => E1 => E2 => E3 => E4 => $h$ => D1 => D2 => D3 => D4 => $\hat{x}$

The first layer stack looks like this (DO denotes the dropout corruption):

$x$ => E1 => $h_1$ => DO => D4 => $\hat{x}$
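
In loss terms, pretraining the first stack minimizes (up to the averaging in the MSE)

$\| x - D_4(\mathrm{DO}(E_1(x))) \|_2^2$

i.e. the decoder has to reconstruct the clean input from a corrupted hidden activation. The later stacks (losses in Section 4.1) apply the same idea to the previous layer's activations.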

3.1 Input


In [ ]:
x_input = tf.placeholder(shape=[None, NUM_FEATURES], dtype=tf.float32, name="X_INPUT")
tf_apply_dropout = tf.placeholder_with_default(True, shape=(), name="APPLY_DROPOUT")

3.2 Stacked layers


In [ ]:
layer_stacks = {
    0:{'name':'Layer_1'}, 
    1:{'name':'Layer_2'},
    2:{'name':'Layer_3'},
    3:{'name':'Layer_4'},
    4:{'name':'Stacked AE'},
}
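# slots 0-3 hold the four greedy pretraining stacks; slot 4 is the full SDAE used for finetuning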

3.2.1 Stack 1 - Encoder Layer 1 => Decoder Layer 4


In [ ]:
with tf.variable_scope("Encoder"):
    s1_enc_l1_act = tf.layers.dense(
        inputs=x_input,
        units=500,
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED),
        name="FC_1",
    )

with tf.variable_scope("Decoder"):
    s1_dec_noisy_input = tf.layers.dropout(inputs=s1_enc_l1_act, rate=0.2, seed=RANDOM_SEED, training=tf_apply_dropout)
    s1_dec_l4_act = tf.layers.dense(
        inputs=s1_dec_noisy_input,
        units=NUM_FEATURES,
        activation=None,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED),
        name="FC_4",
    )
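
Note that tf_apply_dropout (Section 3.1) defaults to True, so evaluating the stack ops corrupts the hidden activation; feeding False switches the corruption off. A minimal usage sketch (runnable once the session from Section 6 exists):


In [ ]:
# corruption-free stack-1 reconstruction, e.g. for judging reconstruction quality
clean_rec = session.run(s1_dec_l4_act, feed_dict={x_input: X[0:BATCH_SIZE], tf_apply_dropout: False})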

3.2.2 Stack 2 - Encoder Layer 2 => Decoder Layer 3


In [ ]:
with tf.variable_scope("Encoder"):
    # stop_gradient: pretrain this stack greedily; gradients do not flow back into
    # the previously trained layers (the same pattern is used in stacks 3 and 4)
    s2_enc_l2_act = tf.layers.dense(
        inputs=tf.stop_gradient(s1_dec_noisy_input),
        units=500,
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED),
        name="FC_2",
    )

with tf.variable_scope("Decoder"):
    s2_dec_noisy_input = tf.layers.dropout(inputs=s2_enc_l2_act, rate=0.2, seed=RANDOM_SEED, training=tf_apply_dropout)
    s2_dec_l3_act = tf.layers.dense(
        inputs=s2_dec_noisy_input,
        units=500,
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED),
        name="FC_3",
    )

3.2.3 Stack 3 - Encoder Layer 3 => Decoder Layer 2


In [ ]:
with tf.variable_scope("Encoder"):
    s3_enc_l3_act = tf.layers.dense(
        inputs=tf.stop_gradient(s2_dec_noisy_input),
        units=2000,
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED),
        name="FC_3",
    )

with tf.variable_scope("Decoder"):
    s3_dec_noisy_input = tf.layers.dropout(inputs=s3_enc_l3_act, rate=0.2, seed=RANDOM_SEED, training=tf_apply_dropout)
    s3_dec_l2_act = tf.layers.dense(
        inputs=s3_dec_noisy_input,
        units=500,
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED),
        name="FC_2",
    )

3.2.4 Stack 4 - Encoder Layer 4 => Decoder Layer 1


In [ ]:
with tf.variable_scope("Encoder"):
    s4_enc_l4_act = tf.layers.dense(
        inputs=tf.stop_gradient(s3_dec_noisy_input),
        units=NUM_ENCODING_DIMENSIONS,
        activation=None,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED),
        name="FC_4",
    )

with tf.variable_scope("Decoder"):
    s4_dec_noisy_input = tf.layers.dropout(inputs=s4_enc_l4_act, rate=0.2, seed=RANDOM_SEED, training=tf_apply_dropout)
    s4_dec_l1_act = tf.layers.dense(
        inputs=s4_dec_noisy_input,
        units=2000,
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED),
        name="FC_1",
    )

3.3 Full Stacked Autoencoder


In [ ]:
with tf.variable_scope("Encoder"):
    enc_l1_act = tf.layers.dense( inputs=x_input,    units=500 , activation=tf.nn.relu, name="FC_1", reuse=True )
    enc_l2_act = tf.layers.dense( inputs=enc_l1_act, units=500 , activation=tf.nn.relu, name="FC_2", reuse=True )
    enc_l3_act = tf.layers.dense( inputs=enc_l2_act, units=2000, activation=tf.nn.relu, name="FC_3", reuse=True ) 
    enc_l4_act = tf.layers.dense( inputs=enc_l3_act, units=NUM_ENCODING_DIMENSIONS  , activation=None      , name="FC_4", reuse=True )
    
h = tf.identity(enc_l4_act, "embedded_x")

with tf.variable_scope("Decoder"):
    dec_l1_act = tf.layers.dense( inputs=h,          units=2000, activation=tf.nn.relu, name="FC_1", reuse=True )
    dec_l2_act = tf.layers.dense( inputs=dec_l1_act, units=500 , activation=tf.nn.relu, name="FC_2", reuse=True )
    dec_l3_act = tf.layers.dense( inputs=dec_l2_act, units=500 , activation=tf.nn.relu, name="FC_3", reuse=True ) 
    dec_l4_act = tf.layers.dense( inputs=dec_l3_act, units=NUM_FEATURES , activation=None      , name="FC_4", reuse=True )

x_reconstructed = tf.identity(dec_l4_act, "reconstructed_x")
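
Because every dense layer here is created with reuse=True, the full SDAE introduces no new weights: it is wired directly out of the kernels and biases created by the four stacks above. A quick sanity check (a sketch; the count assumes exactly the eight FC layers defined in this notebook):


In [ ]:
# Encoder/FC_1..FC_4 plus Decoder/FC_1..FC_4 => exactly 8 weight matrices
kernels = [v for v in tf.trainable_variables() if "kernel" in v.name]
assert len(kernels) == 8, "unexpected number of weight matrices: %i"%len(kernels)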

4 Cost functions

4.1 Stacks


In [ ]:
stack_1_loss_op = tf.losses.mean_squared_error(
    predictions=s1_dec_l4_act, 
    labels=x_input
)
stack_2_loss_op = tf.losses.mean_squared_error(
    predictions=s2_dec_l3_act, 
    labels=tf.stop_gradient(s1_enc_l1_act)
)
stack_3_loss_op = tf.losses.mean_squared_error(
    predictions=s3_dec_l2_act, 
    labels=tf.stop_gradient(s2_enc_l2_act)
)
stack_4_loss_op = tf.losses.mean_squared_error(
    predictions=s4_dec_l1_act, 
    labels=tf.stop_gradient(s3_enc_l3_act)
)
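
Each stack loss is a plain mean squared error between the stack's reconstruction $\hat{t}$ and a clean target activation $t$ (the previous encoder layer's output, taken before the dropout corruption), averaged over the $N$ samples and $d$ dimensions of the batch:

$\mathcal{L}_{stack} = \frac{1}{N \cdot d} \sum_{i=1}^{N} \sum_{j=1}^{d} (t_{ij} - \hat{t}_{ij})^2$

Wrapping the labels in tf.stop_gradient turns the target into a constant, so pretraining one stack never pushes gradients into the layers trained before it.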

4.2 Stacked AutoEncoder


In [ ]:
sae_loss_op = tf.losses.mean_squared_error(
    predictions=x_reconstructed, 
    labels=x_input
)

4.3 Encoded Images


In [ ]:
with tf.variable_scope("ImageEncodings"):
    encoded_images = tf.get_variable( name="encoded_images", shape=[NUM_IMAGES, NUM_ENCODING_DIMENSIONS], dtype=tf.float32, initializer=tf.zeros_initializer(), trainable=False ) 
    tf_batch_idx = tf.Variable(0, trainable=False)
    update_indices = tf.range(tf_batch_idx*BATCH_SIZE, tf_batch_idx*BATCH_SIZE+BATCH_SIZE)
    update_indices = tf.reshape(update_indices, shape = [-1, 1])
    encode_images_op = tf.scatter_nd_update(ref=encoded_images, indices=update_indices, updates=h, name="encode_images_op")
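# usage (see "Embed images into embedded space" below): feed tf_batch_idx together with the matching image batch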

5 Training steps


In [ ]:
tf_learning_rate = tf.Variable(0.1, trainable=False, name="LR")
new_lr = tf.placeholder(shape=(), dtype=tf.float32)
set_learning_rate_op = tf_learning_rate.assign(new_lr)
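# usage sketch: once the session from Section 6 exists, the learning rate can be lowered
# at any time, e.g. session.run(set_learning_rate_op, feed_dict={new_lr: 0.01})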

In [ ]:
sgd = tf.train.GradientDescentOptimizer(learning_rate=tf_learning_rate)
stack_1_trainstep_op = sgd.minimize(stack_1_loss_op)
stack_2_trainstep_op = sgd.minimize(stack_2_loss_op)
stack_3_trainstep_op = sgd.minimize(stack_3_loss_op)
stack_4_trainstep_op = sgd.minimize(stack_4_loss_op)
sae_trainstep_op = sgd.minimize(sae_loss_op)

6 Training

6.1 Summaries


In [ ]:
s1_loss_summary = tf.summary.scalar("Stack_1_Loss",tf.cast(stack_1_loss_op, tf.float32)) # summary for reconstruction loss
s2_loss_summary = tf.summary.scalar("Stack_2_Loss",tf.cast(stack_2_loss_op, tf.float32)) # summary for reconstruction loss
s3_loss_summary = tf.summary.scalar("Stack_3_Loss",tf.cast(stack_3_loss_op, tf.float32)) # summary for reconstruction loss
s4_loss_summary = tf.summary.scalar("Stack_4_Loss",tf.cast(stack_4_loss_op, tf.float32)) # summary for reconstruction loss

sae_loss_summary = tf.summary.scalar("StackedAE_Loss",tf.cast(sae_loss_op, tf.float32)) # summary for reconstruction loss
# the individual summaries could also be combined into one op via tf.summary.merge:
# https://www.tensorflow.org/api_docs/python/tf/summary/merge

6.2 Pretraining

  • TODO: embed variables (create a helper function for that)
  • TODO: visualize

In [ ]:
# fill the layer stacks with their train ops, losses, and summaries
layer_stacks[0].update({
    "trainstep":stack_1_trainstep_op, 
    "loss_op":stack_1_loss_op,
    "loss_summary":s1_loss_summary 
})
layer_stacks[1].update({
    "trainstep":stack_2_trainstep_op, 
    "loss_op":stack_2_loss_op, 
    "loss_summary":s2_loss_summary 
})
layer_stacks[2].update({
    "trainstep":stack_3_trainstep_op, 
    "loss_op":stack_3_loss_op, 
    "loss_summary":s3_loss_summary 
})
layer_stacks[3].update({
    "trainstep":stack_4_trainstep_op, 
    "loss_op":stack_4_loss_op, 
    "loss_summary":s4_loss_summary
})
layer_stacks[4].update({
    "trainstep":sae_trainstep_op, 
    "loss_op":sae_loss_op, 
    "loss_summary":sae_loss_summary
})

Session Setup


In [ ]:
saver = tf.train.Saver(tf.global_variables()) # Saver

session = tf.Session()
session.run(tf.global_variables_initializer())

summary_writer = tf.summary.FileWriter(GRAPH_DIR)
summary_writer.add_graph(session.graph)

tf.trainable_variables() # display the trainable variables as a quick check of the shared weights

Pretraining Helpers


In [ ]:
def abs_sum_diff(w1, w2):
    return np.sum(np.abs(w1-w2))

def print_weight_change(func):
    def decorated(*args, **kwargs):
        trainable_weights_start = session.run([v for v in tf.trainable_variables() if "kernel" in v.name])
        
        result = func(*args, **kwargs)
        
        trainable_weights = session.run([v for v in tf.trainable_variables() if "kernel" in v.name])
        weight_names = [v.name for v in tf.trainable_variables() if "kernel" in v.name]
        tw_pairs = zip(weight_names, trainable_weights_start, trainable_weights)
        
        logger.info("Weights that changed:")
        for tw_name, tw_start, tw in tw_pairs:
            logger.info("%s: delta: %.3f"%(tw_name, abs_sum_diff(tw_start, tw)))
        logger.info("")
        return result
    return decorated

@print_weight_change
def train(stack, num_steps):
    logger.info("Started training %s"%stack["name"])
    logger.info("Decreasing LR every %i steps "%DECREASE_LR_STEPS)
    
    session.run(set_learning_rate_op, feed_dict={new_lr:0.1})
    start_time = time.time()
    for step in range(num_steps):
        # decrease the learning rate every DECREASE_LR_STEPS steps
        if (step+1)%DECREASE_LR_STEPS==0:
            current_lr = session.run(tf_learning_rate)
            decreased_lr = current_lr/10
            session.run(set_learning_rate_op, feed_dict={new_lr:decreased_lr})
            logger.info("%s: Decreasing learning_rate from %0.5f to %0.5f"%(stack["name"], current_lr, decreased_lr))

        # get batch data
        start_index = (step*BATCH_SIZE)%NUM_IMAGES # rotate over the images in the training data
        end_index = start_index + BATCH_SIZE # NUM_IMAGES is divisible by BATCH_SIZE, so this never overflows
        feed_dict = {x_input:X[start_index:end_index]}

        # execute training step
        _, loss, summary = session.run([stack["trainstep"], stack["loss_op"], stack["loss_summary"]], feed_dict=feed_dict)
        summary_writer.add_summary(summary, step)

        # print status every now and then
        if step%NUM_DISPLAY_LOSS_STEPS==0:
            step_time = (time.time() - start_time) / (step+1) / 60.0 # time per step in minutes
            eta = (num_steps - step) * step_time # eta = steps left * step _time
            logger.info("%s, Step %i/%i, Trainingsloss %0.5f, ETA~%0.2fm"%(stack["name"], step,num_steps,loss, eta))

    saver.save(session, jp(GRAPH_DIR, MODEL_NAME), global_step=0)
    logger.info("Finished training of %s"%stack["name"])

In [ ]:
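# sanity check: compare nonzero activations before and after dropout;
# with rate=0.2 the measured keep rate should be close to 0.8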
s1a, s1do = session.run([s1_enc_l1_act,s1_dec_noisy_input], feed_dict= {x_input:X[5*BATCH_SIZE:6*BATCH_SIZE]})
print("Measured dropout keep rate: %0.3f"%(np.count_nonzero(s1do) /  np.count_nonzero(s1a) )  )

Execute Pretraining


In [ ]:
train(layer_stacks[0], NUM_PRETRAIN_STEPS)
train(layer_stacks[1], NUM_PRETRAIN_STEPS)
train(layer_stacks[2], NUM_PRETRAIN_STEPS)
train(layer_stacks[3], NUM_PRETRAIN_STEPS)

Execute Finetuning


In [ ]:
train(layer_stacks[4], NUM_FINETUNE_STEPS)

Visualize Reconstruction


In [ ]:
print("Original")
sample_images = X[0:50]
plot_reconstruction(sample_images, name="original")
print("Recfeed_dict=ed")
reconstructed_samples = session.run(x_reconstructed, feed_dict={x_input:sample_images})
plot_reconstruction(reconstructed_samples, name="reconstructed")

Embed images into embedded space


In [ ]:
# encode all images batch-wise; tf_batch_idx is fed directly so scatter_nd_update writes the right rows
for i in range(0, NUM_IMAGES, BATCH_SIZE):
    session.run(encode_images_op, feed_dict={x_input:X[i:i+BATCH_SIZE], tf_batch_idx:int(i/BATCH_SIZE) })

saver.save(session, jp(GRAPH_DIR, MODEL_NAME), global_step=0)
logger.info("Encoded %i images"%NUM_IMAGES)

Link encoding to metadata


In [ ]:
# write metadata: one label per line, aligned with the rows of encoded_images
IMAGES_EMBEDDINGS_METADATA_FN = "image_embeddings.tsv"

with open( jp(GRAPH_DIR, IMAGES_EMBEDDINGS_METADATA_FN) , "w") as mdf:
    for label in Y:
        mdf.write("%s\n"%label)

link_embedding_to_metadata(
    embedding_var=encoded_images, 
    metadata_file=IMAGES_EMBEDDINGS_METADATA_FN,
    graph_dir=GRAPH_DIR
)

# => you can now inspect the encoded images by starting tensorboard with 'tensorboard --logdir=graphs'
