Imports


In [1]:
import sys
sys.path.append('..')

import os
import json
from time import time
import numpy as np
from tqdm import tqdm

import theano
import theano.tensor as T
from theano.sandbox.cuda.dnn import dnn_conv

from PIL import Image


Using gpu device 0: Graphics Device (CNMeM is disabled)

N.B. The code behind the following imports is lifted from the original DCGAN project


In [2]:
from lib import activations
from lib import updates
from lib import inits
from lib.rng import py_rng, np_rng
from lib.ops import batchnorm, conv_cond_concat, deconv, dropout, l2normalize
from lib.metrics import nnc_score, nnd_score
from lib.theano_utils import floatX, sharedX
from lib.data_utils import OneHot, shuffle, iter_data, center_crop, patch

In [3]:
from fuel.datasets.hdf5 import H5PYDataset
from fuel.schemes import ShuffledScheme, SequentialScheme
from fuel.streams import DataStream

Data Stuff


In [30]:
import h5py
try:
    hf["target"].shape
except:
    hf = h5py.File('faces.hdf5','r+')
num_samples = hf["input"].shape[0]

print "number of samples in dataset : %i" %num_samples


number of samples in dataset : 9216

In [32]:
# examples 0-999 -> test, 1000-1999 -> val, the rest -> train
split_dict = {
     'train': {'input': (2000, num_samples), 'target': (2000, num_samples)},
     'test': {'input': (0, 1000), 'target': (0, 1000)},
     'val': {'input': (1000, 2000), 'target': (1000, 2000)}
}
hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
train_set = H5PYDataset('faces.hdf5', which_sets=('train',))
test_set = H5PYDataset('faces.hdf5', which_sets=('test',))
val_set = H5PYDataset('faces.hdf5', which_sets=('val',))

batch_size = 128

#TODO : use ShuffledScheme instead? Seems slower - might have screwed up the chunk size in the HDF5 files?

tr_scheme = SequentialScheme(examples=train_set.num_examples, batch_size=batch_size)
tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)

val_scheme = SequentialScheme(examples=val_set.num_examples, batch_size=batch_size)
val_stream = DataStream(val_set, iteration_scheme=val_scheme)

test_scheme = SequentialScheme(examples=test_set.num_examples, batch_size=batch_size)
test_stream = DataStream(test_set, iteration_scheme=test_scheme)

Check data looks sensible


In [6]:
for x_train, x_target in tr_stream.get_epoch_iterator():
    break
print "EXAMPLE TARGET IMAGE:"

Image.fromarray(x_target[3].astype(np.uint8))


EXAMPLE TARGET IMAGE:
Out[6]:

In [7]:
print "EXAMPLE INPUT IMAGE:"
Image.fromarray(x_train[3].astype(np.uint8))


EXAMPLE INPUT IMAGE:
Out[7]:

Setup Neural Network


In [ ]:
def target_transform(X):
    # NHWC uint8 in [0, 255]  ->  NCHW float in [-1, 1]
    return floatX(X).transpose(0, 3, 1, 2)/127.5 - 1.

def input_transform(X):
    # the inputs get the same scaling / transposition as the targets
    return target_transform(X)

In [8]:
l2 = 1e-5         # l2 weight decay
nvis = 196        # # of samples to visualize during training
b1 = 0.5          # momentum term of adam
nc = 3            # # of channels in image
nbatch = 128      # # of examples in batch
npx = 64          # # of pixels width/height of images

nx = npx*npx*nc   # # of dimensions in X
niter = 1000        # # of iter at starting learning rate
niter_decay = 30   # # of iter to linearly decay learning rate to zero
lr = 0.0002       # initial learning rate for adam
ntrain = 25000   # # of examples to train on

relu = activations.Rectify()
sigmoid = activations.Sigmoid()
lrelu = activations.LeakyRectify()
tanh = activations.Tanh()
bce = T.nnet.binary_crossentropy

def mse(x,y):
    return T.sum(T.pow(x-y,2), axis = 1)

gifn = inits.Normal(scale=0.02)
difn = inits.Normal(scale=0.02)
sigma_ifn = inits.Normal(loc = -100., scale=0.02)
gain_ifn = inits.Normal(loc=1., scale=0.02)
bias_ifn = inits.Constant(c=0.)

The following methods help adjust the sizes of the convolutional layers in the generator and discriminator, which is very fiddly to do by hand. make_conv_set handles both cases: it builds a stack of conv layers when the sizes shrink and a stack of deconv layers when they grow. Note that the 'size' of an image here is the length of its shortest side (32 in the input set, 128 in the target set). Only use powers of 2 here.


In [11]:
def make_conv_layer(X, input_size, output_size, input_filters, 
                    output_filters, name, index,
                    weights = None, filter_sz = 5):
    
    is_deconv = output_size >= input_size

    w_size = (input_filters, output_filters, filter_sz, filter_sz) \
            if is_deconv else (output_filters, input_filters, filter_sz, filter_sz)
    
    if weights is None:
        w = gifn(w_size, '%sw%i' %(name, index))
        g = gain_ifn((output_filters), '%sg%i' %(name, index))
        b = bias_ifn((output_filters), '%sb%i' %(name, index))
    else:
        w,g,b = weights
    
    conv_method = deconv if is_deconv else dnn_conv
    activation = relu if is_deconv else lrelu
    
    # stride (fractional stride for deconv) is the ratio of the two spatial sizes
    sub = output_size / input_size if is_deconv else input_size / output_size
    
    # padding: 1 for 3x3 filters, 2 for 5x5 filters, so the spatial size is governed by the stride alone
    if filter_sz == 3:
        bm = 1
    else:
        bm = 2
    
    layer = activation(batchnorm(conv_method(X, w, subsample=(sub, sub), border_mode=(bm, bm)), g=g, b=b))
    
    return layer, [w,g,b]

def make_conv_set(input, layer_sizes, num_filters, name, weights = None, filter_szs = None):
    assert(len(layer_sizes) == len(num_filters))
    
    vars_ = []
    layers_ = []
    current_layer = input
    
    for i in range(len(layer_sizes) - 1):
        input_size = layer_sizes[i]
        output_size = layer_sizes[i + 1]
        input_filters = num_filters[i]
        output_filters = num_filters[i + 1]
        
        if weights is not None:
            this_wts = weights[i * 3 : i * 3 + 3]
        else:
            this_wts = None
            
        if filter_szs is not None:
            filter_sz = filter_szs[i]
        else:
            filter_sz = 5
        
        layer, new_vars = make_conv_layer(current_layer, input_size, output_size, 
                                          input_filters, output_filters, name, i, 
                                          weights = this_wts, filter_sz = filter_sz)
        
        vars_ += new_vars
        layers_ += [layer]
        current_layer = layer
        
    return current_layer, vars_, layers_

In [ ]:
# inputs
X = T.tensor4()

## encode layer
e_layer_sizes = [32, 16, 8]
e_filter_sizes = [3, 256, 1024]

eX, e_params, e_layers = make_conv_set(X, e_layer_sizes, e_filter_sizes, "e")

## generative layer
g_layer_sizes = [8, 16, 32, 64, 128]
g_num_filters = [1024, 512, 256, 256, 128]

g_out, g_params, g_layers = make_conv_set(eX, g_layer_sizes, g_num_filters, "g")
gwx = gifn((128, nc, 5, 5), 'gwx')
g_params += [gwx]
gX = tanh(deconv(g_out, gwx, subsample=(1, 1), border_mode=(2, 2)))

## discrim layer(s)

df1 = 128
d_layer_sizes = [128, 64, 32, 16, 8]
d_filter_sizes = [3, df1, 2 * df1, 4 * df1, 8 * df1]

# the discriminator's final feature map is (8 * df1) channels x 8 x 10 (targets are 128 x 160), flattened before the sigmoid output
dwy = difn((df1 * 8 * 10 * 8, 1), 'dwy')

def discrim(input, name, weights=None):
    d_out, disc_params, d_layers = make_conv_set(input, d_layer_sizes, d_filter_sizes, name, weights = weights)
    d_flat = T.flatten(d_out, 2)
    
    disc_params += [dwy]
    y = sigmoid(T.dot(d_flat, dwy))
    
    return y, disc_params, d_layers

# target outputs
target = T.tensor4()

p_real, d_params, d_layers = discrim(target, "d")
#we need to make sure the p_gen params are the same as the p_real params
p_gen , d_params2, d_layers = discrim(gX, "d", weights=d_params)

In [21]:
# test everything working so far (errors are most likely size mismatches)
f = theano.function([X], p_gen)
f(input_transform(x_train)).shape


Out[21]:
(128, 1)

Next we set up the various cost functions we need


In [23]:
from theano.tensor.signal.downsample import max_pool_2d

## GAN costs
d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()
g_cost_d = bce(p_gen, T.ones(p_gen.shape)).mean()

## MSE encoding cost is done on an (averaged) downscaling of the image
target_pool = max_pool_2d(target, (4,4), mode="average_exc_pad",ignore_border=True)
target_flat = T.flatten(target_pool, 2)
gX_pool = max_pool_2d(gX, (4,4), mode="average_exc_pad",ignore_border=True)
gX_flat = T.flatten(gX_pool,2)
enc_cost = mse(gX_flat, target_flat).mean() 

## generator cost is a linear combination of the discrim cost and the MSE encoding cost
d_cost = d_cost_real + d_cost_gen
g_cost = g_cost_d + enc_cost / 100   ## if the enc_cost is weighted too highly it will take a long time to train

## N.B. e_cost and e_updates only minimise the MSE loss of the autoencoder (useful for debugging)
e_cost = enc_cost

cost = [g_cost_d, d_cost_real, enc_cost]

elrt = sharedX(0.002)
lrt = sharedX(lr)
d_updater = updates.Adam(lr=lrt, b1=b1, regularizer=updates.Regularizer(l2=l2))
g_updater = updates.Adam(lr=lrt, b1=b1, regularizer=updates.Regularizer(l2=l2))
e_updater = updates.Adam(lr=elrt, b1=b1, regularizer=updates.Regularizer(l2=l2))

d_updates = d_updater(d_params, d_cost)
g_updates = g_updater(e_params + g_params, g_cost)
e_updates = e_updater(e_params, e_cost)

In [24]:
print 'COMPILING'
t = time()
_train_g = theano.function([X, target], cost, updates=g_updates)
_train_d = theano.function([X, target], cost, updates=d_updates)
_train_e = theano.function([X, target], cost, updates=e_updates)
_get_cost = theano.function([X, target], cost)
print '%.2f seconds to compile theano functions'%(time()-t)


COMPILING
30.91 seconds to compile theano functions

Training code

Code for generating the images every 100 batches or so.


In [25]:
img_dir = "gen_images/"

if not os.path.exists(img_dir):
    os.makedirs(img_dir)

ae_encode = theano.function([X, target], [gX, target])

def inverse(X):
    # undo target_transform: NCHW float in [-1, 1]  ->  NHWC uint8 in [0, 255]
    X_pred = (X.transpose(0, 2, 3, 1) + 1) * 127.5
    X_pred = np.rint(X_pred).astype(int)
    X_pred = np.clip(X_pred, a_min = 0, a_max = 255)
    return X_pred.astype('uint8')


def save_sample_pictures():
    for te_train, te_target in test_stream.get_epoch_iterator():
        break
    te_out, te_ta = ae_encode(input_transform(te_train), target_transform(te_target))
    te_reshape = inverse(te_out)
    te_target_reshape = inverse(te_ta)

    # layout: 2 column-groups of (target, upscaled input, generated output), 12 rows each
    new_size = (128 * 6, 160 * 12)
    new_im = Image.new('RGB', new_size)
    # pick 24 distinct random examples from the batch
    r = np.random.choice(128, 24, replace=False).reshape(2,12)
    for i in range(2):
        for j in range(12):
            index =  r[i][j]
            
            target_im = Image.fromarray(te_target_reshape[index])
            train_im = Image.fromarray(te_train[index].repeat(axis=0,repeats=4).repeat(axis=1,repeats=4).astype(np.uint8))
            im = Image.fromarray(te_reshape[index])
            
            new_im.paste(target_im, (128 * i * 3, 160 * j))
            new_im.paste(train_im, (128 * (i * 3 + 1), 160 * j))
            new_im.paste(im, (128 * (i * 3 + 2), 160 * j))
    img_loc = "gen_images/%i.png" %int(time())     
    print "saving images to %s" %img_loc
    new_im.save(img_loc)
    
save_sample_pictures()

In [26]:
def mn(l):
    # mean of a list of costs; returns 0 for an empty list
    if sum(l) == 0:
        return 0
    return sum(l) / len(l)

## TODO : nicer way of coding these means?

def get_test_errors():
    print "getting test error"
    g_costs = []
    d_costs = []
    e_costs = []
    for i in range(20):
        # (re-)create the validation iterator when it is exhausted or doesn't exist yet
        try:
            x_train, x_target  = te_iterator.next()
        except:
            te_iterator = val_stream.get_epoch_iterator()
            x_train, x_target  = te_iterator.next()
        x = input_transform(x_train)
        t = target_transform(x_target)
        cost = _get_cost(x,t)
        g_cost, d_cost, enc_cost = cost
        g_costs.append(g_cost)
        d_costs.append(d_cost)
        e_costs.append(enc_cost)
        
    s= " ,".join(["test errors :", str(mn(g_costs)), str(mn(d_costs)), str(mn(e_costs))])
    return s

Train Model

Finally, we come to the actual training of the model. This code can be keyboard-interrupted; because the weights live in Theano shared variables they stay in memory, so we can stop, adjust and restart the training (this is how I got the model to train). For advice on training see the blog post at (#TODO)


In [ ]:
iterator = tr_stream.get_epoch_iterator()

# you may wish to reset the learning rate to something of your choosing if you feel it is too high/low
lrt.set_value(floatX(lr))   # use set_value so the already-compiled theano functions pick up the change

In [33]:
from time import time

n_updates = 0

n_epochs = 200

print "STARTING"



for epoch in range(n_epochs):
    
    
    tm = time()

    g_costs = []
    d_costs = []
    e_costs = []
    
    ## TODO : produces pretty ugly output, redo this?
    for i in tqdm(range(num_samples/128)):
        
        # restart the training iterator when the epoch stream is exhausted
        try:
            x_train, x_target  = iterator.next()
        except:
            iterator = tr_stream.get_epoch_iterator()
            x_train, x_target  = iterator.next()
        x = input_transform(x_train)
        t = target_transform(x_target)

        ## optional - change the criterion for how often we train the generator vs the discriminator
        if n_updates % 2 == 1:
            cost = _train_g(x,t) 
        else:
            cost = _train_d(x,t)
            
        # optional - only train the generator on MSE cost
        #cost = _train_e(x,t)
        g_cost, d_cost, enc_cost = cost
        g_costs.append(g_cost)
        d_costs.append(d_cost)
        e_costs.append(enc_cost)

        if n_updates % 100 == 0:
            s= " ,".join(["training errors :", str(mn(g_costs)), str(mn(d_costs)), str(mn(e_costs))])
            g_costs = []
            d_costs = []
            e_costs = []
            print get_test_errors()
            print s
            sys.stdout.flush()
            save_sample_pictures()
        n_updates += 1  

    print "epoch %i of %i took %.2f seconds" %(epoch, n_epochs, time() - tm)
    
    ## optional - reduce the learning rate as you go
    #lrt.set_value(floatX(lrt.get_value() * 0.95))
    #print lrt.get_value()
    
    
    sys.stdout.flush()


  0%|          | 0/72 [00:00<?, ?it/s]
STARTING
getting test error
test errors : ,48.5995481491 ,3.94193813801 ,645.511212158
training errors : ,6.17664718628 ,6.20431184769 ,654.965026855
                                               
epoch 0 of 200 took 308.92 seconds
 39%|███▉      | 28/72 [01:43<02:43,  0.27it/s]
getting test error
test errors : ,8.7343829155 ,4.93261101246 ,168.448952484
training errors : ,4.3123496329 ,1.67833978529 ,178.185795751
                                               
epoch 1 of 200 took 301.45 seconds
 78%|███████▊  | 56/72 [03:26<00:59,  0.27it/s]
getting test error
test errors : ,4.42765614986 ,2.67707374096 ,119.657383728
training errors : ,2.4958941113 ,1.4228379116 ,147.280639648
                                               
epoch 2 of 200 took 300.45 seconds
                                               
epoch 3 of 200 took 265.26 seconds
 17%|█▋        | 12/72 [00:44<03:41,  0.27it/s]
getting test error
test errors : ,5.20457861423 ,2.05324191451 ,116.56504097
training errors : ,2.26672356862 ,1.0022797596 ,117.856253991
 36%|███▌      | 26/72 [02:11<02:50,  0.27it/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-33-65f523c7e6dd> in <module>()
     32             cost = _train_g(x,t)
     33         else:
---> 34             cost = _train_d(x,t)
     35         #cost = _train_e(x,t)
     36         g_cost, d_cost, enc_cost = cost

/home/mike/envs/lasagne/lib/python2.7/site-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    857         t0_fn = time.time()
    858         try:
--> 859             outputs = self.fn()
    860         except Exception:
    861             if hasattr(self.fn, 'position_of_error'):

KeyboardInterrupt: 

Save weights if wanted

You can reuse them by passing the saved weights back into the make_conv_set method #TODO - actually try this! A rough (untested) sketch of the reload path follows the cell below.


In [ ]:
import pickle

all_params = [e_params, g_params, d_params]

pickle.dump(all_params, open("faces_dcgan.pkl", 'w'))
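The reload path is untested (hence the TODO above), but a minimal sketch of what it might look like: unpickle the saved parameter lists and pass them back through the weights argument of make_conv_set when rebuilding the graph. This assumes the pickled shared variables load back cleanly and reuses the layer-size lists defined earlier - treat it as a starting point rather than a verified recipe.


In [ ]:
# UNTESTED sketch: reload the pickled parameters and rebuild the encoder + generator
import pickle

e_params_saved, g_params_saved, d_params_saved = pickle.load(open("faces_dcgan.pkl"))

X = T.tensor4()

# make_conv_set consumes weights in groups of three (w, g, b) per layer,
# so the saved lists can be passed straight back in via the weights argument
eX, e_params, e_layers = make_conv_set(X, e_layer_sizes, e_filter_sizes, "e",
                                       weights=e_params_saved)

# gwx (the final deconv filter) was appended to g_params after make_conv_set,
# so split it off before passing the rest back in
g_out, g_params, g_layers = make_conv_set(eX, g_layer_sizes, g_num_filters, "g",
                                          weights=g_params_saved[:-1])
gwx = g_params_saved[-1]
gX = tanh(deconv(g_out, gwx, subsample=(1, 1), border_mode=(2, 2)))

_generate = theano.function([X], gX)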

In [ ]: