Intro

Exploratory notebook on the theory and concepts behind style transfer with CNNs, including implementations and visualizations of toy examples.

(FastAI - Lesson 8)

Style Transfer

Style transfer generates a new image as a weighted combination of a target visual style and a target semantic content. The process optimizes both style and content objectives by iteratively refining the input image itself; it uses activations extracted from the internal layers of an already trained CNN to represent the content, and statistics of those activations to represent the style.
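
Concretely, starting from a noise image x, the optimization minimizes with respect to the pixels of x a weighted sum of the two objectives: loss(x) = w_c * ||F_l(x) - F_l(content)||^2 + sum_l w_l * ||G_l(x) - G_l(style)||^2, where F_l denotes the activations of a chosen layer l and G_l their Gram matrix.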


In [ ]:
from __future__ import print_function

import time
from PIL import Image
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

import keras
from keras import backend as K
from keras.models import Model
from keras import metrics
from keras.applications.vgg16 import VGG16

import scipy
from scipy.optimize import fmin_l_bfgs_b
# note: scipy.misc.imsave was deprecated in SciPy 1.0 and removed in 1.2;
# on recent versions use imageio.imwrite instead
from scipy.misc import imsave

#backend.set_image_data_format('channels_last')
#keras.backend.set_image_dim_ordering('tf')

import os
import sys
sys.path.append(os.path.join(os.getcwd(), os.pardir))

from utils.vgg_utils import preprocess, deprocess, gram_matrix
from utils.vgg16_avg import VGG16_Avg

RES_DIR = os.path.join('resources')

%load_ext autoreload
%autoreload 2
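
The preprocess and deprocess helpers come from utils.vgg_utils and are not shown in this notebook. Presumably they implement the standard VGG input pipeline (subtract the ImageNet channel means and flip RGB to BGR, plus the inverse transform); a minimal sketch under that assumption:


In [ ]:
# illustrative only: assumed behavior of utils.vgg_utils.preprocess/deprocess
vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32)

def preprocess(x):
    # subtract ImageNet channel means and convert RGB -> BGR
    return (x - vgg_mean)[:, :, :, ::-1]

def deprocess(x, shape):
    # undo the BGR flip and mean subtraction, then clip to valid pixel values
    return np.clip(x.reshape(shape)[:, :, :, ::-1] + vgg_mean, 0, 255)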

Load Data


In [ ]:
height = 256
width = 256

In [ ]:
# load content image
content_image = None
with Image.open(os.path.join(RES_DIR, 'superman.jpg')) as img:
    # note: PIL's resize takes (width, height); the two coincide here
    img = img.resize((width, height))
    content_image = np.asarray(img, dtype='float32')
    plt.imshow(img.convert(mode='RGB'))
    plt.show()

In [ ]:
# load style image
style_image = None
with Image.open(os.path.join(RES_DIR, 'comics_style.jpg')) as img:
    # note: PIL's resize takes (width, height); the two coincide here
    img = img.resize((width, height))
    style_image = np.asarray(img, dtype='float32')
    plt.imshow(img.convert(mode='RGB'))
    plt.show()

In [ ]:
content_image.shape

Recreate Input

In this first step I simply recreate an image from noise using the content loss: the distance between the VGG activations of the generated image and those of the target image at a chosen layer.


In [ ]:
# define input image (the style image here, but any image works: we are only recreating the network input)
img_arr = preprocess(np.expand_dims(style_image, axis=0))
#img_arr = preproc(np.expand_dims(np.array(Image.open(os.path.join(RES_DIR, 'simpsons_style.jpg'))), axis=0))
shp = img_arr.shape

print(shp)

In [ ]:
# get VGG model
model = VGG16(include_top=False)

In [ ]:
# define layer model (VGG model input and intermediate layer output)
layer = model.get_layer('block5_conv1').output
layer_model = Model(model.input, layer)
targ = K.variable(layer_model.predict(img_arr))

In [ ]:
# define our loss and gradients
loss = metrics.mse(layer, targ)
grads = K.gradients(loss, model.input)
fn = K.function([model.input], [loss]+grads)

In [ ]:
# utility class to hold loss and gradients: fmin_l_bfgs_b requests the loss and
# the gradients through two separate callbacks, while our Keras function
# computes both in a single pass, so loss() caches the gradients for grads()
class Evaluator(object):
    def __init__(self, f, shp): self.f, self.shp = f, shp

    def loss(self, x):
        loss_, self.grad_values = self.f([x.reshape(self.shp)])
        return loss_.astype(np.float64)

    def grads(self, x): return self.grad_values.flatten().astype(np.float64)

In [ ]:
evaluator = Evaluator(fn, shp)

In [ ]:
# run the L-BFGS optimization process and save the result image at each iteration
def solve_image(eval_obj, iterations, x, img_shape, dest_dir=''):
    if dest_dir and not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    for i in range(iterations):
        start_time = time.time()
        # maxfun bounds the number of loss/gradient evaluations per iteration
        x, min_val, info = fmin_l_bfgs_b(eval_obj.loss, x.flatten(),
                                         fprime=eval_obj.grads, maxfun=20)
        # keep values in the range expected by the deprocessing step
        x = np.clip(x, -127, 127)
        print('Current loss value:', min_val)
        end_time = time.time()
        print('Iteration {} completed in {:.1f}s'.format(i, end_time - start_time))
        img = deprocess(x.copy(), img_shape)[0]
        img_filepath = os.path.join(dest_dir, "res_at_iteration_{}.png".format(i))
        imsave(img_filepath, img)
    return x

In [ ]:
x = np.random.uniform(-2.5, 2.5, shp)
#x = np.random.uniform(0, 255, shp) - 128.
plt.imshow(x[0]);

In [ ]:
x = solve_image(evaluator, 5, x, shp, dest_dir=os.path.join('results', 'recreate_input'))

In [ ]:
plt.imshow(deprocess(x, shp)[0].astype('uint8'))

Recreate Style

In the previous section we recreated the image itself from noise; here we recreate only its style, matching Gram matrices of the feature activations rather than the activations themselves.


In [ ]:
# load and process style image (keep only the first 3 channels in case of RGBA)
style_arr = preprocess(np.expand_dims(style_image, axis=0)[:,:,:,:3])
shp = style_arr.shape

print(shp)

In [ ]:
# get VGG model
#model = VGG16(include_top=False, pooling='avg', input_shape=shp[1:]) #input_tensor=input_tensor
model = VGG16_Avg(include_top=False, input_shape=shp[1:])
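
VGG16_Avg is assumed to mirror the stock VGG16 but with the max-pooling layers replaced by average pooling; Gatys et al. report that average pooling yields slightly more appealing generated images.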

In [ ]:
model.summary()


In [ ]:
outputs = {l.name: l.output for l in model.layers}
layers = [outputs['block{}_conv1'.format(o)] for o in range(1,3)]

In [ ]:
layers_model = Model(model.input, layers)
targs = [K.variable(o) for o in layers_model.predict(style_arr)]

In [ ]:
def style_loss(x, targ):
    return metrics.mse(gram_matrix(x), gram_matrix(targ))
    #S = gram_matrix(style)
    #C = gram_matrix(combination)
    #channels = 3
    #size = height * width
    #return K.sum(K.square(S - C)) / (4. * (channels ** 2) * (size ** 2))
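
gram_matrix also comes from utils.vgg_utils. The usual definition flattens each channel into a vector and takes all pairwise dot products, discarding the spatial layout so that only feature co-occurrences (the "style") remain; a sketch, assuming the standard Keras-backend implementation:


In [ ]:
# illustrative only: assumed behavior of utils.vgg_utils.gram_matrix
def gram_matrix(x):
    # move channels first and flatten each channel into a row vector
    features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
    # pairwise channel inner products, normalized by the number of elements
    return K.dot(features, K.transpose(features)) / x.get_shape().num_elements()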

In [ ]:
loss = sum(style_loss(l1[0], l2[0]) for l1,l2 in zip(layers, targs))
grads = K.gradients(loss, model.input)
style_fn = K.function([model.input], [loss]+grads)
evaluator = Evaluator(style_fn, shp)

In [ ]:
rand_img = lambda shape: np.random.uniform(-2.5, 2.5, shape)
x = rand_img(shp)
#x = scipy.ndimage.filters.gaussian_filter(x, [0,2,2,0])

In [ ]:
plt.imshow(x[0]);

In [ ]:
iterations=10
x = rand_img(shp)
x = solve_image(evaluator, iterations, x, shp, dest_dir=os.path.join('results', 'recreate_style'))

Style Transfer

Here we finally use both the content and style images to perform the style transfer task, minimizing the sum of the two losses defined in the previous sections.


In [ ]:
# load and process input content
content_arr = preprocess(np.expand_dims(content_image, axis=0))
style_arr = preprocess(np.expand_dims(style_image, axis=0))
shp = content_arr.shape

print(content_arr.shape)
print(style_arr.shape)

In [ ]:
# get VGG model
# note: keras.applications' pooling='avg' only adds a global pooling layer at the
# output; it does not replace the internal max pooling (that is what VGG16_Avg does)
model = VGG16(include_top=False, input_shape=shp[1:])

In [ ]:
outputs = {l.name: l.output for l in model.layers}
style_layers = [outputs['block{}_conv2'.format(o)] for o in range(1,6)]
content_name = 'block4_conv2'
content_layer = outputs[content_name]

In [ ]:
style_model = Model(model.input, style_layers)
style_targs = [K.variable(o) for o in style_model.predict(style_arr)]

In [ ]:
content_model = Model(model.input, content_layer)
content_targ = K.variable(content_model.predict(content_arr))

In [ ]:
style_wgts = [0.05,0.2,0.2,0.25,0.3]
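
The weights increase with depth, so the coarser, larger-scale style structures captured by the later blocks count more towards the loss than the fine textures of the early blocks.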

In [ ]:
loss = sum(style_loss(l1[0], l2[0])*w
           for l1,l2,w in zip(style_layers, style_targs, style_wgts))
loss += metrics.mse(content_layer, content_targ)/2
grads = K.gradients(loss, model.input)
transfer_fn = K.function([model.input], [loss]+grads)

In [ ]:
evaluator = Evaluator(transfer_fn, shp)

In [ ]:
iterations=10
x = np.random.uniform(-2.5, 2.5, shp)
plt.imshow(x[0]);

In [ ]:
x = solve_image(evaluator, iterations, x, shp, dest_dir=os.path.join('results', 'style_transfer'))

Different Approach

See the Keras neural style transfer example.

Feed a single batch containing the content, style, and combination images directly to the network, reading all the required features from one forward pass; the previous approach instead built two separate models and combined their losses.


In [ ]:
# wrap the preprocessed arrays as backend variables and create a placeholder
# for the combination image we will optimize (renamed so that they don't shadow
# the content_image/style_image arrays loaded earlier)
content_input = K.variable(content_arr)
style_input = K.variable(style_arr)
combination_image = K.placeholder((1, height, width, 3))
#if K.image_data_format() == 'channels_first':
#    combination_image = K.placeholder((1, 3, height, width))
#else:
#    combination_image = K.placeholder((1, height, width, 3))

# single batch of three images: content, style and combination
input_tensor = K.concatenate([content_input,
                              style_input,
                              combination_image], axis=0)
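
The draft is missing the cell that builds the network on this concatenated batch and exposes its layers by name (the loss cells below rely on a layers dict). A minimal sketch, following the Keras example:


In [ ]:
# build VGG16 on the concatenated batch: every layer output now holds the
# features of the content (row 0), style (row 1) and combination (row 2) images
model = VGG16(input_tensor=input_tensor, include_top=False)
layers = {l.name: l.output for l in model.layers}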

In [ ]:
content_weight = 0.025
style_weight = 5.0
total_variation_weight = 1.0

In [ ]:
loss = K.variable(0.)

In [ ]:
# content loss: squared error between content and combination features
def content_loss(content, combination):
    return K.sum(K.square(combination - content))

layer_features = layers['block2_conv2']
content_image_features = layer_features[0, :, :, :]
combination_features = layer_features[2, :, :, :]

loss += content_weight * content_loss(content_image_features,
                                      combination_features)

In [ ]:
# style loss: reuse the Gram-matrix based style_loss defined above on the
# style (row 1) and combination (row 2) features of each chosen layer
feature_layers = ['block1_conv2', 'block2_conv2',
                  'block3_conv3', 'block4_conv3',
                  'block5_conv3']
for layer_name in feature_layers:
    layer_features = layers[layer_name]
    style_features = layer_features[1, :, :, :]
    combination_features = layer_features[2, :, :, :]
    sl = style_loss(style_features, combination_features)
    loss += (style_weight / len(feature_layers)) * sl

In [ ]:
# total variation loss: encourages spatial smoothness in the combination image
def total_variation_loss(x):
    a = K.square(x[:, :height-1, :width-1, :] - x[:, 1:, :width-1, :])
    b = K.square(x[:, :height-1, :width-1, :] - x[:, :height-1, 1:, :])
    return K.sum(K.pow(a + b, 1.25))

loss += total_variation_weight * total_variation_loss(combination_image)

In [ ]:
grads = K.gradients(loss, combination_image)

In [ ]:
outputs = [loss]
outputs += grads
f_outputs = K.function([combination_image], outputs)

def eval_loss_and_grads(x):
    x = x.reshape((1, height, width, 3))
    outs = f_outputs([x])
    loss_value = outs[0]
    if len(outs[1:]) == 1:
        grad_values = outs[1].flatten().astype('float64')
    else:
        grad_values = np.array(outs[1:]).flatten().astype('float64')
    return loss_value, grad_values

In [ ]:
x = np.random.uniform(0, 255, (1, height, width, 3)) - 128.
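
The draft stops before actually running the optimization. Since f_outputs returns [loss] + grads just like the functions above, one way to finish (the results directory name is illustrative) is to reuse the Evaluator/solve_image machinery instead of eval_loss_and_grads:


In [ ]:
evaluator = Evaluator(f_outputs, (1, height, width, 3))
x = solve_image(evaluator, 10, x, (1, height, width, 3),
                dest_dir=os.path.join('results', 'different_approach'))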