In [ ]:
import os
import tensorflow as tf
import numpy as np
from PIL import Image
from chainer.functions import caffe  # Chainer v1 layout; newer Chainer exposes this as chainer.links.caffe
import matplotlib.pyplot as plt
In [ ]:
# Helper functions
def load_image(path, size=None):
    """If size is None, load the image at its original size."""
    img = Image.open(os.path.expanduser(path)).convert("RGB")
    if size is not None:
        img = img.resize(size, Image.BILINEAR)
    return tf.constant(transform_for_train(np.array([np.array(img)[:, :, :3]], dtype=np.float32)))
def transform_for_train(img):
    """
    Swap the channel order from RGB to BGR, since the image is loaded as RGB
    while the parameters of VGG and similar models expect BGR.
    Also subtract the ImageNet mean pixel value (approximated here by 120).
    """
    return img[..., ::-1] - 120
def transform_from_train(img):
    """
    Inverse of transform_for_train.
    """
    data = img[:, :, ::-1] + 120
    return data.clip(0, 255)
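As a quick sanity check, transform_from_train undoes transform_for_train up to clipping; a minimal illustrative sketch (not part of the original pipeline):
In [ ]:
# Illustrative round trip with random in-range pixel values.
sample = np.random.randint(0, 256, size=(1, 4, 4, 3)).astype(np.float32)
restored = transform_from_train(transform_for_train(sample)[0])
print(np.allclose(restored, sample[0]))  # True for values already in [0, 255]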
In [ ]:
class Conv:
    """Wraps a convolution layer loaded by Chainer as a TensorFlow op."""
    def __init__(self, chainer_conv):
        W = chainer_conv.W.data
        b = chainer_conv.b.data
        # Chainer stores weights as (out, in, kh, kw); tf.nn.conv2d expects (kh, kw, in, out).
        self.W = tf.constant(np.transpose(W, [2, 3, 1, 0]))
        self.b = tf.constant(b)
    def __call__(self, x, stride=1, activation_fn=tf.nn.relu, padding="SAME"):
        y = tf.nn.conv2d(x, self.W, strides=[1, stride, stride, 1], padding=padding) + self.b
        return activation_fn(y) if activation_fn else y
def pool(x, ksize, stride, padding="SAME"):
return tf.nn.max_pool(x, ksize=[1, ksize, ksize, 1],
strides=[1, stride, stride, 1],
padding=padding)
def load_caffemodel(caffemodel):
print("load model... %s" % caffemodel)
model = caffe.CaffeFunction(caffemodel)
return lambda layer_name: Conv(getattr(model, layer_name))
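The transpose above is the key detail when porting weights: Chainer's layout is (out_channels, in_channels, kh, kw), while tf.nn.conv2d expects (kh, kw, in_channels, out_channels). A minimal sketch with made-up shapes:
In [ ]:
# Hypothetical 3x3 convolution with 8 input and 16 output channels.
W_chainer = np.zeros((16, 8, 3, 3), dtype=np.float32)  # (out, in, kh, kw)
W_tf = np.transpose(W_chainer, [2, 3, 1, 0])           # (kh, kw, in, out)
print(W_tf.shape)  # (3, 3, 8, 16)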
In [ ]:
class BaseModel:
    """
    Abstract base class for feature-extraction models.
    """
default_caffemodel = None
default_alpha = None
default_beta = None
def __init__(self, caffemodel=None, alpha=None, beta=None):
self.conv = load_caffemodel(caffemodel or self.default_caffemodel)
self.alpha = alpha or self.default_alpha
self.beta = beta or self.default_beta
class NIN(BaseModel):
    """
    Features computed with NIN.
    """
default_caffemodel = "nin_imagenet.caffemodel"
default_alpha = [0., 0., 1., 1.]
default_beta = [1., 1., 1., 1.]
def __call__(self, x):
"""NINの特徴量"""
x0 = self.conv("conv1")(x, stride=4, padding="VALID")
y1 = self.conv("cccp2")(self.conv("cccp1")(x0), activation_fn=None)
pool1 = pool(tf.nn.relu(y1), ksize=3, stride=2)
x1 = self.conv("conv2")(pool1, stride=1)
y2 = self.conv("cccp4")(self.conv("cccp3")(x1), activation_fn=None)
pool2 = pool(tf.nn.relu(y2), ksize=3, stride=2)
x2 = self.conv("conv3")(pool2, stride=1)
y3 = self.conv("cccp6")(self.conv("cccp5")(x2), activation_fn=None)
pool3 = pool(tf.nn.relu(y3), ksize=3, stride=2)
drop = tf.nn.dropout(pool3, 0.5)
x3 = self.conv("conv4-1024")(drop)
return [x0, x1, x2, x3]
class VGG(BaseModel):
    """
    Features computed with VGG.
    """
default_caffemodel = "VGG_ILSVRC_16_layers.caffemodel"
default_alpha = [0., 0., 1., 1.]
default_beta = [1., 1., 1., 1.]
def __call__(self, x):
"""VGGの特徴量"""
y1 = self.conv("conv1_2")(self.conv("conv1_1")(x), activation_fn=None)
x1 = pool(tf.nn.relu(y1), ksize=2, stride=2)
y2 = self.conv("conv2_2")(self.conv("conv2_1")(x1), activation_fn=None)
x2 = pool(tf.nn.relu(y2), ksize=2, stride=2)
y3 = self.conv("conv3_3")(self.conv("conv3_2")(self.conv("conv3_1")(x2)), activation_fn=None)
x3 = pool(tf.nn.relu(y3), ksize=2, stride=2)
y4 = self.conv("conv4_3")(self.conv("conv4_2")(self.conv("conv4_1")(x3)), activation_fn=None)
return [y1, y2, y3, y4]
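Either model returns a plain list of feature tensors, so the shapes are easy to inspect; an illustrative usage sketch (assumes nin_imagenet.caffemodel is present in the working directory):
In [ ]:
# Illustrative only: print the shape of each feature map NIN returns.
dummy = tf.constant(np.zeros([1, 300, 300, 3], dtype=np.float32))
for feat in NIN()(dummy):
    print(feat.get_shape().as_list())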
In [ ]:
def style_matrix(y):
    """Gram matrix representing the style."""
    _, h, w, c = y.get_shape().as_list()
    y_reshaped = tf.reshape(y, [-1, h * w, c])
    if tf.__version__[0] == '1':
        return tf.matmul(y_reshaped, y_reshaped, adjoint_a=True) / (h * w * c)
    elif tf.__version__[0] == '0':
        return tf.batch_matmul(y_reshaped, y_reshaped, adj_x=True) / (h * w * c)
    else:
        raise RuntimeError("unsupported TensorFlow version: %s" % tf.__version__)
y: a tensor of shape [batch_size, h, w, c] (in practice, batch_size = 1)
style_matrix(y): a tensor of shape [batch_size, c, c] (effectively a $c\times c$ square matrix)
Writing $y_{ijk}$ = y[0, i, j, k], the function computes
$$ \mathrm{style}(y)_{0ij} = \frac{1}{hwc}\sum_{p,q} y_{pqi}y_{pqj}. $$ The division by $hwc$ keeps this value from growing too large.
The formula is easier to understand when compared with a covariance matrix.
Interpret an $m\times n$ matrix $\{X_{ij}\}$ as $m$ vectors of dimension $n$, namely $(X_{11}, X_{12}, \dots, X_{1n})$, ..., $(X_{m1}, X_{m2}, \dots, X_{mn})$. Their covariance matrix is $$ \frac{1}{m}\sum_k X_{ki}X_{kj} - \frac{1}{m^2}\sum_k X_{ki}\sum_k X_{kj}. $$ The first term of this expression corresponds to the quantity computed by style_matrix.
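The same quantity is easy to reproduce with plain NumPy; a minimal sketch with made-up shapes, for batch_size = 1:
In [ ]:
# Illustrative NumPy equivalent of style_matrix.
h, w, c = 5, 5, 4
y = np.random.randn(1, h, w, c).astype(np.float32)
y_flat = y.reshape(h * w, c)               # rows: spatial positions, columns: channels
gram = y_flat.T.dot(y_flat) / (h * w * c)  # shape (c, c)
print(gram.shape)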
In [ ]:
class Generator:
def __init__(self, base_model, img_orig, img_style, config):
        # Extract features from the content and style images
mids_orig = base_model(img_orig)
mids_style = base_model(img_style)
        # Build the style (Gram) matrices used in the loss function
prods_style = [style_matrix(y) for y in mids_style]
        # Initialize the generated image img_gen
img_gen = tf.Variable(tf.random_uniform(config.output_shape, -20, 20))
self.img_gen = img_gen
mids = base_model(img_gen)
self.loss_orig = []
self.loss_style = []
for mid, mid_orig in zip(mids, mids_orig):
shape = mid.get_shape().as_list()
self.loss_orig.append(tf.nn.l2_loss(mid - mid_orig) / np.prod(shape))
for mid, prod_style in zip(mids, prods_style):
shape = prod_style.get_shape().as_list()
self.loss_style.append(tf.nn.l2_loss(style_matrix(mid) - prod_style) / np.prod(shape))
total_loss = 0
for l, a in zip(self.loss_orig, base_model.alpha):
if a != 0:
total_loss += l * (a * config.lam)
for l, b in zip(self.loss_style, base_model.beta):
if b != 0:
total_loss += l * b
self.total_loss = total_loss
self.total_train = config.optimizer.minimize(self.total_loss)
        clipped = tf.clip_by_value(self.img_gen, -120., 135.)  # keep pixel values within 0-255 (after the +120 shift in transform_from_train)
self.clip = tf.assign(self.img_gen, clipped)
def generate(self, config):
with tf.Session() as sess:
if hasattr(tf, "global_variables_initializer"):
sess.run(tf.global_variables_initializer())
else:
sess.run(tf.initialize_all_variables())
print("start")
            # Start training
for i in range(config.iteration):
sess.run([self.total_train, self.clip])
if (i + 1) % 50 == 0:
# l, l1, l2 = sess.run([self.total_loss, self.loss_orig, self.loss_style])
# print("%d| loss: %f, loss_orig: %f, loss_style: %f" % (i + 1, l, sum(l1), sum(l2)))
# for l1_, l2_ in zip(l1, l2):
# print("loss_orig: %f, loss_style: %f" % (l1_, l2_))
self.save_image(sess, config.save_path % (i + 1))
def save_image(self, sess, path):
data = sess.run(self.img_gen)[0]
data = transform_from_train(data)
img = Image.fromarray(data.astype(np.uint8))
plt.imshow(img)
plt.show()
print("save %s" % path)
img.save(path)
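To summarize what Generator minimizes: with content losses $L^{\mathrm{orig}}_i$, style losses $L^{\mathrm{style}}_i$, and the per-layer weights $\alpha_i$, $\beta_i$ supplied by the model,
$$ \mathrm{total\_loss} = \lambda \sum_i \alpha_i L^{\mathrm{orig}}_i + \sum_i \beta_i L^{\mathrm{style}}_i, $$
where $\lambda$ is config.lam and layers with zero weight are skipped.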
In [ ]:
def generate_model(model_name, **args):
    if model_name == 'nin':
        return NIN(**args)
    if model_name == 'vgg':
        return VGG(**args)
    raise ValueError("unknown model name: %s" % model_name)
In [ ]:
# Configuration
class Config:
    batch_size = 1
    iteration = 5000
    lr = 1.0     # learning rate
    lam = 0.05   # balance between content and style; larger values stay closer to the original image
    width = 300  # width of the generated image; smaller is coarser but faster
    height = 300 # height of the generated image; smaller is coarser but faster
    output_shape = [1, height, width, 3]
    output_dir = "_output"
    model = "nin"
    # model = "vgg"
    original_image = "./images/cat.png"  # set the path to the content image here
    style_image = "./images/gogh.png"    # set the path to the style image here
    save_path = os.path.expanduser(os.path.join(output_dir, "%05d.png"))
    optimizer = tf.train.AdamOptimizer(lr)
    no_resize_style = False  # if True, use the style image without resizing (start-up becomes slower)
In [ ]:
config = Config()
os.makedirs(config.output_dir, exist_ok=True)
img_orig = load_image(config.original_image, [config.width, config.height])
img_style = load_image(config.style_image, [config.width, config.height] if not config.no_resize_style else None)
In [ ]:
model = generate_model(config.model)
# Loading a model takes a while.
# nin = generate_model('nin')
# vgg = generate_model('vgg')
In [ ]:
# Generate the image
generator = Generator(model, img_orig, img_style, config)
generator.generate(config)