In [ ]:
import os
import tensorflow as tf
import numpy as np
from PIL import Image
from chainer.functions import caffe  # Chainer v1 layout; newer Chainer exposes this as chainer.links.caffe
import matplotlib.pyplot as plt
In [ ]:
# Helper functions
def load_image(path, size=None):
    """If size is None, load the image at its original size."""
    img = Image.open(os.path.expanduser(path)).convert("RGB")
    if size is not None:
        img = img.resize(size, Image.BILINEAR)
    return tf.constant(transform_for_train(np.array([np.array(img)[:, :, :3]], dtype=np.float32)))
def transform_for_train(img):
    """
    Swap the channel order from RGB to BGR, since the image is loaded as RGB
    while the parameters of VGG and similar models expect BGR.
    Also subtract the ImageNet mean pixel value (approximated here by 120).
    """
    return img[..., ::-1] - 120
def transform_from_train(img):
    """
    Inverse of transform_for_train.
    """
    data = img[:, :, ::-1] + 120
    return data.clip(0, 255)
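As a quick sanity check, transform_from_train undoes transform_for_train up to clipping; a minimal illustrative sketch (not part of the original pipeline):
In [ ]:
# Illustrative round trip with random in-range pixel values.
sample = np.random.randint(0, 256, size=(1, 4, 4, 3)).astype(np.float32)
restored = transform_from_train(transform_for_train(sample)[0])
print(np.allclose(restored, sample[0]))  # True for values already in [0, 255]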
In [ ]:
class Conv:
    """Wraps a convolution layer loaded by Chainer as a TensorFlow op."""
    def __init__(self, chainer_conv):
        W = chainer_conv.W.data
        b = chainer_conv.b.data
        # Chainer stores weights as (out, in, kh, kw); tf.nn.conv2d expects (kh, kw, in, out).
        self.W = tf.constant(np.transpose(W, [2, 3, 1, 0]))
        self.b = tf.constant(b)
    def __call__(self, x, stride=1, activation_fn=tf.nn.relu, padding="SAME"):
        y = tf.nn.conv2d(x, self.W, strides=[1, stride, stride, 1], padding=padding) + self.b
        return activation_fn(y) if activation_fn else y
def pool(x, ksize, stride, padding="SAME"):
return tf.nn.max_pool(x, ksize=[1, ksize, ksize, 1],
strides=[1, stride, stride, 1],
padding=padding)
def load_caffemodel(caffemodel):
print("load model... %s" % caffemodel)
model = caffe.CaffeFunction(caffemodel)
return lambda layer_name: Conv(getattr(model, layer_name))
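The transpose above is the key detail when porting weights: Chainer's layout is (out_channels, in_channels, kh, kw), while tf.nn.conv2d expects (kh, kw, in_channels, out_channels). A minimal sketch with made-up shapes:
In [ ]:
# Hypothetical 3x3 convolution with 8 input and 16 output channels.
W_chainer = np.zeros((16, 8, 3, 3), dtype=np.float32)  # (out, in, kh, kw)
W_tf = np.transpose(W_chainer, [2, 3, 1, 0])           # (kh, kw, in, out)
print(W_tf.shape)  # (3, 3, 8, 16)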
In [ ]:
class BaseModel:
    """
    Abstract base class for feature-extraction models.
    """
default_caffemodel = None
default_alpha = None
default_beta = None
def __init__(self, caffemodel=None, alpha=None, beta=None):
self.conv = load_caffemodel(caffemodel or self.default_caffemodel)
self.alpha = alpha or self.default_alpha
self.beta = beta or self.default_beta
class NIN(BaseModel):
    """
    Features computed with NIN.
    """
default_caffemodel = "nin_imagenet.caffemodel"
default_alpha = [0., 0., 1., 1.]
default_beta = [1., 1., 1., 1.]
def __call__(self, x):
"""NINの特徴量"""
x0 = self.conv("conv1")(x, stride=4, padding="VALID")
y1 = self.conv("cccp2")(self.conv("cccp1")(x0), activation_fn=None)
pool1 = pool(tf.nn.relu(y1), ksize=3, stride=2)
x1 = self.conv("conv2")(pool1, stride=1)
y2 = self.conv("cccp4")(self.conv("cccp3")(x1), activation_fn=None)
pool2 = pool(tf.nn.relu(y2), ksize=3, stride=2)
x2 = self.conv("conv3")(pool2, stride=1)
y3 = self.conv("cccp6")(self.conv("cccp5")(x2), activation_fn=None)
pool3 = pool(tf.nn.relu(y3), ksize=3, stride=2)
drop = tf.nn.dropout(pool3, 0.5)
x3 = self.conv("conv4-1024")(drop)
return [x0, x1, x2, x3]
class VGG(BaseModel):
    """
    Features computed with VGG.
    """
default_caffemodel = "VGG_ILSVRC_16_layers.caffemodel"
default_alpha = [0., 0., 1., 1.]
default_beta = [1., 1., 1., 1.]
def __call__(self, x):
"""VGGの特徴量"""
y1 = self.conv("conv1_2")(self.conv("conv1_1")(x), activation_fn=None)
x1 = pool(tf.nn.relu(y1), ksize=2, stride=2)
y2 = self.conv("conv2_2")(self.conv("conv2_1")(x1), activation_fn=None)
x2 = pool(tf.nn.relu(y2), ksize=2, stride=2)
y3 = self.conv("conv3_3")(self.conv("conv3_2")(self.conv("conv3_1")(x2)), activation_fn=None)
x3 = pool(tf.nn.relu(y3), ksize=2, stride=2)
y4 = self.conv("conv4_3")(self.conv("conv4_2")(self.conv("conv4_1")(x3)), activation_fn=None)
return [y1, y2, y3, y4]
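Either model returns a plain list of feature tensors, so the shapes are easy to inspect; an illustrative usage sketch (assumes nin_imagenet.caffemodel is present in the working directory):
In [ ]:
# Illustrative only: print the shape of each feature map NIN returns.
dummy = tf.constant(np.zeros([1, 300, 300, 3], dtype=np.float32))
for feat in NIN()(dummy):
    print(feat.get_shape().as_list())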
In [ ]:
def style_matrix(y):
    """Gram matrix representing the style."""
    _, h, w, c = y.get_shape().as_list()
    y_reshaped = tf.reshape(y, [-1, h * w, c])
    if tf.__version__[0] == '1':
        return tf.matmul(y_reshaped, y_reshaped, adjoint_a=True) / (h * w * c)
    elif tf.__version__[0] == '0':
        return tf.batch_matmul(y_reshaped, y_reshaped, adj_x=True) / (h * w * c)
    else:
        raise RuntimeError("unsupported TensorFlow version: %s" % tf.__version__)
y: a tensor of shape [batch_size, h, w, c] (in practice, batch_size = 1)
style_matrix(y): a tensor of shape [batch_size, c, c] (effectively a $c\times c$ square matrix)
Writing $y_{ijk}$ = y[0, i, j, k], the function computes
$$ \mathrm{style}(y)_{0ij} = \frac{1}{hwc}\sum_{p,q} y_{pqi}y_{pqj}. $$ The division by $hwc$ keeps this value from growing too large.
The formula is easier to understand when compared with a covariance matrix.
Interpret an $m\times n$ matrix $\{X_{ij}\}$ as $m$ vectors of dimension $n$, namely $(X_{11}, X_{12}, \dots, X_{1n})$, ..., $(X_{m1}, X_{m2}, \dots, X_{mn})$. Their covariance matrix is $$ \frac{1}{m}\sum_k X_{ki}X_{kj} - \frac{1}{m^2}\sum_k X_{ki}\sum_k X_{kj}. $$ The first term of this expression corresponds to the quantity computed by style_matrix.
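The same quantity is easy to reproduce with plain NumPy; a minimal sketch with made-up shapes, for batch_size = 1:
In [ ]:
# Illustrative NumPy equivalent of style_matrix.
h, w, c = 5, 5, 4
y = np.random.randn(1, h, w, c).astype(np.float32)
y_flat = y.reshape(h * w, c)               # rows: spatial positions, columns: channels
gram = y_flat.T.dot(y_flat) / (h * w * c)  # shape (c, c)
print(gram.shape)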
In [ ]:
class Generator:
def __init__(self, base_model, img_orig, img_style, config):
        # Extract features from the content and style images
mids_orig = base_model(img_orig)
mids_style = base_model(img_style)
        # Build the style (Gram) matrices used in the loss function
prods_style = [style_matrix(y) for y in mids_style]
        # Initialize the generated image img_gen
img_gen = tf.Variable(tf.random_uniform(config.output_shape, -20, 20))
self.img_gen = img_gen
mids = base_model(img_gen)
self.loss_orig = []
self.loss_style = []
for mid, mid_orig in zip(mids, mids_orig):
shape = mid.get_shape().as_list()
self.loss_orig.append(tf.nn.l2_loss(mid - mid_orig) / np.prod(shape))
for mid, prod_style in zip(mids, prods_style):
shape = prod_style.get_shape().as_list()
self.loss_style.append(tf.nn.l2_loss(style_matrix(mid) - prod_style) / np.prod(shape))
total_loss = 0
for l, a in zip(self.loss_orig, base_model.alpha):
if a != 0:
total_loss += l * (a * config.lam)
for l, b in zip(self.loss_style, base_model.beta):
if b != 0:
total_loss += l * b
self.total_loss = total_loss
self.total_train = config.optimizer.minimize(self.total_loss)
        clipped = tf.clip_by_value(self.img_gen, -120., 135.)  # keep pixel values within 0-255 (after the +120 shift in transform_from_train)
self.clip = tf.assign(self.img_gen, clipped)
def generate(self, config):
with tf.Session() as sess:
if hasattr(tf, "global_variables_initializer"):
sess.run(tf.global_variables_initializer())
else:
sess.run(tf.initialize_all_variables())
print("start")
            # Start training
for i in range(config.iteration):
sess.run([self.total_train, self.clip])
if (i + 1) % 50 == 0:
# l, l1, l2 = sess.run([self.total_loss, self.loss_orig, self.loss_style])
# print("%d| loss: %f, loss_orig: %f, loss_style: %f" % (i + 1, l, sum(l1), sum(l2)))
# for l1_, l2_ in zip(l1, l2):
# print("loss_orig: %f, loss_style: %f" % (l1_, l2_))
self.save_image(sess, config.save_path % (i + 1))
def save_image(self, sess, path):
data = sess.run(self.img_gen)[0]
data = transform_from_train(data)
img = Image.fromarray(data.astype(np.uint8))
plt.imshow(img)
plt.show()
print("save %s" % path)
img.save(path)
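To summarize what Generator minimizes: with content losses $L^{\mathrm{orig}}_i$, style losses $L^{\mathrm{style}}_i$, and the per-layer weights $\alpha_i$, $\beta_i$ supplied by the model,
$$ \mathrm{total\_loss} = \lambda \sum_i \alpha_i L^{\mathrm{orig}}_i + \sum_i \beta_i L^{\mathrm{style}}_i, $$
where $\lambda$ is config.lam and layers with zero weight are skipped.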
In [ ]:
def generate_model(model_name, **args):
    if model_name == 'nin':
        return NIN(**args)
    if model_name == 'vgg':
        return VGG(**args)
    raise ValueError("unknown model name: %s" % model_name)
In [ ]:
# Configuration
class Config:
    batch_size = 1
    iteration = 5000
    lr = 1.0     # learning rate
    lam = 0.05   # balance between content and style; larger values stay closer to the original image
    width = 300  # width of the generated image; smaller is coarser but faster
    height = 300 # height of the generated image; smaller is coarser but faster
    output_shape = [1, height, width, 3]
    output_dir = "_output"
    model = "nin"
    # model = "vgg"
    original_image = "./images/cat.png"  # set the path to the content image here
    style_image = "./images/gogh.png"    # set the path to the style image here
    save_path = os.path.expanduser(os.path.join(output_dir, "%05d.png"))
    optimizer = tf.train.AdamOptimizer(lr)
    no_resize_style = False  # if True, use the style image without resizing (start-up becomes slower)
In [ ]:
config = Config()
os.makedirs(config.output_dir, exist_ok=True)
img_orig = load_image(config.original_image, [config.width, config.height])
img_style = load_image(config.style_image, [config.width, config.height] if not config.no_resize_style else None)
In [ ]:
model = generate_model(config.model)
# Loading a model takes a while.
# nin = generate_model('nin')
# vgg = generate_model('vgg')
In [ ]:
# Generate the image
generator = Generator(model, img_orig, img_style, config)
generator.generate(config)