The goal of this project is to discover aesthetically interesting digital video feedback processes by incorporating learned features into a hand-constructed feedback process.
Consider a video feedback process defined by the mapping from images to images $x_t = \Delta_\phi(x_{t-1})$, where $\Delta$ is a transition function, $\phi$ is a parameterization which may be spatially varying or interactively controlled, and $x_t$ is the image at time step $t$.
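For intuition, a classic $\Delta$ composes small geometric and color transformations, so that iterating it produces the familiar swirling feedback patterns. A minimal sketch (the particular transforms and parameters here are my own illustration, not part of the project):
In [ ]:
import numpy as np
import scipy.ndimage

def delta(x, phi):
    """One illustrative transition function: rotate, zoom in, and dim slightly."""
    x = scipy.ndimage.rotate(x, phi['angle'], axes=(0, 1), reshape=False, mode='reflect')
    h, w = x.shape[:2]
    z = scipy.ndimage.zoom(x, (phi['zoom'], phi['zoom'], 1), order=1)
    #center-crop back to the original size so the map can be iterated
    r, c = (z.shape[0] - h) // 2, (z.shape[1] - w) // 2
    return np.clip(phi['gain'] * z[r:r+h, c:c+w], 0, 1)

x = np.random.rand(64, 64, 3)
for t in range(100):
    x = delta(x, {'angle': 2., 'zoom': 1.05, 'gain': .98})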
Additionally, suppose we have a deep autoencoder $\gamma$ for images: $$h^{\ell+1} = \gamma_\ell(h^\ell)$$ $$h^{\ell} \approx \gamma_\ell^{-1}(h^{\ell+1})$$ $$h^0 = x$$
Combining these two concepts, we can define a new feedback process where position in the feature hierarchy acts like another spatial dimension: $$h_t^\ell = \Delta_\phi( h_{t-1}^\ell, \gamma_{\ell-1}(h_{t-1}^{\ell-1}), \gamma_\ell^{-1}(h_{t-1}^{\ell+1}) )$$
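In code, one time step of this process might look like the following sketch, where encode[l] and decode[l] play the roles of $\gamma_\ell$ and $\gamma_\ell^{-1}$, and delta blends the lateral, bottom-up, and top-down signals (a generalization of the single-image $\Delta$ above; all names here are hypothetical):
In [ ]:
def step(h, encode, decode, delta, phi):
    """Advance every level of the feature hierarchy by one time step.
    h is a list of feature maps; h[0] is the image itself."""
    levels = len(h)
    h_next = []
    for l in range(levels):
        bottom_up = encode[l-1](h[l-1]) if l > 0 else None
        top_down = decode[l](h[l+1]) if l < levels - 1 else None
        h_next.append(delta(h[l], bottom_up, top_down, phi))
    return h_next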
The goal then is to learn a deep autoencoder which represents abstract image features and admits layer-wise encoding and decoding as above. I propose a convolutional pooling autoencoder based on the convolutional autoencoders of Masci et al. and the upsampling layers of Long et al.
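For reference, one stage of such an autoencoder can be sketched with pycaffe's NetSpec. The layer names mirror the ones probed below (encode1, pool1, decode1), but the filter counts and sizes here are illustrative rather than the exact model definitions:
In [ ]:
import caffe
from caffe import layers as L, params as P

#one encode / pool / upsample / decode stage with a reconstruction loss
n = caffe.NetSpec()
n.data = L.Input(shape=dict(dim=[100, 3, 32, 32]))
n.encode1 = L.Convolution(n.data, num_output=12, kernel_size=5, pad=2,
                          weight_filler=dict(type='xavier'))
n.encode1neuron = L.TanH(n.encode1)
n.pool1 = L.Pooling(n.encode1neuron, kernel_size=2, stride=2, pool=P.Pooling.MAX)
#learned 2x upsampling, following Long et al.
n.upsample1 = L.Deconvolution(n.pool1, convolution_param=dict(
    num_output=12, kernel_size=2, stride=2, weight_filler=dict(type='xavier')))
n.decode1 = L.Convolution(n.upsample1, num_output=3, kernel_size=5, pad=2,
                          weight_filler=dict(type='xavier'))
n.decode1neuron = L.TanH(n.decode1)
n.loss = L.EuclideanLoss(n.decode1neuron, n.data)
print(n.to_proto())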
Below are a number of experiments training pooled convolutional autoencoders on the CIFAR-10 dataset using Caffe. The Caffe model definitions are available on my GitHub.
In [3]:
#get caffe and pycaffe set up
import numpy as np
import matplotlib.pyplot as plt
import scipy.ndimage
%matplotlib inline
#assuming feature-feedback repo and caffe root are in the same directory
caffe_root = '../../caffe/'
import sys
sys.path.insert(0, caffe_root+'python')
import caffe
from caffe.proto import caffe_pb2
#I have compiled caffe for CPU only (GPU mode requires an NVIDIA card)
caffe.set_mode_cpu()
In [4]:
#load the CIFAR mean into a numpy array
blob = caffe_pb2.BlobProto()
data = open('../../caffe/examples/cifar10/mean.binaryproto', 'rb').read()
blob.ParseFromString(data)
mean = caffe.io.blobproto_to_array(blob)[0].transpose([1,2,0])/256
In [37]:
def get_reconstructions(net, mean, n, compare=0):
    inputs = np.hstack([np.copy(net.blobs['data'].data[i]).transpose([1,2,0]) + mean for i in range(n)])
    outputs = np.hstack([np.copy(net.blobs['decode1neuron'].data[i]).transpose([1,2,0]) + mean for i in range(n)])
    #clamp the reconstruction to [0,1]
    #even with tanh activation, outputs can be out of bounds once the mean is added back
    np.clip(outputs, 0, 1, outputs)
    #compare to cubic interpolation through the intermediate spatial resolution
    #this is a good baseline for how well spatial information is stored and
    #recovered by the convolutional layers
    if compare > 0:
        comparisons = np.dsplit(np.copy(inputs), inputs.shape[2])
        comparisons = [scipy.ndimage.zoom(np.squeeze(c), 1./compare, order=3) for c in comparisons]
        comparisons = [scipy.ndimage.zoom(c, compare, order=3) for c in comparisons]
        comparisons = np.dstack(comparisons)
        np.clip(comparisons, 0, 1, comparisons)
        return (inputs, outputs, comparisons)
    return (inputs, outputs)

def vis_reconstructions(rec):
    disp = np.vstack(rec)
    plt.figure(figsize=(10,10))
    plt.imshow(disp, interpolation='none')
In [64]:
def get_filters(net, layer='encode1'):
    filters = np.copy(net.params[layer][0].data).transpose([0,2,3,1])
    biases = np.copy(net.params[layer][1].data)
    print(biases)
    return filters

def vis_filters(filters, rows):
    #normalize (without mutating the argument), preserving 0 = 50% gray
    filters = filters/(2*abs(filters).max()) + .5
    disp = np.hstack([np.pad(f, [(1,1),(1,1),(0,0)], 'constant', constant_values=.5) for f in filters])
    disp = np.vstack(np.hsplit(disp, rows))
    return disp
In [35]:
def get_responses(net, layer, filts, n):
    reps = np.hstack([net.blobs[layer].data[i].transpose([1,2,0]) for i in range(n)])
    #normalize, preserving 0 = 50% gray
    reps /= 2*abs(reps).max()
    reps += .5
    reps = np.vstack(np.dsplit(reps, filts))
    return reps.squeeze()

def vis_responses(reps):
    plt.figure(figsize=(10,10))
    plt.imshow(reps, interpolation='none', cmap='coolwarm')
In [ ]:
#run this cell to solve the model defined in the solver_file
solver_file = 'autoencoder-0-solver.prototxt'
solver = caffe.get_solver(solver_file);
solver.solve();
In [25]:
#load the model trained by the previous cell
#(and saved elsewhere in the repo) and set it up on test data
model_def_file = 'autoencoder-0.prototxt'
model_file = '../bin/cifar-tanh-20epoch-unregularized.caffemodel'
net = caffe.Net(model_def_file, model_file, caffe.TEST)
#run a batch
net.forward()
In [38]:
rec = get_reconstructions(net, mean, 8, compare=2)
vis_reconstructions(rec)
In [154]:
filters = get_filters(net)
disp = vis_filters(filters, 3)
plt.imshow(disp, interpolation='none')
In [156]:
reps = get_responses(net, 'pool1', 12, 8)
vis_responses(reps)
In [3]:
solver_file = 'autoencoder-1-solver.prototxt'
solver = caffe.get_solver(solver_file)
solver.solve()
In [41]:
model_def_file = 'autoencoder-1.prototxt'
model_file = '../bin/cifar-tanh-20epoch-squeezing.caffemodel'
net = caffe.Net(model_def_file, model_file, caffe.TEST)
#run a batch
net.forward()
In [43]:
rec = get_reconstructions(net, mean, 8, compare=2)
vis_reconstructions(rec)
In [42]:
filters = get_filters(net)
disp = vis_filters(filters, 2)
plt.imshow(disp, interpolation='none')
These filters appear to be learning color gradients in a subtractive color space.
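One rough check of this reading, using the filters array above: if the filters implement a subtractive color code, the channel weights within a filter should be strongly anticorrelated.
In [ ]:
#within each filter, are the color channels anticorrelated,
#as we would expect from a subtractive color code?
for i, f in enumerate(filters):
    r, g, b = (f[..., c].ravel() for c in range(3))
    print(i, np.corrcoef(r, g)[0, 1], np.corrcoef(g, b)[0, 1], np.corrcoef(r, b)[0, 1])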
In [44]:
reps = get_responses(net, 'pool1', 6, 8)
vis_responses(reps)
In [ ]:
solver_file = 'autoencoder-2-solver.prototxt'
solver = caffe.get_solver(solver_file)
solver.solve('autoencoder-2_iter_20000.solverstate')
In [45]:
model_def_file = 'autoencoder-2.prototxt'
#model_file = '../bin/cifar-tanh-20epoch-squeezing-pool3.caffemodel'
model_file = 'autoencoder-2_iter_20000.caffemodel'
net = caffe.Net(model_def_file, model_file, caffe.TEST)
#run a batch
net.forward()
In [46]:
rec = get_reconstructions(net, mean, 8, compare=4)
vis_reconstructions(rec)
In [47]:
filters = get_filters(net)
disp = vis_filters(filters, 6)
plt.imshow(disp, interpolation='none')
In [48]:
reps = get_responses(net, 'pool1', 16, 8)
vis_responses(reps)
In [ ]:
solver_file = 'autoencoder-6-solver.prototxt'
solver = caffe.get_solver(solver_file)
solver.solve('autoencoder-6_iter_10000.solverstate')
In [49]:
model_def_file = 'autoencoder-6.prototxt'
model_file = 'autoencoder-6_iter_20000.caffemodel'
net = caffe.Net(model_def_file, model_file, caffe.TEST)
#run a batch
net.forward()
In [54]:
rec = get_reconstructions(net, mean, 8, compare=2)
vis_reconstructions(rec)
In [51]:
filters = get_filters(net)
disp = vis_filters(filters, 3)
plt.imshow(disp, interpolation='none')
Interesting: these look like 3x3 filters with a random fringe. Curiously, it learned better than the first architecture above, even though the extra pixels appear to be wasted. Perhaps it got a better random initialization, or the filter noisiness acts as a kind of regularization. It may have learned small filters because the reconstruction filter size was too small. Let's bump that up too:
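A quick way to quantify the fringe impression, using the filters array from the cell above: compare the squared-weight energy in the central 3x3 of each filter to the total. If the filters are effectively 3x3, this fraction should be close to 1.
In [ ]:
#fraction of total squared weight in the central 3x3 of the filters
k = filters.shape[1]
c = (k - 3) // 2
core = filters[:, c:c+3, c:c+3, :]
print((core ** 2).sum() / (filters ** 2).sum())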
In [9]:
solver_file = 'autoencoder-7-solver.prototxt'
solver = caffe.get_solver(solver_file)
solver.solve('autoencoder-7_iter_10000.solverstate')
In [52]:
model_def_file = 'autoencoder-7.prototxt'
model_file = 'autoencoder-7_iter_40000.caffemodel'
net = caffe.Net(model_def_file, model_file, caffe.TEST)
#run a batch
net.forward()
In [55]:
rec = get_reconstructions(net, mean, 8, compare=2)
vis_reconstructions(rec)
In [12]:
filters = get_filters(net)
disp = vis_filters(filters, 3)
plt.imshow(disp, interpolation='none')
The more expressive decoder did reduce error, and visual fidelity is now very close to perfect. It did not change the noisy-fringed character of the learned filters. The central filters mostly come in pairs which appear to be mirrors, rotations, and/or color inverses of each other. Neat!
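To test the pairing impression, one could correlate each filter against flips and negations of every other (rotations left out for brevity). A rough sketch over the filters array; a high correlation flags a candidate pair:
In [ ]:
def best_match(filters, i):
    """Find the filter most correlated with filter i, allowing flips and negation."""
    f = filters[i].ravel()
    f = (f - f.mean()) / f.std()
    best = (0., None)
    for j in range(len(filters)):
        if j == i:
            continue
        for g in (filters[j], filters[j, ::-1], filters[j, :, ::-1], -filters[j]):
            g = g.ravel()
            g = (g - g.mean()) / g.std()
            r = abs(np.dot(f, g)) / f.size
            if r > best[0]:
                best = (r, j)
    return best

for i in range(len(filters)):
    print(i, best_match(filters, i))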
In [66]:
filters = get_filters(net, 'decode1')
disp = vis_filters(filters, 3)
plt.imshow(disp, interpolation='none')
In [58]:
reps = get_responses(net, 'pool1', 12, 8)
vis_responses(reps)
We could keep going to 7x7 encoders and 8x8 decoders, but at some point I expect larger filters to have trouble with CIFAR, since the images are so tiny. With 7x7 filters, roughly a third of the convolution windows on a full 32x32 image overlap the padding, and over half do on a pooled 16x16 map.
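A quick sanity check of those fractions (assuming 'same'-style zero padding):
In [ ]:
#fraction of k x k convolution windows that overlap the zero padding
#when applied with 'same' padding to an n x n feature map
def pad_fraction(n, k):
    interior = max(n - (k - 1), 0)
    return 1. - float(interior * interior) / (n * n)

print(pad_fraction(32, 7))  #~0.34 on the full CIFAR images
print(pad_fraction(16, 7))  #~0.61 after one round of 2x2 pooling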
In [ ]:
solver_file = 'autoencoder-8-solver.prototxt'
solver = caffe.get_solver(solver_file)
#initialize the first layer with previously trained weights
#first let's try stacking with the lower weights frozen
pre_net = caffe.Net('autoencoder-7.prototxt', 'autoencoder-7_iter_40000.caffemodel', caffe.TEST)
for layer in ['encode1', 'decode1']:
    solver.net.params[layer][0].data[:] = pre_net.params[layer][0].data
    solver.net.params[layer][1].data[:] = pre_net.params[layer][1].data
solver.solve()
In [59]:
model_def_file = 'autoencoder-8.prototxt'
#model_file = '../bin/cifar-tanh-20epoch-2layer.caffemodel'
model_file = 'autoencoder-8_iter_40000.caffemodel'
net = caffe.Net(model_def_file, model_file, caffe.TEST)
#run a batch
net.forward()
In [60]:
rec = get_reconstructions(net, mean, 8, compare=4)
vis_reconstructions(rec)
In [2]:
solver_file = 'autoencoder-9-solver.prototxt'
solver = caffe.get_solver(solver_file)
#initialize the first layer with previously trained weights
#this time bring over all the parameters
pre_net = caffe.Net('autoencoder-8.prototxt', 'autoencoder-8_iter_40000.caffemodel', caffe.TEST)
for layer in ['encode1', 'decode1', 'encode2', 'decode2']:
    solver.net.params[layer][0].data[:] = pre_net.params[layer][0].data
    solver.net.params[layer][1].data[:] = pre_net.params[layer][1].data
solver.solve()
In [61]:
model_def_file = 'autoencoder-9.prototxt'
model_file = '../bin/cifar-tanh-60epoch-2layer-finetuned-dualobjective.caffemodel'
net = caffe.Net(model_def_file, model_file, caffe.TEST)
#run a batch
net.forward()
In [62]:
rec = get_reconstructions(net, mean, 8, compare=4)
vis_reconstructions(rec)
Fine-tuning reduced both parts of the loss, but the result still looks much worse than the single-layer model.
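To attach a number to that impression, the tuple returned by get_reconstructions makes a quick per-pixel comparison easy (a rough gauge in display space; the training loss itself is computed on mean-subtracted data):
In [ ]:
inputs, outputs, comparisons = rec
print('autoencoder reconstruction MSE:', np.mean((outputs - inputs) ** 2))
print('interpolation baseline MSE:    ', np.mean((comparisons - inputs) ** 2))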
In [63]:
filters = get_filters(net)
disp = vis_filters(filters, 3)
plt.imshow(disp, interpolation='none')
Fine-tuning on the first layer appears to have corrupted the nice filters we had before.
In [9]:
#map triples of filters to colors
reps = get_responses(net, 'pool2', 16, 8)
vis_responses(reps)