Today we're going to train deep autoencoders, apply them to faces, and search for similar images in the learned latent space.
Our new test subjects are human faces from the LFW dataset.
Colab setup
In [ ]:
# if you're running in colab,
# 1. go to Runtime -> Change Runtime Type -> GPU
# 2. uncomment this:
#!wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/hw3_19/homework03/lfw_dataset.py -O lfw_dataset.py
In [1]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
EPOCHS = 100
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
LATENT_DIMENSION = 4
# fall back to CPU if no GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
In [2]:
from lfw_dataset import fetch_lfw_dataset
from sklearn.model_selection import train_test_split
X, attr = fetch_lfw_dataset(use_raw=True, dimx=38, dimy=38)
# channels-first layout, scaled to [0, 1)
X = X.transpose([0, 3, 1, 2]).astype('float32') / 256.0
img_shape = X.shape[1:]
X_train, X_test = train_test_split(X, test_size=0.1,random_state=42)
In [3]:
X_train_tensor = torch.from_numpy(X_train).type(torch.DoubleTensor)
X_test_tensor = torch.from_numpy(X_test).type(torch.DoubleTensor)
In [5]:
img_shape
Out[5]:
(3, 38, 38)
In [19]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(18, 7))
plt.suptitle('sample images')
for i in range(6):
    plt.subplot(2, 3, i + 1)
    plt.axis('off')
    plt.imshow(X[i].transpose([1, 2, 0]))
print("X shape:", X.shape)
print("attr shape:", attr.shape)
Principal Component Analysis is a popular dimensionality reduction method.
Under the hood, PCA attempts to decompose the object-feature matrix $X$ into two smaller matrices, $W$ and $\hat{W}$, minimizing the mean squared error:
$$\|(X W) \hat{W} - X\|^2_2 \to \min_{W, \hat{W}}$$
In geometric terms, we want to find the $d$ axes along which most of the variance occurs. The "natural" axes, if you wish.
PCA can also be seen as a special case of an autoencoder:
encoder: $X \to Dense(d \text{ units}) \to code$
decoder: $code \to Dense(m \text{ units}) \to \hat{X}$
Here Dense is a fully-connected layer with linear activation: $f(X) = W \cdot X + \vec{b}$.
Note: the bias term in those layers is responsible for "centering" the matrix, i.e. subtracting the mean.
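As a quick sanity check (our addition, not part of the assignment): closed-form PCA is the optimal linear compression in the MSE sense, so a well-trained linear autoencoder with the same code size should approach the value printed by this sketch (it assumes X_train from the cell above):
In [ ]:
# sanity-check sketch: scikit-learn's closed-form PCA sets the bar
# that the linear autoencoder below should approach after training
from sklearn.decomposition import PCA
flat = X_train.reshape(len(X_train), -1)   # flatten images to vectors
pca = PCA(n_components=32).fit(flat)
flat_rec = pca.inverse_transform(pca.transform(flat))
print("PCA reconstruction MSE:", ((flat - flat_rec) ** 2).mean())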
In [26]:
# this class corresponds to the view function and may be used as a reshape layer
class View(nn.Module):
    def __init__(self, *shape):
        super(View, self).__init__()
        self.shape = shape

    def forward(self, input):
        return input.view(*self.shape)
In [58]:
class pca_autoencoder(nn.Module):
    """
    Here we define a simple linear autoencoder as described above.
    We also flatten and un-flatten the data to be compatible with image shapes.
    """
    def __init__(self, code_size=32):
        super(pca_autoencoder, self).__init__()
        self.enc = nn.Sequential(
            View((-1, 38*38*3)),
            nn.Linear(38*38*3, code_size, bias=True)
        )
        self.dec = nn.Sequential(
            nn.Linear(code_size, 38*38*3, bias=True),
            View((-1, 3, 38, 38))
        )

    def batch_loss(self, batch):
        code = self.enc(batch)
        reconstruction = self.dec(code)
        return torch.mean((batch - reconstruction)**2)
In [23]:
def train(model, dataset, num_epoch=32):
    model.double()
    model.to(device)
    gd = optim.Adamax(model.parameters(), lr=0.002)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    losses = []
    for epoch in range(num_epoch):
        for batch in dataloader:
            gd.zero_grad()
            loss = model.batch_loss(batch.to(device))
            loss.backward()
            losses.append(loss.detach().cpu().numpy())
            gd.step()
        print("#%i, Train loss: %.7f" % (epoch + 1, np.mean(losses)), flush=True)
In [67]:
def visualize(img, model):
    """Draws the original image, its code, and the reconstruction."""
    code = model.enc(img.to(device).unsqueeze(0))
    reco = model.dec(code)
    plt.subplot(1, 3, 1)
    plt.title("Original")
    plt.imshow(img.cpu().numpy().transpose([1, 2, 0]).clip(0, 1))
    plt.subplot(1, 3, 2)
    plt.title("Code")
    plt.imshow(code.cpu().detach().numpy().reshape([code.shape[-1] // 2, -1]))
    plt.subplot(1, 3, 3)
    plt.title("Reconstructed")
    plt.imshow(reco[0].cpu().detach().numpy().transpose([1, 2, 0]).clip(0, 1))
    plt.show()
In [72]:
aenc = pca_autoencoder()
train(aenc, X_train_tensor, 40)
In [73]:
dataloader_test = DataLoader(X_test_tensor, batch_size=BATCH_SIZE, shuffle=True)
scores = []
for batch in dataloader_test:
    scores.append(aenc.batch_loss(batch.to(device)).data.cpu().numpy())
print(np.mean(scores))
In [74]:
for i in range(5):
    img = X_test_tensor[i]
    visualize(img, aenc)
PCA is neat but surely we can do better. This time we want you to build a deep autoencoder by... stacking more layers.
In particular, your encoder and decoder should be at least 3 layers deep each. You can use any nonlinearity you want and any number of hidden units in non-bottleneck layers provided you can actually afford training it.
A few sanity checks:
In [76]:
class pca_autoencoder_deep(nn.Module):
    def __init__(self, code_size=32):
        super(pca_autoencoder_deep, self).__init__()
        self.enc = nn.Sequential(
            nn.Conv2d(3, 32, 3),      # 38x38 -> 36x36
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),     # -> 18x18
            nn.Conv2d(32, 64, 3),     # -> 16x16
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),     # -> 8x8
            nn.Conv2d(64, 128, 3),    # -> 6x6
            View(-1, 128 * 6 * 6),
            nn.Linear(128 * 6 * 6, code_size)
        )
        self.dec = nn.Sequential(
            nn.Linear(code_size, 128 * 6 * 6),
            View(-1, 128, 6, 6),
            nn.ConvTranspose2d(128, 64, 3),           # 6x6 -> 8x8
            nn.ReLU(),
            nn.UpsamplingBilinear2d(scale_factor=2),  # -> 16x16
            nn.ConvTranspose2d(64, 32, 3),            # -> 18x18
            nn.ReLU(),
            nn.UpsamplingBilinear2d(scale_factor=2),  # -> 36x36
            nn.ConvTranspose2d(32, 3, 3),             # -> 38x38
            nn.Sigmoid(),
        )

    def batch_loss(self, batch):
        code = self.enc(batch)
        reconstruction = self.dec(code)
        return torch.mean((batch - reconstruction)**2)
In [78]:
aenc_deep = pca_autoencoder_deep()
train(aenc_deep, X_train_tensor, 50)
Training may take a while; that's okay.
Check the autoencoder shapes for different code sizes, and verify that the architecture of your encoder-decoder network is correct.
In [79]:
for code_size in [1, 8, 32, 128, 512, 1024]:
    help_tensor = next(iter(DataLoader(X_train_tensor, batch_size=BATCH_SIZE)))
    model = pca_autoencoder_deep(code_size).to(device)
    encoder_out = model.enc(help_tensor.float().to(device))
    decoder_out = model.dec(encoder_out)
    print("Testing code size %i" % code_size)
    assert encoder_out.shape[1:] == torch.Size([code_size]), "encoder must output a code of required size"
    assert decoder_out.shape[1:] == img_shape, "decoder must output an image of valid shape"
    assert len(list(model.dec.children())) >= 6, "decoder must contain at least 3 dense layers"
print("All tests passed!")
Hint: if you're getting an "Encoder layer is smaller than bottleneck" error, use code_size when defining intermediate layers.
For example, such a layer may have code_size*2 units, as in the sketch below.
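Here make_encoder is our own illustrative helper (not part of the assignment), reusing the View class defined earlier; the exact widths are arbitrary:
In [ ]:
# illustrative sketch: every hidden layer is at least as wide as the bottleneck
def make_encoder(code_size):
    return nn.Sequential(
        View((-1, 38*38*3)),
        nn.Linear(38*38*3, code_size * 8),
        nn.ReLU(),
        nn.Linear(code_size * 8, code_size * 2),
        nn.ReLU(),
        nn.Linear(code_size * 2, code_size),
    )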
Let's check your model's score. You should beat the value of 0.005.
In [80]:
dataloader_test = DataLoader(X_test_tensor, batch_size=BATCH_SIZE, shuffle=True)
scores = []
for batch in dataloader_test:
    scores.append(aenc_deep.batch_loss(batch.to(device)).data.cpu().numpy())
    encoder_out = aenc_deep.enc(batch.to(device))
reconstruction_mse = np.mean(scores)
assert reconstruction_mse <= 0.005, "Compression is too lossy. See tips below."
assert len(encoder_out.shape) == 2 and encoder_out.shape[1] == 32, "Make sure encoder has code_size units"
print("Final MSE:", reconstruction_mse)
for i in range(5):
    img = X_test_tensor[i]
    visualize(img, aenc_deep)
Tips: if you keep getting a "Compression is too lossy" error, there are a few things you might try:
Convergence. Some architectures need way more than 32 epochs to converge. The loss may fluctuate a lot, but eventually it gets good enough to pass. You may train your network for as long as you want.
Complexity. If you already have, like, 152 layers and are still not passing the threshold, you may wish to start from something simpler instead and proceed in small incremental steps.
Architecture. You can use any combination of layers (including convolutions, normalization, etc.) as long as the encoder output only stores 32 numbers per training object.
A cunning learner could circumvent this last limitation with some manual encoding strategy, but that is strongly discouraged.
Let's now turn our model into a denoising autoencoder.
We'll keep your model architecture, but change the way it trains. In particular, we'll randomly corrupt its input data during training.
There are many strategies for applying noise. We'll implement two popular ones: adding gaussian noise and using dropout (a sketch of the dropout variant follows the gaussian one below).
In [99]:
def apply_gaussian_noise(X, sigma=0.1):
    """
    Adds noise from a normal distribution with standard deviation sigma.
    :param X: image array of shape [batch, 3, height, width]
    """
    noise = np.random.normal(0, sigma, X.shape)
    return X + noise
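The dropout-style corruption mentioned above never gets implemented in this notebook; a minimal sketch might look like this (apply_dropout_noise and the parameter p are our own names):
In [ ]:
def apply_dropout_noise(X, p=0.1):
    """
    Zeroes each pixel independently with probability p
    (dropout-style corruption; a sketch, not used below).
    :param X: image array of shape [batch, 3, height, width]
    """
    mask = np.random.binomial(1, 1 - p, X.shape)
    return X * mask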
Noise tests
In [100]:
theoretical_std = (X[:100].std()**2 + 0.5**2)**.5
our_std = apply_gaussian_noise(X[:100], sigma=0.5).std()
assert abs(theoretical_std - our_std) < 0.01, "Standard deviation does not match its required value. Make sure you use sigma as std."
assert abs(apply_gaussian_noise(X[:100], sigma=0.5).mean() - X[:100].mean()) < 0.01, "Mean has changed. Please add zero-mean noise"
In [102]:
plt.figure(figsize=(17,5))
plt.subplot(1,4,1)
plt.imshow(X[0].transpose([1,2,0]))
plt.subplot(1,4,2)
plt.imshow(apply_gaussian_noise(X[:1],sigma=0.01)[0].transpose([1,2,0]).clip(0, 1))
plt.subplot(1,4,3)
plt.imshow(apply_gaussian_noise(X[:1],sigma=0.1)[0].transpose([1,2,0]).clip(0, 1))
plt.subplot(1,4,4)
plt.imshow(apply_gaussian_noise(X[:1],sigma=0.5)[0].transpose([1,2,0]).clip(0, 1))
Out[102]:
In [106]:
def train_noise(model, dataset, num_epoch=50, sigma=0.1):
    """Denoising training loop: corrupt each batch with fresh gaussian
    noise, but compute the reconstruction loss against the clean batch."""
    model.double()
    model.to(device)
    gd = optim.Adamax(model.parameters(), lr=0.002)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    losses = []
    for epoch in range(num_epoch):
        for batch in dataloader:
            gd.zero_grad()
            batch = batch.to(device)
            noisy = batch + torch.randn_like(batch) * sigma
            reconstruction = model.dec(model.enc(noisy))
            loss = torch.mean((batch - reconstruction)**2)
            loss.backward()
            losses.append(loss.detach().cpu().numpy())
            gd.step()
        print("#%i, Train loss: %.7f" % (epoch + 1, np.mean(losses)), flush=True)
In [107]:
X_train_noise = apply_gaussian_noise(X_train)
X_test_noise = apply_gaussian_noise(X_test)
In [109]:
# noisy copies; the test tensor is used for evaluation below
# (train_noise adds fresh noise on the fly, so the noisy train tensor is optional)
X_train_tensor_n = torch.from_numpy(X_train_noise).type(torch.DoubleTensor)
X_test_tensor_n = torch.from_numpy(X_test_noise).type(torch.DoubleTensor)
In [112]:
aenc = pca_autoencoder()
train_noise(aenc, X_train_tensor, 75)
Note: if it hasn't converged yet, increase the number of iterations.
Bonus: replace the gaussian noise with masking random rectangles on the image; a possible starting point is sketched below.
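Sketched under our own assumptions (apply_random_rectangles and max_size are our names; untested):
In [ ]:
def apply_random_rectangles(X, max_size=12):
    """Zeroes out one random rectangle per image; X is [batch, 3, height, width]."""
    X = X.copy()
    for img in X:
        h, w = np.random.randint(1, max_size, size=2)
        y = np.random.randint(0, img.shape[1] - h)
        x = np.random.randint(0, img.shape[2] - w)
        img[:, y:y + h, x:x + w] = 0.0
    return X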
Let's evaluate!
In [113]:
dataloader_test = DataLoader(X_test_tensor_n, batch_size=BATCH_SIZE, shuffle=True)
scores = []
for batch in dataloader_test:
    scores.append(aenc.batch_loss(batch.to(device)).data.cpu().numpy())
    encoder_out = aenc.enc(batch.to(device))
reconstruction_mse = np.mean(scores)
print("Final MSE:", reconstruction_mse)
for i in range(5):
    img = X_test_tensor_n[i]
    visualize(img, aenc)
So we've just trained a network that converts an image into itself, imperfectly. This task is not that useful in and of itself, but it has a number of awesome side effects. Let's see them in action.
The first thing we can do is image retrieval, a.k.a. image search: given an image, find similar images in latent space.
To speed up the retrieval process, we shall use Locality-Sensitive Hashing on top of the encoded vectors. We'll use scikit-learn's implementation for simplicity. In a practical scenario, you may want to use specialized libraries for better performance and customization.
In [140]:
# encode all training images into codes
codes = aenc.enc(X_train_tensor.to(device))
In [141]:
assert codes.shape[0] == X_train_tensor.shape[0]
In [142]:
from sklearn.neighbors import LSHForest
lshf = LSHForest(n_estimators=50).fit(codes.detach().cpu().numpy())
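Note: LSHForest was deprecated in scikit-learn 0.19 and removed in 0.21, so the import above fails on modern versions. Exact search with NearestNeighbors exposes the same kneighbors API and works as a drop-in replacement here (exact rather than approximate search):
In [ ]:
# fallback for modern scikit-learn versions
from sklearn.neighbors import NearestNeighbors
lshf = NearestNeighbors().fit(codes.detach().cpu().numpy())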
In [144]:
images = torch.from_numpy(X_train).type(torch.DoubleTensor)
In [148]:
def get_similar(image, n_neighbors=5):
    assert len(image.shape) == 3, "image must be a single image of shape [3, height, width]"
    code = aenc.enc(image.to(device)).detach().cpu().numpy()
    (distances,), (idx,) = lshf.kneighbors(code, n_neighbors)
    return distances, images[idx]
In [151]:
def show_similar(image):
    distances, neighbors = get_similar(image, n_neighbors=11)
    plt.figure(figsize=[17, 8])
    plt.subplot(3, 4, 1)
    plt.axis('off')
    plt.imshow(image.cpu().numpy().transpose([1, 2, 0]))
    plt.title("Original image")
    for i in range(11):
        plt.subplot(3, 4, i + 2)
        plt.axis('off')
        plt.imshow(neighbors[i].cpu().numpy().transpose([1, 2, 0]))
        plt.title("Dist=%.3f" % distances[i])
    plt.show()
In [150]:
#smiles
show_similar(X_test_tensor[2])
In [152]:
#ethnicity
show_similar(X_test_tensor[500])
In [153]:
#glasses
show_similar(X_test_tensor[66])
Here we take two images, encode both, and obtain intermediate images by decoding a mixture of their codes:
$$code_{mix} = a_1 \cdot code_1 + a_2 \cdot code_2, \qquad a_1 + a_2 = 1$$
In [169]:
for _ in range(5):
    idx1, idx2 = np.random.randint(0, X_test_tensor.shape[0], 2)
    image1, image2 = X_test_tensor[idx1], X_test_tensor[idx2]
    code1, code2 = aenc.enc(image1.to(device)), aenc.enc(image2.to(device))
    plt.figure(figsize=[16, 4])
    for i, a in enumerate(np.linspace(0, 1, num=7)):
        output_code = code1 * (1 - a) + code2 * a
        output_image = aenc.dec(output_code)[0]
        plt.subplot(1, 7, i + 1)
        plt.axis("off")
        plt.imshow(output_image.cpu().detach().numpy().transpose([1, 2, 0]).clip(0, 1))
        plt.title("a=%.2f" % a)
    plt.show()
Of course, there's a lot more you can do with autoencoders.
If you want to generate images from scratch, however, we recommend our honors-track seminar on generative adversarial networks.