In [2]:
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
%matplotlib inline
Again we need functions for shuffling the data and calculating classification errors.
In [3]:
### function for shuffling the data and labels
def shuffle_in_unison(features, labels):
rng_state = np.random.get_state()
np.random.shuffle(features)
np.random.set_state(rng_state)
np.random.shuffle(labels)
### calculate classification errors
# return a percentage: (number misclassified)/(total number of datapoints)
def calc_classification_error(predictions, class_labels):
n = predictions.size
num_of_errors = 0.
for idx in xrange(n):
if (predictions[idx] >= 0.5 and class_labels[idx]==0) or (predictions[idx] < 0.5 and class_labels[idx]==1):
num_of_errors += 1
return num_of_errors/n
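As a quick sanity check of calc_classification_error (the numbers below are made up for illustration and are not part of the original notebook), predictions are thresholded at 0.5:
In [ ]:
# hypothetical example: the last two predictions land on the wrong side of 0.5
preds = np.array([0.9, 0.2, 0.6, 0.4])
labels = np.array([1, 0, 0, 1])
calc_classification_error(preds, labels)   # expect 0.5 (2 errors out of 4)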
We are going to use the MNIST dataset throughout this session. Let's load the data...
In [16]:
# (alternative) load a small 100-example CSV subset of MNIST; the next cell loads the full dataset instead
mnist = pd.read_csv('../data/mnist_train_100.csv', header=None)
In [22]:
# load the 70,000 x 784 matrix
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original').data
# shuffle, reduce to 5k instances, and rescale pixel values to [0, 1]
np.random.shuffle(mnist)
mnist = mnist[:5000,:]/255.
print "Dataset size: %d x %d"%(mnist.shape)
# subplot containing an example image
ax1 = plt.subplot(1,2,1)
digit = mnist[1,:]
ax1.imshow(np.reshape(digit, (28, 28)), cmap='Greys_r')
# subplot containing another example image
ax2 = plt.subplot(1,2,2)
digit = mnist[2,:]
ax2.imshow(np.reshape(digit, (28, 28)), cmap='Greys_r')
plt.show()
Recall the Principal Component Analysis model we covered in the last session. The goal of PCA is, for a given datapoint $\mathbf{x}_{i}$, to find a lower-dimensional representation $\mathbf{h}_{i}$ such that $\mathbf{x}_{i}$ can be 'predicted' from $\mathbf{h}_{i}$ using a linear transformation. The loss function can be written as: $$ \mathcal{L}_{\text{PCA}} = \sum_{i=1}^{N} (\mathbf{x}_{i} - \mathbf{x}_{i}\mathbf{W}\mathbf{W}^{T})^{2}.$$
Instead of using the closed-form solution we discussed in the previous session, here we'll use gradient descent. The reason for doing this will become clear later in the session, as we move on to cover a non-linear version of PCA. To run gradient descent, we of course need the derivative of the loss w.r.t. the parameters, which in this case is the transformation matrix $\mathbf{W}$. Writing $\mathbf{h}_{i} = \mathbf{x}_{i}\mathbf{W}$ for the hidden representation and $\mathbf{\tilde x}_{i} = \mathbf{h}_{i}\mathbf{W}^{T}$ for the reconstruction, the gradient is:
$$ \nabla_{\mathbf{W}} \mathcal{L}_{\text{PCA}} = -4\sum_{i=1}^{N} (\mathbf{x}_{i} - \mathbf{\tilde x}_{i})^{T}\mathbf{h}_{i} $$
Now let's run our stochastic gradient PCA on the MNIST dataset...
In [ ]:
# set the random number generator for reproducibility
np.random.seed(49)
# define the dimensionality of the hidden rep.
n_components = 200
# Randomly initialize the Weight matrix
W = np.random.uniform(low=-4 * np.sqrt(6. / (n_components + mnist.shape[1])),\
high=4 * np.sqrt(6. / (n_components + mnist.shape[1])), size=(mnist.shape[1], n_components))
# Initialize the step-size
alpha = 1e-3
# Initialize the gradient
grad = np.infty
# Set the tolerance
tol = 1e-8
# Initialize error
old_error = 0
error = [np.infty]
batch_size = 250
### train with stochastic gradients
start_time = time.time()
iter_idx = 1
# loop until gradient updates become small
while (alpha*np.linalg.norm(grad) > tol) and (iter_idx < 300):
for batch_idx in xrange(mnist.shape[0]/batch_size):
x = mnist[batch_idx*batch_size:(batch_idx+1)*batch_size, :]
h = np.dot(x, W)
x_recon = np.dot(h, W.T)
# compute gradient
diff = x - x_recon
grad = (-4./batch_size)*np.dot(diff.T, h)
# update parameters
W = W - alpha*grad
# track the error
if iter_idx % 25 == 0:
old_error = error[-1]
diff = mnist - np.dot(np.dot(mnist, W), W.T)
recon_error = np.mean( np.sum(diff**2, 1) )
error.append(recon_error)
print "Epoch %d, Reconstruction Error: %.3f" %(iter_idx, recon_error)
iter_idx += 1
end_time = time.time()
print
print "Training ended after %i iterations, taking a total of %.2f seconds." %(iter_idx, end_time-start_time)
print "Final Reconstruction Error: %.2f" %(error[-1])
reduced_mnist = np.dot(mnist, W)
print "Dataset is now of size: %d x %d"%(reduced_mnist.shape)
Let's visualize a reconstruction...
In [ ]:
img_idx = 2
reconstructed_img = np.dot(reduced_mnist[img_idx,:], W.T)
original_img = mnist[img_idx,:]
# subplot for original image
ax1 = plt.subplot(1,2,1)
ax1.imshow(np.reshape(original_img, (28, 28)), cmap='Greys_r')
ax1.set_title("Original Painting")
# subplot for reconstruction
ax2 = plt.subplot(1,2,2)
ax2.imshow(np.reshape(reconstructed_img, (28, 28)), cmap='Greys_r')
ax2.set_title("Reconstruction")
plt.show()
We can again visualize the transformation matrix $\mathbf{W}^{T}$. Its rows act as 'filters' or 'feature detectors'. However, without the orthogonality constraint, we've lost the identifiability of the components...
In [ ]:
# two components to show
comp1 = 0
comp2 = 150
# subplot
ax1 = plt.subplot(1,2,1)
filter1 = W[:, comp1]
ax1.imshow(np.reshape(filter1, (28, 28)), cmap='Greys_r')
# subplot
ax2 = plt.subplot(1,2,2)
filter2 = W[:, comp2]
ax2.imshow(np.reshape(filter2, (28, 28)), cmap='Greys_r')
plt.show()
In the last session (and section) we learned about Principal Component Analysis, a technique that finds a linear projection that reduces the dimensionality of the data while preserving its variance. We looked at it as a form of unsupervised linear regression, where we predict the data itself instead of some associated value (i.e. a label). In this section, we will move on to a nonlinear dimensionality reduction technique called an Autoencoder and derive its optimization procedure.
Recall that PCA consists of a linear projection step followed by application of the inverse projection. An Autoencoder is the same model but with a non-linear transformation applied to the hidden representation. To reiterate, our goal is: for a datapoint $\mathbf{x}_{i}$, find a lower-dimensional representation $\mathbf{h}_{i}$ such that $\mathbf{x}_{i}$ can be 'predicted' from $\mathbf{h}_{i}$---but this time, not necessarily with a linear transformation. In math, this statement can be written as $$\mathbf{\tilde x}_{i} = \mathbf{h}_{i} \mathbf{W}^{T} \text{ where } \mathbf{h}_{i} = f(\mathbf{x}_{i} \mathbf{W}). $$ $\mathbf{W}$ is a $D \times K$ matrix of parameters that need to be learned--much like the $\beta$ vector in regression models. $D$ is the dimensionality of the original data, and $K$ is the dimensionality of the compressed representation $\mathbf{h}_{i}$. Lastly, we have the new component, the transformation function $f$. There are many possible functions to choose for $f$; we'll use a familiar one, the logistic function $$f(z) = \frac{1}{1+\exp(-z)}.$$ The graphic below depicts the autoencoder's computation path:
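As a code-level complement to that computation path, here is a minimal forward-pass sketch (the array sizes and the names ending in _toy are arbitrary, chosen only for illustration; the logistic helper is redefined inline so the cell stands alone):
In [ ]:
# minimal autoencoder forward pass on toy data
def _logistic(z):
    return 1./(1 + np.exp(-z))
x_toy = np.random.normal(size=(5, 3))    # N=5 datapoints, D=3 features
W_toy = np.random.normal(size=(3, 2))    # D x K transformation matrix, K=2
a_toy = np.dot(x_toy, W_toy)             # pre-activations x_i W, shape N x K
h_toy = _logistic(a_toy)                 # hidden representation h_i = f(x_i W)
x_toy_recon = np.dot(h_toy, W_toy.T)     # reconstruction h_i W^T, shape N x D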
Having defined the Autoencoder model, we look to write learning as an optimization process. Recall that we wish to make a reconstruction of the data, denoted $\mathbf{\tilde x}_{i}$, as close as possible to the original input: $$\mathcal{L}_{\text{AE}} = \sum_{i=1}^{N} (\mathbf{x}_{i} - \mathbf{\tilde x}_{i})^{2}.$$ We can make a substitution for $\mathbf{\tilde x}_{i}$ from the equation above: $$ = \sum_{i=1}^{N} (\mathbf{x}_{i} - \mathbf{h}_{i}\mathbf{W}^{T})^{2}.$$ And we can make another substitution for $\mathbf{h}_{i}$, bringing us to the final form of the loss function: $$ = \sum_{i=1}^{N} (\mathbf{x}_{i} - f(\mathbf{x}_{i}\mathbf{W})\mathbf{W}^{T})^{2}.$$
Derive an expression for the gradient: $$ \nabla_{W}\mathcal{L}_{\text{AE}} = ? $$ Take $f$ to be the logistic function, which has a derivative of $f'(z) = f(z)(1-f(z))$. Those functions are provided for you below.
In [ ]:
def logistic(x):
return 1./(1+np.exp(-x))
def logistic_derivative(x):
z = logistic(x)
return np.multiply(z, 1-z)
def compute_gradient(x, x_recon, h, a):
# parameters:
# x: the original data
# x_recon: the reconstruction of x
# h: the hidden units (after application of f)
# a: the pre-activations (before the application of f)
return #TODO
np.random.seed(39)
# dummy variables for testing
x = np.random.normal(size=(5,3))
x_recon = x + np.random.normal(size=x.shape)
W = np.random.normal(size=(x.shape[1], 2))
a = np.dot(x, W)
h = logistic(a)
compute_gradient(x, x_recon, h, a)
Should print
array([[ 4.70101821, 2.26494039],
[ 2.86585042, 0.0731302 ],
[ 0.79869215, 0.15570277]])
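If you get stuck, the following is one reference implementation that is consistent with the gradient used in the Denoising Autoencoder cell later in the notebook; treat it as a sketch to compare against rather than the unique correct answer. Note that, like the cell above, it relies on W and logistic_derivative being defined in the notebook's global scope.
In [ ]:
def compute_gradient(x, x_recon, h, a):
    # gradient of sum_i (x_i - f(x_i W) W^T)^2 with respect to W (reads the global W)
    diff = x - x_recon
    # first term: path through the W^T used in the reconstruction
    # second term: path through h = f(xW), using f'(a) = f(a)(1 - f(a))
    return -2. * (np.dot(diff.T, h) + np.dot(x.T, np.multiply(np.dot(diff, W), logistic_derivative(a))))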
Data
We observe $\mathbf{x}_{i}$ where \begin{eqnarray*} \mathbf{x}_{i} = (x_{i,1}, \dots, x_{i,D}) &:& \mbox{set of $D$ explanatory variables (aka features). No labels.} \end{eqnarray*}
Parameters
$\mathbf{W}$: Matrix with dimensionality $D \times K$, where $D$ is the dimensionality of the original data and $K$ the dimensionality of the new features. The matrix encodes the transformation between the original and new feature spaces.
Error Function
\begin{eqnarray*} \mathcal{L} = \sum_{i=1}^{N} ( \mathbf{x}_{i} - f(\mathbf{x}_{i} \mathbf{W}) \mathbf{W}^{T})^{2} \end{eqnarray*}
Now let's train an Autoencoder...
In [ ]:
# set the random number generator for reproducibility
np.random.seed(39)
# define the dimensionality of the hidden rep.
n_components = 200
# Randomly initialize the transformation matrix
W = np.random.uniform(low=-4 * np.sqrt(6. / (n_components + mnist.shape[1])),\
high=4 * np.sqrt(6. / (n_components + mnist.shape[1])), size=(mnist.shape[1], n_components))
# Initialize the step-size
alpha = .01
# Initialize the gradient
grad = np.infty
# Set the tolerance
tol = 1e-8
# Initialize error
old_error = 0
error = [np.infty]
batch_size = 250
### train with stochastic gradients
start_time = time.time()
iter_idx = 1
# loop until gradient updates become small
while (alpha*np.linalg.norm(grad) > tol) and (iter_idx < 300):
for batch_idx in xrange(mnist.shape[0]/batch_size):
x = mnist[batch_idx*batch_size:(batch_idx+1)*batch_size, :]
pre_act = np.dot(x, W)
h = logistic(pre_act)
x_recon = np.dot(h, W.T)
# compute gradient
grad = compute_gradient(x, x_recon, h, pre_act)
# update parameters
W = W - alpha/batch_size * grad
# track the error
if iter_idx % 25 == 0:
old_error = error[-1]
diff = mnist - np.dot(logistic(np.dot(mnist, W)), W.T)
recon_error = np.mean( np.sum(diff**2, 1) )
error.append(recon_error)
print "Epoch %d, Reconstruction Error: %.3f" %(iter_idx, recon_error)
iter_idx += 1
end_time = time.time()
print
print "Training ended after %i iterations, taking a total of %.2f seconds." %(iter_idx, end_time-start_time)
print "Final Reconstruction Error: %.2f" %(error[-1])
reduced_mnist = np.dot(mnist, W)
print "Dataset is now of size: %d x %d"%(reduced_mnist.shape)
In [ ]:
img_idx = 2
reconstructed_img = np.dot(logistic(reduced_mnist[img_idx,:]), W.T)
original_img = mnist[img_idx,:]
# subplot for original image
ax1 = plt.subplot(1,2,1)
ax1.imshow(np.reshape(original_img, (28, 28)), cmap='Greys_r')
ax1.set_title("Original Digit")
# subplot for reconstruction
ax2 = plt.subplot(1,2,2)
ax2.imshow(np.reshape(reconstructed_img, (28, 28)), cmap='Greys_r')
ax2.set_title("Reconstruction")
plt.show()
In [ ]:
# two components to show
comp1 = 0
comp2 = 150
# subplot
ax1 = plt.subplot(1,2,1)
filter1 = W[:, comp1]
ax1.imshow(np.reshape(filter1, (28, 28)), cmap='Greys_r')
# subplot
ax2 = plt.subplot(1,2,2)
filter2 = W[:, comp2]
ax2.imshow(np.reshape(filter2, (28, 28)), cmap='Greys_r')
plt.show()
In [ ]:
from sklearn.neural_network import MLPRegressor
# set the random number generator for reproducibility
np.random.seed(39)
# define the dimensionality of the hidden rep.
n_components = 200
# define model
ae = MLPRegressor(hidden_layer_sizes=(n_components,), activation='logistic')
### train Autoencoder
start_time = time.time()
ae.fit(mnist, mnist)
end_time = time.time()
recon_error = np.mean(np.sum((mnist - ae.predict(mnist))**2, 1))
W = ae.coefs_[0]
b = ae.intercepts_[0]
reduced_mnist = logistic(np.dot(mnist, W) + b)
print
print "Training ended after a total of %.2f seconds." %(end_time-start_time)
print "Final Reconstruction Error: %.2f" %(recon_error)
print "Dataset is now of size: %d x %d"%(reduced_mnist.shape)
In [ ]:
img_idx = 5
reconstructed_img = np.dot(reduced_mnist[img_idx,:], ae.coefs_[1]) + ae.intercepts_[1]
original_img = mnist[img_idx,:]
# subplot for original image
ax1 = plt.subplot(1,2,1)
ax1.imshow(np.reshape(original_img, (28, 28)), cmap='Greys_r')
ax1.set_title("Original Digit")
# subplot for reconstruction
ax2 = plt.subplot(1,2,2)
ax2.imshow(np.reshape(reconstructed_img, (28, 28)), cmap='Greys_r')
ax2.set_title("Reconstruction")
plt.show()
In [ ]:
# two components to show
comp1 = 0
comp2 = 150
# subplot
ax1 = plt.subplot(1,2,1)
filter1 = W[:, comp1]
ax1.imshow(np.reshape(filter1, (28, 28)), cmap='Greys_r')
# subplot
ax2 = plt.subplot(1,2,2)
filter2 = W[:, comp2]
ax2.imshow(np.reshape(filter2, (28, 28)), cmap='Greys_r')
plt.show()
Lastly, we are going to examine an extension to the Autoencoder called a Denoising Autoencoder (DAE). It has the following loss function: $$\mathcal{L}_{\text{DAE}} = \sum_{i=1}^{N} (\mathbf{x}_{i} - f((\hat{\boldsymbol{\zeta}} \odot \mathbf{x}_{i})\mathbf{W})\mathbf{W}^{T})^{2} \ \text{ where } \hat{\boldsymbol{\zeta}} \sim \text{Bernoulli}(p).$$ In words, we draw a Bernoulli (i.e. binary) mask the same size as the input features and feed the model a corrupted version of $\mathbf{x}_{i}$. The Autoencoder must then try to recreate the original data from this lossy input, which forces it to learn features that generalize better.
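For concreteness, the corruption step on its own looks like the sketch below (the names ending in _example are hypothetical; the training cell that follows uses a keep probability of .8):
In [ ]:
# illustrative corruption step: keep each pixel with probability 0.8, zero it otherwise
x_example = np.random.uniform(size=(4, 784))                   # stand-in for a mini-batch
mask = np.random.binomial(n=1, p=0.8, size=x_example.shape)    # binary Bernoulli mask
x_corrupt_example = np.multiply(x_example, mask)               # corrupted input fed to the encoder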
Let's make the simple change that implements a DAE below...
In [ ]:
# set the random number generator for reproducibility
np.random.seed(39)
# define the dimensionality of the hidden rep.
n_components = 200
# Randomly initialize the transformation matrix
W = np.random.uniform(low=-4 * np.sqrt(6. / (n_components + mnist.shape[1])),\
high=4 * np.sqrt(6. / (n_components + mnist.shape[1])), size=(mnist.shape[1], n_components))
# Initialize the step-size
alpha = .01
# Initialize the gradient
grad = np.infty
# Set the tolerance
tol = 1e-8
# Initialize error
old_error = 0
error = [np.infty]
batch_size = 250
### train with stochastic gradients
start_time = time.time()
iter_idx = 1
# loop until gradient updates become small
while (alpha*np.linalg.norm(grad) > tol) and (iter_idx < 300):
for batch_idx in xrange(mnist.shape[0]/batch_size):
x = mnist[batch_idx*batch_size:(batch_idx+1)*batch_size, :]
# add noise to features
x_corrupt = np.multiply(x, np.random.binomial(n=1, p=.8, size=x.shape))
pre_act = np.dot(x_corrupt, W)
h = logistic(pre_act)
x_recon = np.dot(h, W.T)
# compute gradient
diff = x - x_recon
grad = -2.*(np.dot(diff.T, h) + np.dot(np.multiply(np.dot(diff, W), logistic_derivative(pre_act)).T, x_corrupt).T)
# NOTICE: the reconstruction target (and hence diff) uses the uncorrupted features;
# the chain-rule term uses x_corrupt because the hidden units were computed from it
# update parameters
W = W - alpha/batch_size * grad
# track the error
if iter_idx % 25 == 0:
old_error = error[-1]
diff = mnist - np.dot(logistic(np.dot(mnist, W)), W.T)
recon_error = np.mean( np.sum(diff**2, 1) )
error.append(recon_error)
print "Epoch %d, Reconstruction Error: %.3f" %(iter_idx, recon_error)
iter_idx += 1
end_time = time.time()
print
print "Training ended after %i iterations, taking a total of %.2f seconds." %(iter_idx, end_time-start_time)
print "Final Reconstruction Error: %.2f" %(error[-1])
reduced_mnist = np.dot(mnist, W)
print "Dataset is now of size: %d x %d"%(reduced_mnist.shape)
In [ ]:
img_idx = 5
reconstructed_img = np.dot(logistic(reduced_mnist[img_idx,:]), W.T)
original_img = mnist[img_idx,:]
# subplot for original image
ax1 = plt.subplot(1,2,1)
ax1.imshow(np.reshape(original_img, (28, 28)), cmap='Greys_r')
ax1.set_title("Original Painting")
# subplot for reconstruction
ax2 = plt.subplot(1,2,2)
ax2.imshow(np.reshape(reconstructed_img, (28, 28)), cmap='Greys_r')
ax2.set_title("Reconstruction")
plt.show()
In [ ]:
# two components to show
comp1 = 0
comp2 = 150
# subplot
ax1 = plt.subplot(1,2,1)
filter1 = W[:, comp1]
ax1.imshow(np.reshape(filter1, (28, 28)), cmap='Greys_r')
# subplot
ax2 = plt.subplot(1,2,2)
filter2 = W[:, comp2]
ax2.imshow(np.reshape(filter2, (28, 28)), cmap='Greys_r')
plt.show()
When training larger autoencoders, you'll see filters that look like these...
Regular Autoencoder:
Denoising Autoencoder:
In [ ]:
from sklearn.datasets import fetch_olivetti_faces
faces_dataset = fetch_olivetti_faces(shuffle=True)
faces = faces_dataset.data # 400 flattened 64x64 images
person_ids = faces_dataset.target # denotes the identity of person (40 total)
print "Dataset size: %d x %d" %(faces.shape)
print "And the images look like this..."
plt.imshow(np.reshape(faces[200,:], (64, 64)), cmap='Greys_r')
plt.show()
In [ ]:
### Your code goes here ###
# train Autoencoder model on 'faces'
###########################
print "Training took a total of %.2f seconds." %(end_time-start_time)
print "Final reconstruction error: %.2f%%" %(recon_error)
print "Dataset is now of size: %d x %d"%(faces_reduced.shape)
In [ ]:
### Your code goes here ###
# Use learned transformation matrix to project back to the original 4096-dimensional space
# Remember you need to use np.reshape()
###########################
In [ ]:
### Your code goes here ###
###########################
In [ ]:
### Your code goes here ###
# Run AE for 2 components
# Generate plot
# Bonus: color the scatter plot according to the person_ids to see if any structure can be seen
###########################
In [ ]:
### Your code goes here ###
# Run PCA but add noise to the input first
###########################