version 0.1, May 2016
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
Based on the slides and presentation by Alec Radford (GitHub).
For this class you must install Theano:
pip install theano
In [1]:
import numpy as np
In [2]:
from load import mnist
X_train, X_test, y_train2, y_test2 = mnist(onehot=True)
In [3]:
y_train = np.argmax(y_train2, axis=1)
y_test = np.argmax(y_test2, axis=1)
In [4]:
X_train[1].reshape((28, 28)).round(2)[:, 4:9].tolist()
Out[4]:
In [7]:
from pylab import imshow, show, cm
import matplotlib.pylab as plt
%matplotlib inline
def view_image(image, label="", predicted='', size=4):
    """View a single image."""
    plt.figure(figsize=(size, size))
    plt.imshow(image.reshape((28, 28)), cmap=cm.gray)
    plt.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')
    plt.tick_params(axis='y', which='both', left='off', top='off', labelleft='off')
    show()
    if predicted == '':
        print("Label: %s" % label)
    else:
        print('Label: ', str(label), 'Predicted: ', str(predicted))
In [8]:
view_image(X_train[1], y_train[1])
In [9]:
view_image(X_train[40000], y_train[40000])
In [11]:
def similarity(image, images):
    """Inverse Euclidean distance between one image and each image in a set."""
    similarities = []
    image = image.reshape((28, 28))
    images = images.reshape((-1, 28, 28))
    for i in range(images.shape[0]):
        distance = np.sqrt(np.sum((image - images[i]) ** 2))
        sim = 1 / distance
        similarities.append(sim)
    return similarities
In [12]:
np.random.seed(52)
small_train = np.random.choice(X_train.shape[0], 100)
In [13]:
view_image(X_test[0])
In [14]:
similarities = similarity(X_test[0], X_train[small_train])
In [15]:
view_image(X_train[small_train[np.argmax(similarities)]])
Let's try another example.
In [16]:
view_image(X_test[200])
In [17]:
similarities = similarity(X_test[200], X_train[small_train])
view_image(X_train[small_train[np.argmax(similarities)]])
Logistic regression is a probabilistic, linear classifier. It is parametrized by a weight matrix $W$ and a bias vector $b$. Classification is done by projecting data points onto a set of hyperplanes, the distance to which is used to determine a class membership probability.
Mathematically, this can be written as:
$$ P(Y=i \vert x, W, b) = \mathrm{softmax}_i(W x + b) = \frac{e^{W_i x + b_i}}{\sum_j e^{W_j x + b_j}} $$
The model's prediction is then the argmax of the vector whose $i$-th element is $P(Y=i \vert x)$:
$$ y_{pred} = argmax_i P(Y=i|x,W,b) $$
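As a quick, concrete illustration of the formulas above, here is a minimal NumPy sketch (the values of W, b, and x below are toy placeholders, not part of the original notebook):
rng = np.random.RandomState(0)
W = rng.randn(10, 784) * 0.01      # one weight row per class
b = np.zeros(10)
x = rng.rand(784)                  # a fake flattened 28x28 image

scores = W.dot(x) + b
p = np.exp(scores - scores.max())  # softmax numerator (max subtracted for numerical stability)
p = p / p.sum()                    # P(Y=i | x, W, b)
y_pred = np.argmax(p)              # argmax_i P(Y=i | x, W, b)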
In [18]:
import theano
from theano import tensor as T
import numpy as np
import datetime as dt
In [19]:
theano.config.floatX = 'float32'
Theano is a Python library that lets you define, optimize, and evaluate mathematical expressions, especially ones with multi-dimensional arrays (numpy.ndarray). Using Theano it is possible to attain speeds rivaling hand-crafted C implementations for problems involving large amounts of data. It can also surpass C on a CPU by many orders of magnitude by taking advantage of recent GPUs.
Theano combines aspects of a computer algebra system (CAS) with aspects of an optimizing compiler. It can also generate customized C code for many mathematical operations. This combination of CAS with optimizing compilation is particularly useful for tasks in which complicated mathematical expressions are evaluated repeatedly and evaluation speed is critical. For situations where many different expressions are each evaluated once, Theano can minimize the amount of compilation/analysis overhead, but still provide symbolic features such as automatic differentiation.
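A minimal end-to-end example of the Theano workflow (a sketch, not from the original notebook): declare symbolic variables, build an expression, compile it into a callable function, then evaluate it on concrete data.
import theano
from theano import tensor as T

a = T.fvector('a')
b = T.fvector('b')
expr = (a * b).sum()                     # symbolic dot product
f = theano.function(inputs=[a, b], outputs=expr,
                    allow_input_downcast=True)
print(f([1, 2, 3], [4, 5, 6]))           # prints 32.0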
In [22]:
def floatX(X):
    # return np.asarray(X, dtype='float32')
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))

def model(X, w):
    return T.nnet.softmax(T.dot(X, w))
In [23]:
X = T.fmatrix()
Y = T.fmatrix()
w = init_weights((784, 10))
In [24]:
w.get_value()
Out[24]:
initialize model
In [25]:
py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)
In [26]:
cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.05]]
In [27]:
train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)
One epoch (a single pass over the training set)
In [28]:
for start, end in zip(range(0, X_train.shape[0], 128), range(128, X_train.shape[0], 128)):
    cost = train(X_train[start:end], y_train2[start:end])
In [29]:
errors = [(np.mean(y_train != predict(X_train)),
           np.mean(y_test != predict(X_test)))]
errors
Out[29]:
Now for 100 epochs
In [30]:
t0 = dt.datetime.now()
for i in range(100):
    for start, end in zip(range(0, X_train.shape[0], 128),
                          range(128, X_train.shape[0], 128)):
        cost = train(X_train[start:end], y_train2[start:end])
    errors.append((np.mean(y_train != predict(X_train)),
                   np.mean(y_test != predict(X_test))))
    print(i, errors[-1])
print('Total time: ', (dt.datetime.now()-t0).seconds / 60.)
In [31]:
res = np.array(errors)
plt.plot(np.arange(res.shape[0]), res[:, 0], label='train error')
plt.plot(np.arange(res.shape[0]), res[:, 1], label='test error')
plt.legend()
Out[31]:
In [32]:
y_pred = predict(X_test)
In [33]:
np.random.seed(2)
small_test = np.random.choice(X_test.shape[0], 10)
for i in small_test:
view_image(X_test[i], label=y_test[i], predicted=y_pred[i], size=1)
In [34]:
def sgd(cost, params, lr=0.05):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        updates.append([p, p - g * lr])
    return updates

def model(X, w_h, w_o):
    h = T.nnet.sigmoid(T.dot(X, w_h))
    pyx = T.nnet.softmax(T.dot(h, w_o))
    return pyx
w_h = init_weights((784, 625))
w_o = init_weights((625, 10))
py_x = model(X, w_h, w_o)
y_x = T.argmax(py_x, axis=1)
cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
params = [w_h, w_o]
updates = sgd(cost, params)
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
In [30]:
t0 = dt.datetime.now()
errors = []
for i in range(100):
    for start, end in zip(range(0, X_train.shape[0], 128),
                          range(128, X_train.shape[0], 128)):
        cost = train(X_train[start:end], y_train2[start:end])
    errors.append((np.mean(y_train != predict(X_train)),
                   np.mean(y_test != predict(X_test))))
    print(i, errors[-1])
print('Total time: ', (dt.datetime.now()-t0).seconds / 60.)
In [31]:
res = np.array(errors)
plt.plot(np.arange(res.shape[0]), res[:, 0], label='train error')
plt.plot(np.arange(res.shape[0]), res[:, 1], label='test error')
plt.legend()
Out[31]:
In [35]:
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
srng = RandomStreams()
def rectify(X):
    return T.maximum(X, 0.)
In [36]:
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
RMSprop is an unpublished, adaptive learning rate method proposed by Geoff Hinton in Lecture 6e of his Coursera class.
RMSprop and Adadelta were both developed independently around the same time, stemming from the need to resolve Adagrad's radically diminishing learning rates. RMSprop is in fact identical to the first update vector of Adadelta:
$$ E[g^2]_t = 0.9\, E[g^2]_{t-1} + 0.1\, g^2_t $$
$$ \theta_{t+1} = \theta_{t} - \frac{\eta}{\sqrt{E[g^2]_t + \epsilon}}\, g_{t} $$
RMSprop thus divides the learning rate by an exponentially decaying average of squared gradients. Hinton suggests a decay rate $\gamma$ of 0.9 (the 0.9/0.1 pair in the first equation), while a good default value for the learning rate $\eta$ is 0.001.
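Here is a minimal NumPy sketch of a few RMSprop steps on a toy quadratic loss, mirroring the two equations above (all names and values are illustrative, not part of the notebook):
rho, lr, eps = 0.9, 0.001, 1e-6
theta = np.array([1.0, -2.0])               # parameters theta
acc = np.zeros_like(theta)                  # running average E[g^2]
for _ in range(5):
    g = theta                               # gradient of f(theta) = 0.5 * ||theta||^2
    acc = rho * acc + (1 - rho) * g ** 2    # E[g^2]_t
    theta = theta - lr * g / np.sqrt(acc + eps)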
In [37]:
def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob   # inverted dropout: rescale so the expected activation is unchanged
    return X

def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(T.dot(X, w_h))
    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2))
    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))   # subtract the row max for numerical stability
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
In [35]:
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))
noise_h, noise_h2, noise_py_x = model(X, w_h, w_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, w_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)
cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, w_h2, w_o]
updates = RMSprop(cost, params, lr=0.001)
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
In [36]:
t0 = dt.datetime.now()
errors = []
for i in range(100):
    for start, end in zip(range(0, X_train.shape[0], 128),
                          range(128, X_train.shape[0], 128)):
        cost = train(X_train[start:end], y_train2[start:end])
    errors.append((np.mean(y_train != predict(X_train)),
                   np.mean(y_test != predict(X_test))))
    print(i, errors[-1])
print('Total time: ', (dt.datetime.now()-t0).seconds / 60.)
In [37]:
res = np.array(errors)
plt.plot(np.arange(res.shape[0]), res[:, 0], label='train error')
plt.plot(np.arange(res.shape[0]), res[:, 1], label='test error')
plt.legend()
Out[37]:
In machine learning, a convolutional neural network (CNN, or ConvNet) is a type of feed-forward artificial neural network in which the connectivity pattern between its neurons is inspired by the organization of the animal visual cortex, whose individual neurons are arranged in such a way that they respond to overlapping regions tiling the visual field. Convolutional networks were inspired by biological processes and are variations of multilayer perceptrons designed to use minimal amounts of preprocessing. (Wikipedia)
Convolutional Neural Networks (CNN) are biologically-inspired variants of MLPs. From Hubel and Wiesel's early work on the cat's visual cortex, we know the visual cortex contains a complex arrangement of cells. These cells are sensitive to small sub-regions of the visual field, called a receptive field. The sub-regions are tiled to cover the entire visual field. These cells act as local filters over the input space and are well-suited to exploit the strong spatially local correlation present in natural images.
Additionally, two basic cell types have been identified: Simple cells respond maximally to specific edge-like patterns within their receptive field. Complex cells have larger receptive fields and are locally invariant to the exact position of the pattern.
The animal visual cortex being the most powerful visual processing system in existence, it seems natural to emulate its behavior. Hence, many neurally-inspired models can be found in the literature.
CNNs exploit spatially-local correlation by enforcing a local connectivity pattern between neurons of adjacent layers. In other words, the inputs of hidden units in layer m are from a subset of units in layer m-1, units that have spatially contiguous receptive fields. We can illustrate this graphically as follows:
Imagine that layer m-1 is the input retina. In the above figure, units in layer m have receptive fields of width 3 in the input retina and are thus only connected to 3 adjacent neurons in the retina layer. Units in layer m+1 have a similar connectivity with the layer below. We say that their receptive field with respect to the layer below is also 3, but their receptive field with respect to the input is larger (5). Each unit is unresponsive to variations outside of its receptive field with respect to the retina. The architecture thus ensures that the learnt "filters" produce the strongest response to a spatially local input pattern.
However, as shown above, stacking many such layers leads to (non-linear) "filters" that become increasingly "global" (i.e. responsive to a larger region of pixel space). For example, the unit in hidden layer m+1 can encode a non-linear feature of width 5 (in terms of pixel space).
In addition, in CNNs, each filter $h_i$ is replicated across the entire visual field. These replicated units share the same parameterization (weight vector and bias) and form a feature map.
In the above figure, we show 3 hidden units belonging to the same feature map. Weights of the same color are shared, that is, constrained to be identical. Gradient descent can still be used to learn such shared parameters, with only a small change to the original algorithm: the gradient of a shared weight is simply the sum of the gradients of the parameters being shared.
Replicating units in this way allows for features to be detected regardless of their position in the visual field. Additionally, weight sharing increases learning efficiency by greatly reducing the number of free parameters being learnt. The constraints on the model enable CNNs to achieve better generalization on vision problems.
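A rough, back-of-the-envelope comparison for this notebook's 28x28 MNIST images (the layer sizes here are chosen purely for illustration):
dense_params = 28 * 28 * 625     # fully-connected: every pixel to 625 hidden units
conv_params = 32 * 1 * 3 * 3     # 32 shared 3x3 filters over a single input channel
print(dense_params, conv_params) # 490000 vs. 288 learned weights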
A feature map is obtained by repeated application of a function across sub-regions of the entire image, in other words, by convolution of the input image with a linear filter, adding a bias term and then applying a non-linear function. If we denote the k-th feature map at a given layer as $h^k$, whose filters are determined by the weights $W^k$ and bias $b_k$, then the feature map $h^k$ is obtained as follows (for $tanh$ non-linearities):
$$ h^k_{ij} = \tanh\left( (W^k * x)_{ij} + b_k \right) $$
Note
Recall the following definition of convolution for a 1D signal:
$$ o[n] = f[n] * g[n] = \sum_{u=-\infty}^{\infty} f[u]\, g[n-u] = \sum_{u=-\infty}^{\infty} f[n-u]\, g[u]. $$
This can be extended to 2D as follows:
$$ o[m,n] = f[m,n] * g[m,n] = \sum_{u=-\infty}^{\infty} \sum_{v=-\infty}^{\infty} f[u,v]\, g[m-u, n-v]. $$
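As a quick sanity check of the 1D definition (toy signals, not part of the original notebook), the sum above matches what np.convolve computes:
f = np.array([1., 2., 3.])
g = np.array([0., 1., 0.5])
manual = np.array([sum(f[u] * g[n - u]
                       for u in range(len(f)) if 0 <= n - u < len(g))
                   for n in range(len(f) + len(g) - 1)])
print(manual)              # [ 0.   1.   2.5  4.   1.5]
print(np.convolve(f, g))   # identical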
To form a richer representation of the data, each hidden layer is composed of multiple feature maps, $\{h^{(k)}, k=0..K\}$. The weights $W$ of a hidden layer can be represented in a 4D tensor containing elements for every combination of destination feature map, source feature map, source vertical position, and source horizontal position. The biases $b$ can be represented as a vector containing one element for every destination feature map. We illustrate this graphically as follows:
Figure 1: example of a convolutional layer
The figure shows two layers of a CNN. Layer m-1 contains four feature maps. Hidden layer m contains two feature maps ($h^0$ and $h^1$). Pixels (neuron outputs) in $h^0$ and $h^1$ (outlined as blue and red squares) are computed from pixels of layer (m-1) which fall within their 2x2 receptive field in the layer below (shown as colored rectangles). Notice how the receptive field spans all four input feature maps. The weights $W^0$ and $W^1$ of $h^0$ and $h^1$ are thus 3D weight tensors. The leading dimension indexes the input feature maps, while the other two refer to the pixel coordinates.
Putting it all together, $W^{kl}_{ij}$ denotes the weight connecting each pixel of the k-th feature map at layer m, with the pixel at coordinates (i,j) of the l-th feature map of layer (m-1).
ConvOp is the main workhorse for implementing a convolutional layer in Theano. It is used by theano.tensor.nnet.conv2d (the variant imported later in this notebook), which takes two symbolic inputs (a shape check follows the list below):
a 4D tensor corresponding to a mini-batch of input images. The shape of the tensor is as follows: [mini-batch size, number of input feature maps, image height, image width].
a 4D tensor corresponding to the weight matrix $W$. The shape of the tensor is: [number of feature maps at layer m, number of feature maps at layer m-1, filter height, filter width]
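A hedged shape check with toy data (the sizes are chosen to match this notebook's first convolutional layer): a mini-batch of two single-channel 28x28 images convolved with 32 filters of size 3x3.
from theano.tensor.nnet import conv2d
X_sym = T.ftensor4()
W_sym = T.ftensor4()
shape_fn = theano.function([X_sym, W_sym], conv2d(X_sym, W_sym).shape,
                           allow_input_downcast=True)
images = np.random.rand(2, 1, 28, 28)    # [batch, input maps, height, width]
filters = np.random.rand(32, 1, 3, 3)    # [output maps, input maps, filter h, filter w]
print(shape_fn(images, filters))         # -> [ 2 32 26 26]  (default 'valid' mode)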
Another important concept of CNNs is max-pooling, which is a form of non-linear down-sampling. Max-pooling partitions the input image into a set of non-overlapping rectangles and, for each such sub-region, outputs the maximum value.
Max-pooling is useful in vision for two reasons:
By eliminating non-maximal values, it reduces computation for upper layers.
It provides a form of translation invariance. Imagine cascading a max-pooling layer with a convolutional layer. There are 8 directions in which one can translate the input image by a single pixel. If max-pooling is done over a 2x2 region, 3 out of these 8 possible configurations will produce exactly the same output at the convolutional layer. For max-pooling over a 3x3 window, this jumps to 5/8.
Since it provides additional robustness to position, max-pooling is a "smart" way of reducing the dimensionality of intermediate representations.
Max-pooling is done in Theano by way of theano.tensor.signal.downsample.max_pool_2d. This function takes as input an N-dimensional tensor (where N >= 2) and a downscaling factor, and performs max-pooling over the 2 trailing dimensions of the tensor.
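A small illustration (toy input, not part of the original notebook) of 2x2 max-pooling on a single 4x4 image: each non-overlapping 2x2 block is replaced by its maximum.
from theano.tensor.signal.downsample import max_pool_2d
x_sym = T.ftensor4()
pool_fn = theano.function([x_sym], max_pool_2d(x_sym, (2, 2), ignore_border=True),
                          allow_input_downcast=True)
img = np.arange(16).reshape(1, 1, 4, 4)   # one image, one channel
print(pool_fn(img)[0, 0])                 # [[  5.   7.]
                                          #  [ 13.  15.]]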
Sparse, convolutional layers and max-pooling are at the heart of the LeNet family of models. While the exact details of the model will vary greatly, the figure below shows a graphical depiction of a LeNet model.
The lower layers are composed of alternating convolution and max-pooling layers. The upper layers, however, are fully connected and correspond to a traditional MLP (hidden layer + logistic regression). The input to the first fully-connected layer is the set of all feature maps at the layer below.
From an implementation point of view, this means lower-layers operate on 4D tensors. These are then flattened to a 2D matrix of rasterized feature maps, to be compatible with our previous MLP implementation.
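To make that 4D-to-2D transition concrete, here is a rough shape walk-through for the network defined below (a sketch assuming 28x28 inputs, 3x3 filters, 2x2 pooling with the partial border kept, and border_mode='full' on the first layer only):
def conv_out(size, filt=3, mode='valid'):
    return size + filt - 1 if mode == 'full' else size - filt + 1

def pool_out(size, ds=2):
    return (size + ds - 1) // ds          # ignore_border=False keeps the partial edge

s = pool_out(conv_out(28, mode='full'))   # layer 1: 28 -> 30 -> 15
s = pool_out(conv_out(s))                 # layer 2: 15 -> 13 -> 7
s = pool_out(conv_out(s))                 # layer 3:  7 ->  5 -> 3
print(128 * s * s)                        # 1152, i.e. the 128 * 3 * 3 used for w4 below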
In [38]:
# from theano.tensor.nnet.conv import conv2d
from theano.tensor.nnet import conv2d
from theano.tensor.signal.downsample import max_pool_2d
Define the convolutional network model (the dropout, rectify, and softmax functions defined above are reused unchanged)
In [39]:
def model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden):
    l1a = rectify(conv2d(X, w, border_mode='full'))
    l1 = max_pool_2d(l1a, (2, 2))
    l1 = dropout(l1, p_drop_conv)

    l2a = rectify(conv2d(l1, w2))
    l2 = max_pool_2d(l2a, (2, 2))
    l2 = dropout(l2, p_drop_conv)

    l3a = rectify(conv2d(l2, w3))
    l3b = max_pool_2d(l3a, (2, 2))
    # convert from 4-tensor to a normal matrix
    l3 = T.flatten(l3b, outdim=2)
    l3 = dropout(l3, p_drop_conv)

    l4 = rectify(T.dot(l3, w4))
    l4 = dropout(l4, p_drop_hidden)

    pyx = softmax(T.dot(l4, w_o))
    return l1, l2, l3, l4, pyx
Reshape the images into the 4-tensor format that conv2d expects: (batch, channel, row, column)
In [40]:
X_train2 = X_train.reshape(-1, 1, 28, 28)
X_test2 = X_test.reshape(-1, 1, 28, 28)
In [41]:
# now 4tensor for conv instead of matrix
X = T.ftensor4()
Y = T.fmatrix()
In [42]:
w = init_weights((32, 1, 3, 3))
w2 = init_weights((64, 32, 3, 3))
w3 = init_weights((128, 64, 3, 3))
w4 = init_weights((128 * 3 * 3, 625))
w_o = init_weights((625, 10))
In [43]:
noise_l1, noise_l2, noise_l3, noise_l4, noise_py_x = model(X, w, w2, w3, w4, w_o, 0.2, 0.5)
l1, l2, l3, l4, py_x = model(X, w, w2, w3, w4, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)
cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w, w2, w3, w4, w_o]
updates = RMSprop(cost, params, lr=0.001)
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
In [45]:
t0 = dt.datetime.now()
errors = []
for i in range(100):
    t1 = dt.datetime.now()
    for start, end in zip(range(0, X_train.shape[0], 128),
                          range(128, X_train.shape[0], 128)):
        cost = train(X_train2[start:end], y_train2[start:end])
    errors.append((np.mean(y_train != predict(X_train2)),
                   np.mean(y_test != predict(X_test2))))
    print(i, errors[-1])
    print('Current iter time: ', (dt.datetime.now()-t1).seconds / 60.)
print('Total time: ', (dt.datetime.now()-t0).seconds / 60.)
In [46]:
print('Total time: ', (dt.datetime.now()-t0).seconds / 60.)
In [47]:
res = np.array(errors)
plt.plot(np.arange(res.shape[0]), res[:, 0], label='train error')
plt.plot(np.arange(res.shape[0]), res[:, 1], label='test error')
plt.legend()
Out[47]: