In [ ]:
import numpy as np
import pandas
import matplotlib.pyplot as plt
The hidden and output neurons use the sigmoidal activation
$$g(h;\,\beta)=\frac{1}{1+e^{-\beta h}}$$
where $\beta > 0$
In [ ]:
fig, ax = plt.subplots()
g_func = lambda h, beta : 1/(1 + np.exp(-beta*h))
x = np.linspace(-3, 3, 100)
ax.plot(x, g_func(x, 2))
ax.hlines([0, 1], -3, 3, color="r")
ax.set_ylim(-.1, 1.1);
ax.set_title(r"$g(h;\, \beta)$");
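The weight-update deltas below use the derivative of this activation, $g^{\prime}(h)=\beta\,g(h)\left(1-g(h)\right)$. A quick numerical sanity check (a sketch reusing `g_func` from above):
In [ ]:
# compare the analytic derivative beta*g*(1 - g) with a centered finite difference
beta = 2
h = np.linspace(-3, 3, 100)
analytic = beta * g_func(h, beta) * (1 - g_func(h, beta))
eps = 1e-6
numeric = (g_func(h + eps, beta) - g_func(h - eps, beta)) / (2 * eps)
np.allclose(analytic, numeric)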
Training (Repeat)
For each input vector
Forwards phase
$$h_j=\sum_ix_iv_{ij}$$ $$a_j=g(h_j)=\frac{1}{1+e^{-\beta h_j}}$$
$$h_k=\sum_j{a_jw_{jk}}$$ $$y_k=g(h_k)=\frac{1}{1+e^{-\beta h_k}}$$
Backwards phase
compute the error at the output using
$$\delta_{ok}=(t_k-y_k)y_k(1-y_k)$$
compute the error in the hidden layer(s) using
$$\delta_{hj}=a_j(1-a_j)\sum_k w_{jk}\delta_{ok}$$
update the output layer weights using
$$w_{jk}\leftarrow w_{jk} + \eta \delta_{ok}a_j$$
update the hidden layer weights using
$$v_{ij}\leftarrow v_{ij}+\eta \delta_{hj}x_i$$
Randomize the order of the input vectors so that you don't train in exactly the same order each iteration
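A minimal NumPy sketch of one pass of these equations (illustrative names, a single hidden layer, $\beta=1$, and bias nodes omitted for brevity):
In [ ]:
rng = np.random.RandomState(0)
X = rng.rand(4, 2)           # inputs, one row per input vector
T = rng.rand(4, 1)           # targets
V = rng.rand(2, 3) - 0.5     # first-layer weights v_ij
W = rng.rand(3, 1) - 0.5     # second-layer weights w_jk
eta = 0.25
g = lambda h: 1. / (1. + np.exp(-h))

# forwards phase
a = g(np.dot(X, V))          # hidden activations a_j
y = g(np.dot(a, W))          # outputs y_k

# backwards phase
delta_o = (T - y) * y * (1 - y)                # output errors
delta_h = a * (1 - a) * np.dot(delta_o, W.T)   # hidden errors

# weight updates
W += eta * np.dot(a.T, delta_o)
V += eta * np.dot(X.T, delta_h)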
Batch vs. Online algorithms
Initial weights
Using different activation functions
[[1, 0, 0],
[0, 1, 0],
[0, 0, 1]]
Local Minima
With momentum, a fraction of the previous weight change is added to the current one, e.g. for the output-layer weights
$$w_{jk}\leftarrow w_{jk}+\eta \delta_{ok}a_j+\alpha\,\Delta w_{jk}^{t-1}$$
where $0 < \alpha < 1$ is the momentum constant
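A tiny sketch of this momentum update on a one-dimensional quadratic (illustrative values; the same pattern appears in `MLP.fit` below):
In [ ]:
# minimize f(w) = w**2 with gradient descent plus momentum
w, update = 5.0, 0.0
eta, alpha = 0.1, 0.9
for _ in range(200):
    grad = 2 * w
    update = -eta * grad + alpha * update   # blend current step with previous update
    w += update
w   # close to the minimum at 0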
In [ ]:
np.set_printoptions(suppress=True)
In [ ]:
from perceptron import Perceptron, add_bias_node
Functions for internal use to avoid spaghetti code
In [ ]:
def _linear_delta(targets, outputs, nobs):
return (targets - outputs)/nobs
def _logistic_delta(targets, outputs, *args):
    # (t - y) * y * (1 - y): error times the derivative of the logistic output
    return (targets - outputs)*outputs*(1 - outputs)
def _softmax_delta(targets, outputs, nobs):
return (targets - outputs)/nobs
_calc_deltao = {
"linear" : _linear_delta,
"logistic" : _logistic_delta,
"softmax" : _softmax_delta
}
def _linear_activation(outputs, *args):
return outputs
def _logistic_activation(outputs, beta, *args):
return 1/(1+np.exp(-beta*outputs))
def _softmax_activation(outputs, *args):
# this is multinomial logit
eX = np.exp(outputs)
return eX/eX.sum(axis=1)[:,None]
_activation_funcs = {
"linear" : _linear_activation,
"logistic" : _logistic_activation,
"softmax" : _softmax_activation,
}
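As a quick check (reusing the helpers just defined), the softmax activation maps each row of scores to values that sum to one:
In [ ]:
scores = np.array([[1., 2., 3.],
                   [0.5, 0.5, 0.5]])
probs = _softmax_activation(scores)
probs, probs.sum(axis=1)   # each row sums to 1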
In [ ]:
from perceptron import Perceptron, add_bias_node
class MLP(Perceptron):
"""
A Multi-Layer Perceptron
"""
def __init__(self, nhidden, eta, beta=1, momentum=0.9, outtype='logistic'):
# Set up network size
self.nhidden = nhidden
self.eta = eta
self.beta = beta
self.momentum = momentum
self.outtype = outtype
def _init_weights(self):
# Initialise network
weights1 = np.random.rand(self.m+1, self.nhidden)-0.5
weights1 *= 2/np.sqrt(self.m)
weights2 = np.random.rand(self.nhidden+1,self.n)-0.5
weights2 *= 2/np.sqrt(self.nhidden)
self.weights1 = weights1
self.weights2 = weights2
def earlystopping(self, inputs, targets, valid_input, valid_target,
max_iter=100, epsilon=1e-3, disp=True):
self._initialize(inputs, targets)
valid_input = add_bias_node(valid_input)
        # last_errors holds [current iteration, last iteration, 2 iterations ago]
last_errors = [0, np.inf, np.inf]
count = 0
while np.any(np.diff(last_errors) > epsilon):
count += 1
if disp:
print count
# train the network
self.fit(inputs, targets, max_iter, init=False, disp=disp)
last_errors[2] = last_errors[1]
last_errors[1] = last_errors[0]
# check on the validation set
valid_output = self.predict(valid_input, add_bias=False)
errors = valid_target - valid_output
last_errors[0] = 0.5*np.sum(errors**2)
if disp:
print "Stopped in %d iterations" % count, last_errors
return last_errors[0]
def fit(self, inputs, targets, max_iter, disp=True, init=True):
"""
Train the network
Parameters
----------
inputs : array-like
The inputs data
targets : array-like
The targets to train on
max_iter : int
The number of iterations to perform
init : bool
Whether to initialize the weights or not.
"""
if init:
self._initialize(inputs, targets)
inputs = self.inputs
targets = self.targets
weights1 = self.weights1
weights2 = self.weights2
eta = self.eta
momentum = self.momentum
nobs = self.nobs
outtype = self.outtype
# Add the inputs that match the bias node
inputs = add_bias_node(inputs)
change = range(self.nobs)
updatew1 = np.zeros_like(weights1)
updatew2 = np.zeros_like(weights2)
for n in range(1, max_iter+1):
# predict attaches hidden
outputs = self.predict(inputs, add_bias=False)
error = targets - outputs
obj = .5 * np.sum(error**2)
# Different types of output neurons
deltao = _calc_deltao[outtype](targets, outputs, nobs)
hidden = self.hidden
deltah = hidden * (1. - hidden) * np.dot(deltao, weights2.T)
updatew1 = (eta*(np.dot(inputs.T, deltah[:,:-1])) +
momentum*updatew1)
updatew2 = (eta*(np.dot(self.hidden.T, deltao)) +
momentum*updatew2)
weights1 += updatew1
weights2 += updatew2
# Randomise order of inputs
np.random.shuffle(change)
inputs = inputs[change,:]
targets = targets[change,:]
if disp:
print "Iteration: ", n, " Objective: ", obj
# attach results
self.weights1 = weights1
self.weights2 = weights2
self.outputs = outputs
def predict(self, inputs=None, add_bias=True):
"""
Run the network forward.
"""
if inputs is None:
inputs = self.inputs
if add_bias:
inputs = add_bias_node(inputs)
hidden = np.dot(inputs, self.weights1)
hidden = _activation_funcs["logistic"](hidden, self.beta)
hidden = add_bias_node(hidden)
self.hidden = hidden
outputs = np.dot(self.hidden, self.weights2)
# Different types of output neurons
return _activation_funcs[self.outtype](outputs, self.beta)
def confusion_matrix(self, inputs, targets, summary=True):
"""
Confusion matrix.
"""
targets = np.asarray(targets)
# Add the inputs that match the bias node
inputs = add_bias_node(inputs)
# predict attaches hidden
outputs = self.predict(inputs, add_bias=False)
        n_classes = 1 if targets.ndim == 1 else targets.shape[1]
if n_classes==1:
n_classes = 2
# 50% cut-off with continuous activation function
outputs = np.where(outputs > 0.5, 1, 0)
else:
# 1-of-N encoding
outputs = np.argmax(outputs, 1)
targets = np.argmax(targets, 1)
outputs = np.squeeze(outputs)
targets = np.squeeze(targets)
cm = np.histogram2d(targets, outputs, bins=n_classes)[0]
if not summary:
return cm
else:
return np.trace(cm)/np.sum(cm)*100
In [ ]:
X = [[0., 0.],
[0., 1.],
[1., 0.],
[1., 1.]]
In [ ]:
target = [0, 0, 0, 1]
In [ ]:
and_clf = MLP(2, .25)
and_clf.fit(X, target, 5001)
In [ ]:
and_clf.predict(X)
In [ ]:
and_clf.confusion_matrix(X, target, summary=False)
In [ ]:
target = [0., 1., 1., 0.]
In [ ]:
xor_clf = MLP(2, .25)
xor_clf.fit(X, target, 5001)
In [ ]:
xor_clf.predict()
In [ ]:
np.random.seed(12345)
x = np.linspace(0, 1, 40)[:,None] # make 2D
t = (np.sin(2*np.pi*x) +
np.cos(4*np.pi*x) +
np.random.normal(0, .2, size=(40,1)))
x = (x - .5)*2
In [ ]:
fig, ax = plt.subplots()
ax.plot(x, t, 'o');
Split into 50:25:25
In [ ]:
train = x[::2]
test = x[1::4]
validate = x[3::4]
train_target = t[::2]
test_target = t[1::4]
validate_target = t[3::4]
In [ ]:
net = MLP(3, .25, outtype="linear")
net.fit(train, train_target, 100)
In [ ]:
net.earlystopping(train, train_target, validate, validate_target)
In [ ]:
nhiddens = [1, 2, 3, 5, 10, 25, 50, 100]
all_errors = []
nruns = 10
for nhidden in nhiddens:
errors = []
mlp = MLP(nhidden, .25, outtype="linear")
for i in range(nruns):
error = mlp.earlystopping(train, train_target, validate, validate_target, disp=False)
errors.append(error)
all_errors.append(errors)
In [ ]:
all_errors = np.array(all_errors)
print " n mean std min max"
print np.column_stack((nhiddens, all_errors.mean(1), all_errors.std(1),
all_errors.min(1), all_errors.max(1)))
In [ ]:
from sklearn import cross_validation
a, b = np.arange(16).reshape((8, 2)), np.arange(8)
In [ ]:
print a
print b
In [ ]:
(a_train,
a_test,
b_train,
b_test) = cross_validation.train_test_split(a, b,
train_size=.75,
random_state=123)
In [ ]:
a_train, b_train
In [ ]:
a_test, b_test
Aside: Python Generators
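The KFold objects used below are consumed like Python generators: iterating over them yields one train/test split at a time. A minimal generator for illustration:
In [ ]:
def successive_pairs(seq):
    # lazily yields one pair at a time instead of building a full list
    for i in range(len(seq) - 1):
        yield seq[i], seq[i + 1]

list(successive_pairs([1, 2, 3, 4]))   # [(1, 2), (2, 3), (3, 4)]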
K-Folds Cross-Validation
In [ ]:
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
kf = cross_validation.KFold(10, n_folds=5)
for train_index, test_index in kf:
print "TRAIN:", train_index, "TEST:", test_index
In [ ]:
kf = cross_validation.KFold(10, n_folds=5, shuffle=True)
for train_index, test_index in kf:
print "TRAIN:", train_index, "TEST:", test_index
In [ ]:
from sklearn.datasets import load_iris
data = load_iris()
In [ ]:
print data.DESCR
You can read much more about the dataset here
Normalize the data
In [ ]:
# demean
X = data.data - data.data.mean(0)
# normalize by maximum
X /= X.max(0)
In [ ]:
data.target
We need to put this into 1-of-N encoding
In [ ]:
target = np.zeros((len(X), 3))
target[np.arange(len(X)),data.target] = 1
In [ ]:
target[:10]
We are going to use train_test_split to split the data up into training, testing, and validation sets
In [ ]:
(feature_train, feature_test,
target_train, target_test) = cross_validation.train_test_split(X, target, test_size=.25)
In [ ]:
target_train.mean(0)
In [ ]:
target_test.mean(0)
If you need to split based on maintaining the same percentage of the target labels, you can use StratifiedKFold
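For example (a sketch; the call signature here is for the old cross_validation API used in this notebook and may differ in newer scikit-learn releases), stratified folds keep roughly the class proportions of data.target in each split:
In [ ]:
# each stratified fold keeps roughly the same proportion of each iris class
skf = cross_validation.StratifiedKFold(data.target, n_folds=3)
for train_index, test_index in skf:
    print(np.bincount(data.target[train_index]))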
We can now split the training data into training and validation sets
In [ ]:
(feature_train, feature_validate,
target_train, target_validate) = cross_validation.train_test_split(feature_train,
target_train,
test_size=.33)
In [ ]:
net = MLP(5, .1, outtype="softmax")
net.earlystopping(feature_train, target_train, feature_validate,
target_validate, disp=False)
In [ ]:
net.confusion_matrix(feature_test, target_test)
In [ ]:
net.confusion_matrix(feature_test, target_test, summary=False)
To derive these update rules, consider the sum-of-squares error at the output,
$$E=\frac{1}{2}\sum_k\left(t_k-y_k\right)^2$$
where, for the MLP, $y_k=g\left(\sum_j w_{jk}a_j\right)$ and $a_j$ is the output from the hidden layer neurons
First, consider weights $w_{ik}$ that connect the inputs directly to linear outputs, $y_k=\sum_j w_{jk}x_j$. Note that
$$\frac{\partial t_k}{\partial w_{ik}}=0$$
and that in
$$\frac{\partial}{\partial w_{ik}}\sum_j w_{jk}x_j$$
only the term with $j=i$ is non-zero, thus
$$\frac{\partial}{\partial w_{ik}}\sum_j w_{jk}x_j=x_i$$
so that we have
$$\frac{\partial E}{\partial w_{ik}} = (t_k-y_k)\left(-x_i\right)$$
To make our errors smaller, we follow the gradient "downhill" such that (including the learning rate $\eta$)
$$w_{ik}\leftarrow w_{ik}+\eta(t_k-y_k)x_i$$
For the second-layer weights of the MLP, $h_k=\sum_l w_{lk}a_l$ is the input to output-layer neuron $k$,
and the output of output-layer neuron $k$ is
$$y_k = g(h_k)=g\left(\sum_j w_{jk}a_j\right)$$
Plugging in the derivatives we already have for $\delta_0$ gives
$$\begin{aligned} \delta_0 & = \frac{\partial E}{\partial g(h_k)}\frac{\partial g(h_k)}{\partial h_k} \cr & = \frac{\partial}{\partial g(h_k)}\left[\frac{1}{2}\sum_k\left(t_k-g\left(\sum_j w_{jk}a_j\right)\right)^2\right] \frac{\partial g(h_k)}{\partial h_k} \cr & = (g(h_k)-t_k)g^{\prime}(h_k) \cr & = (y_k - t_k)g^{\prime}(h_k) \end{aligned}$$
We already have $g^\prime(h_k)=y_k(1-y_k)$, so we can put it all together to give the update step for the second-layer weights
$$w_{jk}\leftarrow w_{jk} - \eta \frac{\partial E}{\partial w_{jk}}$$The missing piece is
$$\begin{aligned} \frac{\partial E}{\partial w_{jk}} &= \delta_0a_j \cr &= (y_k - t_k)y_k(1-y_k)a_j \end{aligned}$$
For the first-layer weights we also need how the input to output-layer neuron $k$ changes with the input to hidden-layer neuron $j$,
$$\frac{\partial h_k}{\partial h_j}=\frac{\partial}{\partial h_j}\sum_l w_{lk}\,g(h_l)$$
Noting that $\frac{\partial g(h_l)}{\partial h_j} = 0$ if $l\neq j$,
$$\begin{aligned} \frac{\partial h_k}{\partial h_j}&=w_{jk}g^{\prime}(h_j) \cr &=w_{jk}a_j(1-a_j) \end{aligned}$$
This gives a delta term
$$\delta_h = a_j(1-a_j)\sum_k \delta_0 w_{jk}$$So the update rule $v_{ij}\leftarrow v_{ij} - \eta\frac{\partial E}{\partial v_{ij}}$ needs
$$\frac{\partial E}{\partial v_{ij}} = a_j(1-a_j)(\sum_k \delta_0w_{jk})x_i$$
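As a sanity check on this result, we can compare the analytic gradient $\frac{\partial E}{\partial w_{jk}}=(y_k-t_k)\,y_k(1-y_k)\,a_j$ with a finite-difference estimate on a tiny second layer (a sketch with illustrative names, $\beta=1$, no bias node):
In [ ]:
rng = np.random.RandomState(0)
a = rng.rand(1, 3)           # hidden-layer outputs a_j for a single input
t = rng.rand(1, 2)           # targets t_k
W = rng.rand(3, 2) - 0.5     # second-layer weights w_jk
g = lambda h: 1. / (1. + np.exp(-h))
E = lambda W: 0.5 * np.sum((t - g(np.dot(a, W)))**2)

# analytic gradient from the derivation above
y = g(np.dot(a, W))
analytic = np.dot(a.T, (y - t) * y * (1 - y))

# central finite differences, one weight at a time
eps = 1e-6
numeric = np.zeros_like(W)
for j in range(W.shape[0]):
    for k in range(W.shape[1]):
        Wp, Wm = W.copy(), W.copy()
        Wp[j, k] += eps
        Wm[j, k] -= eps
        numeric[j, k] = (E(Wp) - E(Wm)) / (2 * eps)
np.allclose(analytic, numeric)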