This code is provided as supplementary material of the lecture Machine Learning and Optimization in Communications (MLOC).
This code illustrates:
* Approximation of a one-dimensional function by a two-layer perceptron with a single hidden layer of ReLU neurons
* Training of the perceptron parameters by gradient descent using the Adam optimizer
In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
function_select = 4

def myfun(x):
    # All candidate functions are evaluated; the one chosen by function_select is returned.
    functions = {
        1: np.power(x,2), # quadratic function
        2: np.sin(x),     # sine
        3: np.sign(x),    # signum
        4: np.exp(x),     # exponential function
        5: np.abs(x)      # absolute value
    }
    return functions.get(function_select)
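With the setting above (function_select = 4), myfun simply evaluates np.exp elementwise. A quick check, for illustration only (not part of the original notebook):
In [ ]:
# Quick check (illustrative): with function_select = 4, myfun returns exp(x)
print(myfun(np.array([0.0, 1.0])))   # expected: [1.         2.71828183]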
In [3]:
# Generate training data.
N = 32
x_train = np.linspace(-2, 2, num=N).reshape(-1,1)
# Generate the evaluation data.
# (can exceed the range of the training data to evaluate the prediction capabilities)
x_eval = np.linspace(-4, 4, num=4*N).reshape(-1,1)
Function definitions. Here we consider a hard-coded two-layer perceptron with one hidden layer, using the rectified linear unit (ReLU) as activation function, and a linear output layer. The output of the perceptron can hence be written as \begin{equation*} \hat{f}(x,\boldsymbol{\theta}) = \sum_{i=1}^m v_i\sigma(x+b_i) \end{equation*} where $\sigma(x) = \text{ReLU}(x) = \begin{cases}0 & \text{if }x < 0 \\x & \text{otherwise}\end{cases}$.
Instead of specifying all the parameters individually, we group them in a single vector $\boldsymbol{\theta}$ with \begin{equation*} \boldsymbol{\theta}=\begin{pmatrix} v_1 & b_1 & v_2 & b_2 & \cdots & v_m & b_m\end{pmatrix} \end{equation*}
In [4]:
# ReLU activation function
def sigma(x):
    return np.maximum(0,x)

# First-order derivative of sigma (here, of the ReLU)
def sigma_prime(x):
    retval = np.zeros(x.shape)
    retval[x >= 0] = 1
    return retval
def MLP(x,theta):
    # Output of the two-layer perceptron: sum of the m weighted ReLU neurons
    y = np.zeros(x.shape)
    for k in range(0,len(theta),2):
        y += theta[k]*sigma(x+theta[k+1])
    return y
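As a small sanity check (not part of the original notebook), the MLP function should reproduce the formula above; for a single hidden neuron with the illustrative values $v_1 = 2$ and $b_1 = -1$, the output is $2\,\sigma(x-1)$:
In [ ]:
# Sanity check (illustrative values): a single hidden neuron, theta = (v1, b1) = (2, -1)
theta_test = np.array([2.0, -1.0])
x_test = np.array([-2.0, 0.0, 1.5])
print(MLP(x_test, theta_test))                        # output of the helper function
print(theta_test[0]*sigma(x_test + theta_test[1]))    # v1*sigma(x + b1), computed directly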
The cost function is the mean-squared error, i.e., \begin{equation*} J(\boldsymbol{\theta},\mathbb{X}^{[\text{train}]},\mathbb{Y}^{[\text{train}]}) = \frac{1}{N}\sum_{i=1}^N\left(\hat{f}(x_i^{[\text{train}]},\boldsymbol{\theta}) - y_i^{[\text{train}]}\right)^2 \end{equation*} The gradient of the cost function can be computed by hand as \begin{equation*} \nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta},\mathbb{X}^{[\text{train}]},\mathbb{Y}^{[\text{train}]}) = \frac{2}{N}\sum_{i=1}^N\left(\hat{f}(x_i^{[\text{train}]},\boldsymbol{\theta}) - y_i^{[\text{train}]}\right)\begin{pmatrix} \sigma(x_i^{[\text{train}]}+\theta_2) \\ \theta_1\sigma^\prime(x_i^{[\text{train}]}+\theta_2) \\ \sigma(x_i^{[\text{train}]}+\theta_4) \\ \theta_3\sigma^\prime(x_i^{[\text{train}]}+\theta_4) \\ \vdots \\ \sigma(x_i^{[\text{train}]}+\theta_{2m}) \\ \theta_{2m-1}\sigma^\prime(x_i^{[\text{train}]}+\theta_{2m})\end{pmatrix} \end{equation*} where \begin{equation*} \sigma^\prime(x) = \begin{cases} 0 & \text{if }x < 0 \\ 1 & \text{otherwise}\end{cases} \end{equation*}
In [5]:
def cost_function(x, y, theta):
    # Cost function: mean-squared error between the network output for the
    # training inputs x and the targets y.
    difference = np.array([MLP(e, theta) for e in x]) - y
    return np.dot(difference.T, difference)/len(x)
# gradient of the cost function
def cost_function_gradient(x, y, theta):
    gradient = np.zeros(len(theta))
    for k in range(len(x)):
        ig = np.zeros(len(theta))
        for j in range(0,len(theta),2):
            ig[j] = sigma(x[k]+theta[j+1])
            ig[j+1] = theta[j]*sigma_prime(x[k]+theta[j+1])
        gradient += 2*(MLP(x[k],theta) - y[k])*ig
    return gradient / len(x)
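To gain confidence in the hand-derived gradient, it can be compared against a central finite-difference approximation. The following check is only a sketch (not part of the original notebook); the model size, random seed, and step size are arbitrary choices.
In [ ]:
# Numerical gradient check (illustrative sketch): central finite differences vs. the analytic gradient.
np.random.seed(0)
theta_check = np.random.randn(6)     # small model with m = 3 hidden neurons
y_check = myfun(x_train)
eps = 1e-6

numerical = np.zeros(len(theta_check))
for i in range(len(theta_check)):
    theta_plus = theta_check.copy()
    theta_minus = theta_check.copy()
    theta_plus[i] += eps
    theta_minus[i] -= eps
    numerical[i] = (cost_function(x_train, y_check, theta_plus)
                    - cost_function(x_train, y_check, theta_minus)).item() / (2*eps)

analytic = cost_function_gradient(x_train, y_check, theta_check)
print(np.max(np.abs(numerical - analytic)))   # should be close to zero (up to finite-difference accuracy)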
Here, we use the Adam optimization algorithm [1] to find a good configuration of the parameters $\boldsymbol{\theta}$. See also the notebook MLP_introduction.ipynb for a description of the Adam optimizer.
[1] D. P. Kingma and J. L. Ba, "Adam: A Method for Stochastic Optimization," published at ICLR 2015, available at https://arxiv.org/pdf/1412.6980.pdf
In [6]:
def approx_1d_function_adam(x_train, theta_initial, epochs):
    y_train = myfun(x_train)
    theta = theta_initial

    # Adam hyperparameters (default values recommended in [1])
    beta1 = 0.9
    beta2 = 0.999
    alpha = 0.001
    epsilon = 1e-8

    m = np.zeros(theta.shape)   # first moment estimate
    v = np.zeros(theta.shape)   # second moment estimate
    t = 0
    for k in range(epochs):
        t += 1
        g = cost_function_gradient(x_train, y_train, theta)
        m = beta1*m + (1-beta1)*g
        v = beta2*v + (1-beta2)*(g**2)
        mhat = m/(1-beta1**t)   # bias-corrected first moment
        vhat = v/(1-beta2**t)   # bias-corrected second moment
        theta = theta - alpha*mhat/(np.sqrt(vhat)+epsilon)
    return theta
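If you want to monitor convergence, the following variant of the training loop (a sketch, not part of the original notebook) additionally records the training cost after every iteration, e.g., for plotting with plt.semilogy:
In [ ]:
# Sketch: same Adam loop as above, but also records the cost per iteration.
def approx_1d_function_adam_with_history(x_train, theta_initial, epochs):
    y_train = myfun(x_train)
    theta = theta_initial
    beta1, beta2, alpha, epsilon = 0.9, 0.999, 0.001, 1e-8
    m = np.zeros(theta.shape)
    v = np.zeros(theta.shape)
    cost_history = []
    for t in range(1, epochs + 1):
        g = cost_function_gradient(x_train, y_train, theta)
        m = beta1*m + (1-beta1)*g
        v = beta2*v + (1-beta2)*(g**2)
        mhat = m/(1-beta1**t)
        vhat = v/(1-beta2**t)
        theta = theta - alpha*mhat/(np.sqrt(vhat)+epsilon)
        cost_history.append(cost_function(x_train, y_train, theta).item())
    return theta, cost_history

# Usage example (commented out to keep the original workflow unchanged):
# theta_tmp, cost_history = approx_1d_function_adam_with_history(x_train, np.random.randn(2*10), 1000)
# plt.semilogy(cost_history)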
Carry out the optimization using 10000 iterations of Adam, after specifying the number of hidden neurons $m$.
In [7]:
epochs = 10000
m = 10
theta_initial = np.random.randn(2*m)
theta_adam = approx_1d_function_adam(x_train, theta_initial, epochs)
# Compute the MLP prediction on the evaluation set
predictions = MLP(x_eval, theta_adam)
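As a quick quantitative check of the fit (not part of the original notebook), the training mean-squared error of the trained model can be evaluated with the cost function defined above:
In [ ]:
# Illustrative check: report the final training MSE of the trained model.
final_cost = cost_function(x_train, myfun(x_train), theta_adam).item()
print('Training MSE after %d Adam iterations: %g' % (epochs, final_cost))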
In [8]:
fig = plt.figure(1, figsize=(18,6))
font = {'size' : 14}
plt.rc('font', **font)
plt.rc('text', usetex=True)
plt.rc('text.latex', preamble=r'\usepackage{amsmath}\usepackage{amssymb}\usepackage{bm}')
ax = fig.add_subplot(1, 2, 1)
plt.plot(x_eval, myfun(x_eval), '-', color='royalblue', linewidth=1.0)
plt.plot(x_eval, predictions, '-', label='output', color='darkorange', linewidth=2.0)
plt.plot(x_train, myfun(x_train), '.', color='royalblue',markersize=14)
plt.xlim((min(x_train),max(x_train)))
plt.ylim((-0.5,8))
plt.grid(which='both');
plt.rcParams.update({'font.size': 14})
plt.xlabel('$x$');
plt.ylabel('$y$')
plt.title('%d ReLU-neurons in hidden layer with %d iterations of Adam' % (m,epochs))
plt.legend(['Function $f(x)$', r'MLP output $\hat{f}(x,\bm{\theta})$', 'Training set'])
ax = fig.add_subplot(1, 2, 2)
for k in range(0,len(theta_adam),2):
    plt.plot(x_eval, [theta_adam[k]*sigma(x + theta_adam[k+1]) for x in x_eval], '--', label='ReLU %d' % (k//2), linewidth=2.0)
plt.grid(which='both');
plt.xlim((min(x_train),max(x_train)))
plt.xlabel('$x$');
plt.ylabel('$y$')
plt.title('Weighted output of the %d neurons' % m)
#plt.savefig('MLP_ReLU_m%d_fun%d.pdf' % (m,function_select),bbox_inches='tight')
plt.show()
In [89]:
from matplotlib import animation, rc
from IPython.display import HTML
from matplotlib.animation import PillowWriter # Disable if you don't want to save any GIFs.
fig, ax = plt.subplots(1, figsize=(8,6))
# Animation function. This is called sequentially.
def animate(i):
    ax.clear()
    ax.plot(x_eval, myfun(x_eval), '-', color='royalblue', linewidth=1.0)
    ax.plot(x_train, myfun(x_train), '.', color='royalblue', markersize=8)
    # Accumulate the weighted outputs of the first i neurons.
    function_agg = np.zeros(len(x_eval))
    for k in range(0,i):
        part_relu = np.array([theta_adam[2*k]*sigma(x[0] + theta_adam[2*k+1]) for x in x_eval])
        ax.plot(x_eval, part_relu, '--', color='gray', linewidth=0.5)
        function_agg += part_relu
    ax.plot(x_eval, function_agg, '-', color='darkorange', linewidth=3.0)
    ax.grid(which='both')
    ax.set_title("%d components" % i, fontsize=16)
    ax.set_xlim((-4,4))
    ax.set_ylim((-2,8))
    ax.set_xlabel(r'$x$')
    ax.set_ylabel(r'$y=f(x)$')
    return fig,
# Call the animator.
anim = animation.FuncAnimation(fig, animate, frames=1+len(theta_adam)//2, interval=2000, blit=True, repeat=False)
# If you want to save the animation, use the following line.
#anim.save('basic_animation_test_fun%d.gif' % function_select, writer=PillowWriter(fps=.4))
In [45]:
HTML(anim.to_html5_video())
Out[45]:
In [ ]: