In [2]:
import numpy as np

$\textbf{w}$: connection weights
$b$: neuron bias
$g(\cdot)$: activation function

activation function examples:
linear: $g(a) = a$
sigmoid: $g(a) = \text{sigm}(a) = \frac{1}{1+\text{exp}(-a)} = \frac{1}{1+e^{-a}}$
hyperbolic tangent: $g(a) = \text{tanh}(a) = \frac{\text{exp}(a)-\text{exp}(-a)}{\text{exp}(a)+\text{exp}(-a)} = \frac{\text{exp}(2a)-1}{\text{exp}(2a)+1} = \frac{e^{a}-e^{-a}}{e^{a}+e^{-a}} = \frac{e^{2a}-1}{e^{2a}+1}$
rectified linear function: $g(a) = \text{reclin}(a) = \text{max}(0,a)$


In [7]:
# activation function: rectified linear function
def g(a):
    """Rectified linear activation: g(a) = max(0, a), applied element-wise.

    Works on scalars and numpy arrays alike via np.maximum broadcasting.
    Note: the original used np.max(0, a), which treats `a` as the `axis`
    argument, and it also failed to return the result.
    """
    return np.maximum(0, a)

neuron pre-activation
$a(\textbf{x}) = b + \sum\limits_{i} w_{i}x_{i} = b + \textbf{w}^{T}\textbf{x}$


In [8]:
# neuron pre-activation (input)
def a(x):
    """Neuron pre-activation: a(x) = b + w^T x.

    Relies on module-level globals `w` (weight vector) and `b` (bias),
    which must be defined before calling — TODO confirm they are set
    elsewhere in the notebook.
    """
    pre_activation = b + w.T.dot(x)
    return pre_activation

neuron activation
$h(\textbf{x}) = g(a(\textbf{x}))$


In [9]:
# neuron activation (output)
def h(x):
    """Neuron activation (output): h(x) = g(a(x)).

    Composes the pre-activation `a` with the activation function `g`,
    both defined in earlier cells of this notebook.
    """
    pre = a(x)
    return g(pre)

$o(\cdot)$: output activation function

single hidden-layer

hidden layer pre-activation
$\textbf{a}(\textbf{x}) = b^{(1)} + \textbf{W}^{(1)}\textbf{x}$
$\big(\textbf{a}(\textbf{x})_{i} = b^{(1)}_{i} + \sum\limits_{j} W^{(1)}_{i,j}x_{j}\big)$

hidden layer activation
$\textbf{h}(\textbf{x}) = \textbf{g}(\textbf{a}(\textbf{x}))$

output layer activation
$f(\textbf{x}) = o\big(b^{(2)} + \textbf{w}^{(2)^{T}}\textbf{h}^{(1)}(\textbf{x})\big)$


for binary classification, can simply use sigmoid activation function

for multi-class classification, need 1 output per class

$p(y = c|\textbf{x}), c \in \{1,...,C\}$

softmax activation function: $\textbf{o}(\textbf{a}) = \text{softmax}(\textbf{a}) = \big[\frac{\text{exp}(a_{1})}{\sum\limits_{c}\text{exp}(a_{c})} ... \frac{\text{exp}(a_{C})}{\sum\limits_{c}\text{exp}(a_{c})}\big]^{T}$


multilayer with $L$ hidden layers

layer pre-activation for $k>0$, where $\textbf{h}^{(0)}(\textbf{x}) = \textbf{x}$
$\textbf{a}^{(k)}(\textbf{x}) = \textbf{b}^{(k)} + \textbf{W}^{(k)}\textbf{h}^{(k-1)}(\textbf{x})$

hidden layer activation ($k$ from 1 to $L$)
$\textbf{h}^{(k)}(\textbf{x}) = \textbf{g}(\textbf{a}^{(k)}(\textbf{x}))$

output layer activation ($k=L+1$)
$\textbf{h}^{(L+1)}(\textbf{x}) = \textbf{o}(\textbf{a}^{(L+1)}(\textbf{x})) = \textbf{f}(\textbf{x})$


empirical risk minimization (structural when using regularizer)

$\underset{\theta} {\mathrm{argmin}} \frac{1}{T}\sum\limits_{t}l(f(\textbf{x}^{(t)}; \boldsymbol\theta),y^{(t)}) + \lambda\Omega(\boldsymbol\theta)$

$\boldsymbol\theta \equiv \{\textbf{W}^{(1)},\textbf{b}^{(1)},...,\textbf{W}^{(L+1)},\textbf{b}^{(L+1)}\}$

$l$: a loss function

$\Omega(\boldsymbol\theta)$: regularizer (penalizes certain values of $\boldsymbol\theta$)

$\lambda$: hyperparameter that controls the balance between optimizing the average loss and optimizing the regularizer function


stochastic gradient descent

$\alpha$: hyperparameter learning rate

$\nabla_{\theta} l(f(\textbf{x}^{(t)}; \boldsymbol\theta),y^{(t)})$: computes parameter gradients

$\nabla_{\theta} \Omega(\boldsymbol\theta)$: gradient of regularizer with respect to $\boldsymbol\theta$

algorithm:

-initialize $\boldsymbol\theta$

-for $N$ iterations
--> for each training example $(x^{(t)},y^{(t)})$
----> $\Delta = -\nabla_{\theta} l(f(\textbf{x}^{(t)}; \boldsymbol\theta),y^{(t)}) - \lambda \nabla_{\theta} \Omega(\boldsymbol\theta)$
----> $\boldsymbol\theta \leftarrow \boldsymbol\theta + \alpha \Delta$


In [ ]: