In [1]:
%load_ext tikzmagic
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import HTML
def css_styling():
styles = """
<style>
.output_png { text-align: center; }
</style>
"""
return HTML(styles)
css_styling()
Mark 1 perceptron (Frank Rosenblatt, 1957):
The New York Times, 1958:
[...] the embryo of an electronic computer that the Navy expects will be able to walk, talk, see, write, reproduce itself and be conscious of its existence.
In [835]:
# Embed archival footage of Rosenblatt's Mark I perceptron in the notebook.
from IPython.display import YouTubeVideo
YouTubeVideo('cNxadbrN_aI', width=800, height=600)
Out[835]:
Cykl uczenia perceptronu (w sumie 2000 "epok"):
In [836]:
# Plot the perceptron's hard-threshold (step) activation: the output jumps
# from -1 to 1 at the activation threshold theta_0.
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rcParams.update({'font.size': 16})

step_x = [-1, -.23, 1]
step_y = [-1, -1, 1]

plt.figure(figsize=(10,6))
plt.ylim(-1.2, 1.2)
plt.xlim(-1.2, 1.2)

# Dashed guide lines at the two saturation levels y = 1 and y = -1.
for level in (1, -1):
    plt.plot([-2, 2], [level, level], color='black', ls="dashed")

plt.step(step_x, step_y, lw=3)

# Hide the top/right frame and move the remaining spines so that the
# axes cross at the origin.
ax = plt.gca()
for side in ('right', 'top'):
    ax.spines[side].set_color('none')
ax.xaxis.set_ticks_position('bottom')
ax.spines['bottom'].set_position(('data',0))
ax.yaxis.set_ticks_position('left')
ax.spines['left'].set_position(('data',0))

# Arrow marking the threshold position theta_0 on the x axis.
plt.annotate(r'$\theta_0$',
             xy=(-.23,0), xycoords='data',
             xytext=(-50, +50), textcoords='offset points', fontsize=26,
             arrowprops=dict(arrowstyle="->"))
plt.show()
gdzie $z = \theta_0x_0 + \ldots + \theta_nx_n$. Niech $\theta_0$ to próg aktywacji, ustalamy $x_0 = 1$.
In [837]:
%%tikz -l arrows,automata,positioning,shapes,shapes.geometric,fit -f png -s 2000,1400
% Single perceptron: inputs (with x_0 fixed to 1 for the bias term),
% the weighted-sum input function z, and a hard-threshold activation g(z).
\tikzstyle{every node}=[font=\large]
\tikzstyle{every path}=[line width=1pt]
% Input layer: constant 1 plus features x_1..x_n, stacked vertically.
\node[state] (x0) {$1$};
\node[state] (x1) [below=0.5cm of x0] {$x_1$};
\node[state] (x2) [below=0.5cm of x1] {$x_2$};
\node[state, draw=none,fill=none] (dots) [below=0.5cm of x2] {$\cdots$};
\node[state] (xn) [below=0.5cm of dots] {$x_n$};
% Input function: weighted sum over all inputs (bias included via x_0 = 1).
\node[state,circle,label=above:{Funkcja wej\'{s}cia}] (sum) [right=2cm of x2] {$z=\displaystyle\sum_{i=0}^{n}\theta_ix_i$};
% Activation function: hard threshold.
% NOTE(review): since theta_0 * x_0 is already part of z, the condition is
% usually written z > 0 rather than z > theta_0 — confirm intent.
\node[state,rectangle,label=above:{Funkcja aktywacji}] (g) [right=of sum]
{$g(z) = \left\{\begin{array}{rl} 1 & \textrm{gdy } z > \theta_0 \\ -1 & \textrm{wpp.} \end{array}\right.$};
\node[state] (output) [right=of g] {Wyj\'{s}cie};
% Edges from each input to the summation node, labelled with the weights.
\path[->]
(x0) edge node [above, pos=0.4] {$\theta_0$} (sum)
(x1) edge node [above, pos=0.4] {$\theta_1$} (sum)
(x2) edge node [above, pos=0.4] {$\theta_2$} (sum)
(xn) edge node [above, pos=0.4] {$\theta_n$} (sum)
(sum) edge node {} (g)
(g) edge node {} (output);
% Dashed boxes grouping the feature layer (0) and the neuron layer (1).
\node [draw,dashed, fit= (x0) (x1) (x2) (dots) (xn),label=above:Cechy, label=below:{Warstwa 0}] {};
\node [draw,dashed, fit= (sum) (g) (output),label=above:Neuron, label=below:{Warstwa 1}, inner sep=0.65cm] {};
Pytania:
Reguła perceptronowa:
$$\theta_j := \theta_j + \Delta \theta_j$$

Poprawnie zaklasyfikowane:
Skoro trafiłeś, to nic nie zmieniaj!
Reguła perceptronowa:
$$\theta_j := \theta_j + \Delta \theta_j$$

Niepoprawnie zaklasyfikowane:
Przesuń wagi w odpowiednią stronę:
In [838]:
# Side-by-side comparison: the perceptron's hard threshold (left) versus a
# smooth tanh activation (right) that crosses zero at the same theta_0.
import numpy as np  # hoisted: was imported mid-cell between the subplots


def _prepare_axes():
    """Set shared limits and draw dashed guides at the saturation levels y = +/-1."""
    plt.ylim(-1.2, 1.2)
    plt.xlim(-2.2, 2.2)
    plt.plot([-2, 2], [1, 1], color='black', ls="dashed")
    plt.plot([-2, 2], [-1, -1], color='black', ls="dashed")


def _finish_axes():
    """Center the spines at the origin and annotate the threshold theta_0."""
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.spines['bottom'].set_position(('data', 0))
    ax.yaxis.set_ticks_position('left')
    ax.spines['left'].set_position(('data', 0))
    plt.annotate(r'$\theta_0$',
                 xy=(-.23, 0), xycoords='data',
                 xytext=(-50, +50), textcoords='offset points', fontsize=26,
                 arrowprops=dict(arrowstyle="->"))


plt.figure(figsize=(16, 7))

# Left panel: hard threshold (step) activation.
plt.subplot(121)
_prepare_axes()
plt.step([-2, -.23, 2], [-1, -1, 1], lw=3)
_finish_axes()

# Right panel: smooth tanh activation, shifted so it crosses zero at theta_0.
plt.subplot(122)
_prepare_axes()
x2 = np.linspace(-2, 2, 100)
plt.plot(x2, np.tanh(x2 + 0.23), lw=3)
_finish_axes()

plt.show()
In [839]:
%%tikz -l arrows,automata,positioning,shapes,shapes.geometric,fit -f png -s 2000,1400
% Same perceptron diagram, but with the identity (linear) activation
% g(z) = z — i.e. the linear-regression / Adaline variant.
\tikzstyle{every node}=[font=\large]
\tikzstyle{every path}=[line width=1pt]
% Input layer: constant 1 plus features x_1..x_n.
\node[state] (x0) {$1$};
\node[state] (x1) [below=0.5cm of x0] {$x_1$};
\node[state] (x2) [below=0.5cm of x1] {$x_2$};
\node[state, draw=none,fill=none] (dots) [below=0.5cm of x2] {$\cdots$};
\node[state] (xn) [below=0.5cm of dots] {$x_n$};
% Input function: weighted sum of all inputs.
\node[state,circle,label=above:{Funkcja wej\'{s}cia}] (sum) [right=2cm of x2] {$z=\displaystyle\sum_{i=0}^{n}\theta_ix_i$};
% Activation function: identity.
\node[state,rectangle,label=above:{Funkcja aktywacji}] (g) [right=of sum]
{$g(z) = z$};
\node[state] (output) [right=of g] {Wyj\'{s}cie};
% Weighted edges from the inputs into the summation node.
\path[->]
(x0) edge node [above, pos=0.4] {$\theta_0$} (sum)
(x1) edge node [above, pos=0.4] {$\theta_1$} (sum)
(x2) edge node [above, pos=0.4] {$\theta_2$} (sum)
(xn) edge node [above, pos=0.4] {$\theta_n$} (sum)
(sum) edge node {} (g)
(g) edge node {} (output);
% Dashed boxes grouping the feature layer (0) and the neuron layer (1).
\node [draw,dashed, fit= (x0) (x1) (x2) (dots) (xn),label=above:Cechy, label=below:{Warstwa 0}] {};
\node [draw,dashed, fit= (sum) (g) (output),label=above:Neuron, label=below:{Warstwa 1}, inner sep=0.65cm] {};
In [840]:
%%tikz -l arrows,automata,positioning,shapes,shapes.geometric,fit -f png -s 2000,1400
% Same perceptron diagram with the logistic (sigmoid) activation
% g(z) = 1 / (1 + e^{-z}) — i.e. logistic regression as a single neuron.
\tikzstyle{every node}=[font=\large]
\tikzstyle{every path}=[line width=1pt]
% Input layer: constant 1 plus features x_1..x_n.
\node[state] (x0) {$1$};
\node[state] (x1) [below=0.5cm of x0] {$x_1$};
\node[state] (x2) [below=0.5cm of x1] {$x_2$};
\node[state, draw=none,fill=none] (dots) [below=0.5cm of x2] {$\cdots$};
\node[state] (xn) [below=0.5cm of dots] {$x_n$};
% Input function: weighted sum of all inputs.
\node[state,circle,label=above:{Funkcja wej\'{s}cia}] (sum) [right=2cm of x2] {$z=\displaystyle\sum_{i=0}^{n}\theta_ix_i$};
% Activation function: logistic sigmoid.
\node[state,rectangle,label=above:{Funkcja aktywacji}] (g) [right=of sum]
{$g(z) = \displaystyle\frac{1}{1+e^{-z}}$};
\node[state] (output) [right=of g] {Wyj\'{s}cie};
% Weighted edges from the inputs into the summation node.
\path[->]
(x0) edge node [above, pos=0.4] {$\theta_0$} (sum)
(x1) edge node [above, pos=0.4] {$\theta_1$} (sum)
(x2) edge node [above, pos=0.4] {$\theta_2$} (sum)
(xn) edge node [above, pos=0.4] {$\theta_n$} (sum)
(sum) edge node {} (g)
(g) edge node {} (output);
% Dashed boxes grouping the feature layer (0) and the neuron layer (1).
\node [draw,dashed, fit= (x0) (x1) (x2) (dots) (xn),label=above:Cechy, label=below:{Warstwa 0}] {};
\node [draw,dashed, fit= (sum) (g) (output),label=above:Neuron, label=below:{Warstwa 1}, inner sep=0.65cm] {};
Funkcja kosztu (entropia krzyżowa): $$\begin{eqnarray} J(\theta) &=& -\frac{1}{m} \sum_{i=1}^{m} [y^{(i)}\log P(1|x^{(i)},\theta) \\ && + (1-y^{(i)})\log(1-P(1|x^{(i)},\theta))]\end{eqnarray}$$
Po obliczeniu $\nabla J(\theta)$, zwykły SGD
In [841]:
%%tikz -l arrows,automata,positioning,shapes,shapes.geometric,fit -f png -s 1000,600
% Multi-class (softmax) layer: one neuron per class c in {0,1,2}, each with
% its own weight vector theta^{(c)}; outputs are class probabilities P(c).
\tikzstyle{every node}=[font=\large]
\tikzstyle{every path}=[line width=1pt]
% Input layer: constant 1 plus features x_1..x_n.
\node[state] (x0) {$1$};
\node[state] (x1) [below=0.5cm of x0] {$x_1$};
\node[state] (x2) [below=0.5cm of x1] {$x_2$};
\node[state, draw=none,fill=none] (dots) [below=0.5cm of x2] {$\cdots$};
\node[state] (xn) [below=0.5cm of dots] {$x_n$};
% One summation+activation node per class.
\node[state,circle] (sum1) [right=4cm of x1] {$g(\sum)$};
\node[state,circle] (sum2) [right=4cm of x2] {$g(\sum)$};
\node[state,circle] (sum3) [right=4cm of dots] {$g(\sum)$};
% Output probabilities of the three classes.
\node[state, draw=none,fill=none] (p1) [right=0.5cm of sum1] {$P(c=0)$};
\node[state, draw=none,fill=none] (p2) [right=0.5cm of sum2] {$P(c=1)$};
\node[state, draw=none,fill=none] (p3) [right=0.5cm of sum3] {$P(c=2)$};
% Only the edges into the first neuron are drawn solid and labelled;
% the remaining connections are dotted to keep the figure readable.
\path[->]
(x0) edge node [above, pos=0.5] {$\theta^{(0)}_{0}$} (sum1)
(x1) edge node [above, pos=0.5] {$\theta^{(0)}_{1}$} (sum1)
(x2) edge node [above, pos=0.5] {$\theta^{(0)}_{2}$} (sum1)
(xn) edge node [above, pos=0.5] {$\theta^{(0)}_{n}$} (sum1);
\path[-, thin, dotted]
(x0) edge node {} (sum2)
(x1) edge node {} (sum2)
(x2) edge node {} (sum2)
(xn) edge node {} (sum2)
(x0) edge node {} (sum3)
(x1) edge node {} (sum3)
(x2) edge node {} (sum3)
(xn) edge node [below, pos=0.5] {$\theta^{(2)}_{n}$} (sum3);
% Layer boxes: features (layer 0) and the softmax layer (layer 1).
\node [draw, dashed, fit= (x0) (x1) (x2) (dots) (xn),label=above:Cechy, label=below:{Warstwa 0}] (w0) {};
\node [draw, dashed, fit= (sum1) (sum2) (sum3), label=below:{Warstwa 1}, label=above:{$g(\cdot) = \mathrm{softmax}(\cdot)$}] (w1) {};
% Weight matrix Theta: column k holds the weight vector of class k.
\node[draw=none,fill=none] (theta) [below=1cm of w1]
{$\Theta = \left[%
\begin{array}{ccc} %
\theta_0^{(0)} & \theta_0^{(1)} & \theta_0^{(2)} \\%
\theta_1^{(0)} & \theta_1^{(1)} & \theta_1^{(2)} \\%
\vdots & \vdots & \vdots \\%
\theta_n^{(0)} & \theta_n^{(1)} & \theta_n^{(2)} \\%
\end{array} \right]$
};
Funkcja kosztu (przyjmując model regresji binarnej): $$\begin{eqnarray} J(\theta^{(k)}) &=& -\frac{1}{m} \sum_{i=1}^{m} [y^{(i)}\log P(k|x^{(i)},\theta^{(k)}) \\ && + (1-y^{(i)})\log P(\neg k|x^{(i)},\theta^{(k)})]\end{eqnarray}$$
Po obliczeniu $\nabla J(\theta)$, c-krotne uruchomienie SGD, zastosowanie $\mathrm{softmax}(X)$ do niezależnie uzyskanych klasyfikatorów binarnych.
Wieloklasowa funkcja kosztu $J(\Theta)$ (kategorialna entropia krzyżowa): $$ J(\Theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{c} \delta({y^{(i)},k}) \log P(k|x^{(i)},\Theta) $$
Gradient $\nabla J(\Theta)$: $$ \dfrac{\partial J(\Theta)}{\partial \Theta_{j,k}} = -\frac{1}{m}\sum_{i = 1}^{m} (\delta({y^{(i)},k}) - P(k|x^{(i)}, \Theta)) x^{(i)}_j $$
Liczymy wszystkie wagi jednym uruchomieniem SGD
In [842]:
%%tikz -l arrows,automata,positioning,shapes,shapes.geometric,fit -f png -s 1000,600
% Three-layer feed-forward network (MLP) with tanh activations: inputs x,
% two hidden layers of 3 units each, a single output unit, and explicit
% bias nodes (the "1" circles) with weights beta per layer. The forward
% equations are shown under each layer box.
% Input layer.
\node[state] (x1) {$x_1$};
\node[state] (x2) [below=0.5cm of x1] {$x_2$};
\node[state, draw=none,fill=none] (dots) [below=0.5cm of x2] {$\cdots$};
\node[state] (xn) [below=0.5cm of dots] {$x_n$};
% Hidden layer 1 activations a^{(1)} with its bias node b1.
\node[state,circle] (a1) [below right=-0.33cm and 3cm of x1] {$a^{(1)}_1$};
\node[state,circle] (a2) [below=0.5cm of a1] {$a^{(1)}_2$};
\node[state,circle] (a3) [below=0.5cm of a2] {$a^{(1)}_3$};
\node[state] (b1) [above left=1cm and 1cm of a1] {$1$};
% Hidden layer 2 activations a^{(2)} with its bias node b2.
\node[state,circle] (a21) [right=2cm of a1] {$a^{(2)}_1$};
\node[state,circle] (a22) [below=0.5cm of a21] {$a^{(2)}_2$};
\node[state,circle] (a23) [below=0.5cm of a22] {$a^{(2)}_3$};
\node[state] (b2) [above left=1cm and 1cm of a21] {$1$};
% Output layer: single unit a^{(3)}_1 with its bias node b3.
\node[state,circle] (a31) [right=2cm of a22] {$a^{(3)}_1$};
\node[state] (b3) [right=2cm of b2] {$1$};
% Layer 1 connections: only the edges into a^{(1)}_1 are solid/labelled;
% the remaining connections are dotted to reduce clutter.
\path[-]
(b1) edge node [above=.2cm, pos=0.5] {$\beta^{(1)}_{1}$} (a1)
(x1) edge node [above, pos=0.5] {$\Theta^{(1)}_{1,1}$} (a1)
(x2) edge node [above, pos=0.5] {$\Theta^{(1)}_{2,1}$} (a1)
(xn) edge node [above, pos=0.5] {$\Theta^{(1)}_{n,1}$} (a1);
\path[-, thin, dotted]
(b1) edge node {} (a2)
(x1) edge node {} (a2)
(x2) edge node {} (a2)
(xn) edge node {} (a2)
(b1) edge node {} (a3)
(x1) edge node {} (a3)
(x2) edge node {} (a3)
(xn) edge node [below, pos=0.5] {$\Theta^{(1)}_{n,3}$} (a3);
% Layer 2 connections.
\path[-]
(b2) edge node [above=.2cm, pos=0.5] {$\beta^{(2)}_{1}$} (a21)
(a1) edge node [above, pos=0.5] {$\Theta^{(2)}_{1,1}$} (a21)
(a2) edge node [above, pos=0.5] {$\Theta^{(2)}_{2,1}$} (a21)
(a3) edge node [above, pos=0.5] {$\Theta^{(2)}_{3,1}$} (a21);
\path[-, thin, dotted]
(b2) edge node {} (a22)
(a1) edge node {} (a22)
(a2) edge node {} (a22)
(a3) edge node {} (a22)
(b2) edge node {} (a23)
(a1) edge node {} (a23)
(a2) edge node {} (a23)
(a3) edge node [below, pos=0.5] {$\Theta^{(2)}_{3,3}$} (a23);
% Layer 3 (output) connections.
\path[-]
(b3) edge node [above=.5cm, pos=0.5] {$\beta^{(3)}_{1}$} (a31)
(a21) edge node [above, pos=0.5] {$\Theta^{(3)}_{1,1}$} (a31)
(a22) edge node [above, pos=0.5] {$\Theta^{(3)}_{2,1}$} (a31)
(a23) edge node [above, pos=0.5] {$\Theta^{(3)}_{3,1}$} (a31);
% Dashed boxes around each layer.
\node [draw, dashed, fit= (x1) (x2) (dots) (xn)] (w0) {};
\node [draw, dashed, fit= (a1) (a2) (a3)] (w1) {};
\node [draw, dashed, fit= (a21) (a22) (a23)] (w2) {};
\node [draw, dashed, fit= (a31)] (w3) {};
% Forward-pass equations displayed under the corresponding layers.
\node [draw, draw=none, fill=none, below=0.5cm of w0] (mw0) {\small$a^{(0)}=x$};
\node [draw, draw=none, fill=none, right=1.1cm of mw0] (mw1)
{\small$\begin{array}{l}z^{(1)} = a^{(0)} \Theta^{(1)} + \beta^{(1)}\\g^{(1)}(x)=\tanh(x)\\a^{(1)}=g^{(1)}(z^{(1)})\end{array}$};
\node [draw, draw=none, fill=none, right=-.3cm of mw1] (mw2)
{\small$\begin{array}{l}z^{(2)} = a^{(1)} \Theta^{(2)} + \beta^{(2)}\\g^{(2)}(x)=\tanh(x)\\a^{(2)}=g^{(2)}(z^{(2)})\end{array}$};
\node [draw, draw=none, fill=none, right=-.3cm of mw2] (mw3)
{\small$\begin{array}{l}z^{(3)} = a^{(2)} \Theta^{(3)} + \beta^{(3)}\\g^{(3)}(x)=\tanh(x)\\a^{(3)}=g^{(3)}(z^{(3)})\end{array}$};
Niech: $$\Theta = (\Theta^{(1)},\Theta^{(2)},\Theta^{(3)},\beta^{(1)},\beta^{(2)},\beta^{(3)})$$
Funkcja sieci neuronowej z grafiki:
Dla jednego przykładu (x,y):
In [843]:
%%tikz -l arrows,automata,positioning,shapes,shapes.geometric,fit -f png -s 1000,600
% Same three-layer tanh MLP as above, extended with the backpropagation
% error terms delta^{(l)}: the forward equations run left-to-right below
% the layers, and the delta formulas propagate right-to-left.
% Input layer.
\node[state] (x1) {$x_1$};
\node[state] (x2) [below=0.5cm of x1] {$x_2$};
\node[state, draw=none,fill=none] (dots) [below=0.5cm of x2] {$\cdots$};
\node[state] (xn) [below=0.5cm of dots] {$x_n$};
% Hidden layer 1 with its bias node.
\node[state,circle] (a1) [below right=-0.33cm and 3cm of x1] {$a^{(1)}_1$};
\node[state,circle] (a2) [below=0.5cm of a1] {$a^{(1)}_2$};
\node[state,circle] (a3) [below=0.5cm of a2] {$a^{(1)}_3$};
\node[state] (b1) [above left=1cm and 1cm of a1] {$1$};
% Hidden layer 2 with its bias node.
\node[state,circle] (a21) [right=2cm of a1] {$a^{(2)}_1$};
\node[state,circle] (a22) [below=0.5cm of a21] {$a^{(2)}_2$};
\node[state,circle] (a23) [below=0.5cm of a22] {$a^{(2)}_3$};
\node[state] (b2) [above left=1cm and 1cm of a21] {$1$};
% Output layer with its bias node.
\node[state,circle] (a31) [right=2cm of a22] {$a^{(3)}_1$};
\node[state] (b3) [right=2cm of b2] {$1$};
% Backpropagated error terms; (1 - tanh^2 z) is the derivative of tanh,
% and \odot denotes the element-wise (Hadamard) product.
\node[draw=none, fill=none] (delta3) [below right=0.5cm and -1cm of a31]
{$\delta^{(3)}=(a^{(3)}-y) \odot (1-\tanh^2(z^{(3)}))$};
\node[draw=none, fill=none] (delta2) [below right=0.5cm and -1cm of a23]
{$\delta^{(2)}= \delta^{(3)}(\Theta^{(3)})^T \odot (1-\tanh^2(z^{(2)}))$};
\node[draw=none, fill=none] (delta1) [below right=1.5cm and -4cm of a3]
{$\delta^{(1)}= \delta^{(2)}(\Theta^{(2)})^T \odot (1-\tanh^2(z^{(1)}))$};
% Layer 1 connections: solid/labelled into a^{(1)}_1, dotted for the rest.
\path[-]
(b1) edge node [above=.2cm, pos=0.5] {$\beta^{(1)}_{1}$} (a1)
(x1) edge node [above, pos=0.5] {$\Theta^{(1)}_{1,1}$} (a1)
(x2) edge node [above, pos=0.5] {$\Theta^{(1)}_{2,1}$} (a1)
(xn) edge node [above, pos=0.5] {$\Theta^{(1)}_{n,1}$} (a1);
\path[-, thin, dotted]
(b1) edge node {} (a2)
(x1) edge node {} (a2)
(x2) edge node {} (a2)
(xn) edge node {} (a2)
(b1) edge node {} (a3)
(x1) edge node {} (a3)
(x2) edge node {} (a3)
(xn) edge node [below, pos=0.5] {$\Theta^{(1)}_{n,3}$} (a3);
% Layer 2 connections.
\path[-]
(b2) edge node [above=.2cm, pos=0.5] {$\beta^{(2)}_{1}$} (a21)
(a1) edge node [above, pos=0.5] {$\Theta^{(2)}_{1,1}$} (a21)
(a2) edge node [above, pos=0.5] {$\Theta^{(2)}_{2,1}$} (a21)
(a3) edge node [above, pos=0.5] {$\Theta^{(2)}_{3,1}$} (a21);
\path[-, thin, dotted]
(b2) edge node {} (a22)
(a1) edge node {} (a22)
(a2) edge node {} (a22)
(a3) edge node {} (a22)
(b2) edge node {} (a23)
(a1) edge node {} (a23)
(a2) edge node {} (a23)
(a3) edge node [below, pos=0.5] {$\Theta^{(2)}_{3,3}$} (a23);
% Layer 3 (output) connections.
\path[-]
(b3) edge node [above=.5cm, pos=0.5] {$\beta^{(3)}_{1}$} (a31)
(a21) edge node [above, pos=0.5] {$\Theta^{(3)}_{1,1}$} (a31)
(a22) edge node [above, pos=0.5] {$\Theta^{(3)}_{2,1}$} (a31)
(a23) edge node [above, pos=0.5] {$\Theta^{(3)}_{3,1}$} (a31);
% Dashed boxes around each layer.
\node [draw, dashed, fit= (x1) (x2) (dots) (xn)] (w0) {};
\node [draw, dashed, fit= (a1) (a2) (a3)] (w1) {};
\node [draw, dashed, fit= (a21) (a22) (a23)] (w2) {};
\node [draw, dashed, fit= (a31)] (w3) {};
% Forward-pass equations below the layers.
\node [draw, draw=none, fill=none, below=2cm of w0] (mw0) {\small$a^{(0)}=x$};
\node [draw, draw=none, fill=none, right=.5cm of mw0] (mw1)
{\small$\begin{array}{l}z^{(1)} = a^{(0)} \Theta^{(1)} + \beta^{(1)}\\g^{(1)}(x)=\tanh(x)\\a^{(1)}=g^{(1)}(z^{(1)})\end{array}$};
\node [draw, draw=none, fill=none, right=.5cm of mw1] (mw2)
{\small$\begin{array}{l}z^{(2)} = a^{(1)} \Theta^{(2)} + \beta^{(2)}\\g^{(2)}(x)=\tanh(x)\\a^{(2)}=g^{(2)}(z^{(2)})\end{array}$};
\node [draw, draw=none, fill=none, right=.5cm of mw2] (mw3)
{\small$\begin{array}{l}z^{(3)} = a^{(2)} \Theta^{(3)} + \beta^{(3)}\\g^{(3)}(x)=\tanh(x)\\a^{(3)}=g^{(3)}(z^{(3)})\end{array}$};
% Arrows: forward pass flows left-to-right, then into delta^{(3)}.
\path[->]
(mw0) edge node {} (mw1)
(mw1) edge node {} (mw2)
(mw2) edge node {} (mw3)
(mw3.east) edge[in=320,out=0] node {} (delta3.south)
;
% Arrows: the error terms propagate backwards, right-to-left.
\path[->]
(delta3) edge node {} (delta2)
(delta2) edge node {} (delta1)
;
Jedna iteracja: