In [ ]:
import numpy as np
In [ ]:
%run magic.ipynb
考慮 $F = f(\mathbf{a},\mathbf{g}(\mathbf{b},\mathbf{h}(\mathbf{c}, \mathbf{i})))$
$\mathbf{a},\mathbf{b},\mathbf{c},$ 代表著權重 , $\mathbf{i}$ 是輸入
站在 $\mathbf{g}$ 的角度,為了要更新權重,我們想算 $\frac{\partial F}{\partial \mathbf{b}}$
我們需要什麼? 由 chain rule 得知
$\frac{\partial F}{\partial b_i} = \sum_j \frac{\partial F}{\partial g_j}\frac{\partial g_j}{\partial b_i}$ 或者寫成 Jacobian 的形式
$\frac{\partial F}{\partial \mathbf{b}} = \frac{\partial F}{\partial \mathbf{g}} \frac{\partial \mathbf{g}}{\partial \mathbf{b}}$
所以我們希望前面能傳給我們 $\frac{\partial F}{\partial \mathbf{g}}$
將心比心,因為 $\mathbf{h}$ 也要算 $\frac{\partial F}{\partial \mathbf{c}}$, 所以我們還要負責傳 $\frac{\partial F}{\partial \mathbf{h}}$ 給他。 而因為
$\frac{\partial F}{\partial \mathbf{h}} = \frac{\partial F}{\partial \mathbf{g}} \frac{\partial \mathbf{g}}{\partial \mathbf{h}}$
所以 $\mathbf{g}$ 中間真正需要負責計算的東西就是 $\frac{\partial \mathbf{g}}{\partial \mathbf{h}}$ 和 $\frac{\partial \mathbf{g}}{\partial \mathbf{b}}$
由於
let $U = f(Ax+b) $, $Z=CU+d$
$\frac{\partial L}{\partial d} = \frac{\partial L}{\partial Z} = p^T (1 \sigma(Z)^T - \delta) = \sigma(Z)^T - p^T = \sigma(CU+d)^T - p^T $
$\frac{\partial L}{\partial C_{i,j}} = \frac{\partial L}{\partial Z} \frac{\partial CU+d}{\partial C_{i,j}} = (p^T (1 \sigma(Z)^T - \delta))_i U_j = (\sigma(Z) - p)_i U_j $ 所以
$\frac{\partial L}{\partial C} = (\sigma(Z) - p) U^T $
到目前為止,都跟原來 softmax 的結果一樣。
繼續計算 A, b 的偏微分
$\frac{\partial L}{\partial U} = \frac{\partial L}{\partial Z} \frac{\partial CU+d}{\partial U} = (p^T (1 \sigma(Z)^T - \delta)) C = (\sigma(Z) - p)^T C $
$ \frac{\partial U_k}{\partial b_i} = \frac{\partial f(A_kx+b_k)}{\partial b_i} = \delta_{k,i} f'(Ax+b)_i $
$ \frac{\partial L}{\partial b_i } = ((\sigma(Z) - p)^T C)_i f'(Ax+b)_i$
$ \frac{\partial L}{\partial A_{i,j} } = ((\sigma(Z) - p)^T C)_i f'(Ax+b)_i x_j$
In [ ]:
# Reference solution: activation/loss functions and their derivatives.
# -i runs the script in this notebook's namespace so the definitions
# (presumably relu, softmax, etc. used by later cells) stay available.
%run -i solutions/ff_funcs.py
In [ ]:
# Reference solution: compute the loss of the current network.
# Runs in the notebook namespace (-i) so it can read the weights defined above.
%run -i solutions/ff_compute_loss2.py
$ \frac{\partial L}{\partial d} = \sigma(CU+d)^T - p^T$
$ \frac{\partial L}{\partial C } = (\sigma(Z) - p) U^T$
$ \frac{\partial L}{\partial b_i } = ((\sigma(Z) - p)^T C)_i f'(Ax+b)_i$
$ \frac{\partial L}{\partial A_{i,j} } = ((\sigma(Z) - p)^T C)_i f'(Ax+b)_i x_j$
In [ ]:
# Compute the gradients derived above (dL/dA, dL/db, dL/dC, dL/dd).
# Runs in the notebook namespace (-i) so it sees the current weights.
%run -i solutions/ff_compute_gradient.py
In [ ]:
# Apply one gradient-descent update to the weights, then recompute the loss
# to confirm it decreased.
%run -i solutions/ff_update.py
練習:隨機訓練 20000 次
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
# Reference solution: train on the "i mod 3" task.
# Presumably populates L_history with the per-step loss — confirm in the script.
%run -i solutions/ff_train_mod3.py
plt.plot(L_history);  # trailing ';' suppresses the Line2D repr
In [ ]:
# 訓練結果測試
# Sanity check of the trained network: feed every 4-bit input through the
# two-layer net and print the predicted class next to the true label (i mod 3).
for i in range(16):
    bits = [(i >> k) & 1 for k in range(4)]
    x = Vector(*bits)
    y = i % 3
    hidden = relu(A @ x + b)
    q = softmax(C @ hidden + d)
    print(q.argmax(), y)
In [ ]:
def truth(x):
    """Return 1 if the flat length-9 array `x`, read as a 3x3 tic-tac-toe
    board, contains a completed line (row, column, or either diagonal),
    else 0."""
    board = x.reshape(3, 3)
    has_line = (
        board.all(axis=0).any()          # any full column
        or board.all(axis=1).any()       # any full row
        or board.diagonal().all()        # main diagonal
        or board[::-1].diagonal().all()  # anti-diagonal
    )
    return int(has_line)
In [ ]:
# Train the network on the tic-tac-toe task (labels come from truth() above).
# Presumably populates accuracy_history with per-step accuracy — confirm in the script.
%run -i solutions/ff_train_ttt.py
plt.plot(accuracy_history);  # trailing ';' suppresses the Line2D repr
In [ ]: