In [8]:
import sys
import numpy as np
import impl.RNN as rnn
import impl.solver as solver
In [9]:
# Load the raw training corpus as a single string.
with open('data/text_data/japan.txt', 'r') as f:
    txt = f.read()

if not txt:
    raise ValueError('training corpus is empty: data/text_data/japan.txt')

# Build the vocabulary from a *sorted* character list so the char <-> index
# mapping is identical on every run (iteration order of a set of strings
# varies between interpreter sessions because of hash randomization).
chars = sorted(set(txt))
char_to_idx = {char: i for i, char in enumerate(chars)}
idx_to_char = {i: char for i, char in enumerate(chars)}

# Inputs: every character of the corpus encoded as its vocabulary index.
X = np.array([char_to_idx[x] for x in txt])

# Targets: the next character at each position. The final position has no
# successor, so it is padded with '.'; if the corpus contains no '.', fall
# back to the last character instead of raising KeyError.
end_char = '.' if '.' in char_to_idx else txt[-1]
y = [char_to_idx[x] for x in txt[1:]]
y.append(char_to_idx[end_char])
y = np.array(y)
In [33]:
# --- model size ---
vocab_size = len(char_to_idx)  # number of distinct characters in the corpus
H = 64                         # hidden units

# --- optimisation hyper parameters ---
alpha = 1e-3        # learning rate passed to the optimiser
time_step = 10      # characters per minibatch (passed as mb_size)
n_iter = 13000      # number of optimiser iterations (minibatch updates)
print_after = 1000  # report the smoothed loss every this many iterations
In [34]:
import numpy as np
import impl.loss as loss_fun
import impl.layer as l
import impl.regularization as reg
import impl.utils as util
import impl.NN as nn
class RNN(nn.NN):
    """Two-layer character-level vanilla RNN built on the project's ``nn.NN``.

    ``self.model`` is a list of two parameter dicts (one per layer), each with
    the keys ``Wxh``, ``Whh``, ``Why``, ``bh`` and ``by``. At every time step
    layer 0 consumes the one-hot input and layer 1 consumes layer 0's output;
    a single hidden state is threaded through layer 0, then layer 1, then on
    to the next time step.
    """

    def __init__(self, D, H, char2idx, idx2char):
        """Store dimensions and vocabulary maps, then run the base constructor.

        Args:
            D: input/output width (the vocabulary size).
            H: hidden state width.
            char2idx: mapping character -> index.
            idx2char: mapping index -> character.
        """
        self.D = D
        self.H = H
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        # NOTE(review): presumably the base constructor calls _init_model(D, C, H)
        # to populate self.model - confirm against impl.NN.
        super().__init__(D, D, H, None, None, loss='cross_ent', nonlin='relu')

    def initial_state(self):
        """Return an all-zero hidden state of shape (1, H)."""
        return np.zeros((1, self.H))

    def forward(self, X, h, m):  # m = one layer's parameter dict
        """Run one layer for one time step.

        Args:
            X: layer input, a row vector.
            h: hidden state entering this layer.
            m: parameter dict of the layer (``Wxh``, ``Whh``, ``Why``, ``bh``, ``by``).

        Returns:
            ``(y, h, cache)``: layer output, new hidden state, and the values
            ``backward`` needs to differentiate this step.
        """
        Wxh, Whh, Why = m['Wxh'], m['Whh'], m['Why']
        bh, by = m['bh'], m['by']

        hprev = h.copy()

        h, h_cache = l.tanh_forward(X @ Wxh + hprev @ Whh + bh)
        y, y_cache = l.fc_forward(h, Why, by)

        cache = X, Whh, h, hprev, y, h_cache, y_cache, Wxh

        return y, h, cache

    def backward(self, dy, dh, cache):
        """Back-propagate through one layer for one time step.

        Args:
            dy: gradient with respect to this layer's output ``y``.
            dh: gradient with respect to this layer's new hidden state coming
                from whatever consumed it later in the chain.
            cache: the tuple produced by ``forward`` for this step.

        Returns:
            ``(dX, dh, grad)``: gradient w.r.t. the layer input, gradient
            w.r.t. the incoming hidden state, and a dict of parameter gradients.
        """
        X, Whh, h, hprev, y, h_cache, y_cache, Wxh = cache

        dh_next = dh.copy()

        # Hidden to output gradient
        dh, dWhy, dby = l.fc_backward(dy, y_cache)
        # The hidden state feeds both the output and the next step: sum both paths.
        dh += dh_next
        dby = dby.reshape((1, -1))

        # tanh
        dh = l.tanh_backward(dh, h_cache)

        # Hidden gradient
        dbh = dh
        dWhh = hprev.T @ dh
        dWxh = X.T @ dh

        dX = dh @ Wxh.T
        dh = dh @ Whh.T

        grad = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)

        return dX, dh, grad

    def _init_model(self, D, C, H):
        """Create ``self.model`` as a list of two freshly initialised layer dicts."""
        self.model = []

        # '_' instead of 'l' so the loop does not shadow the impl.layer alias.
        for _ in range(2):
            m = dict(
                Wxh=np.random.randn(D, H) / np.sqrt(D / 2.),
                Whh=np.random.randn(H, H) / np.sqrt(H / 2.),
                # NOTE(review): scaled by C while the other matrices are scaled by
                # their own first dimension (which would be H here) - confirm intent.
                Why=np.random.randn(H, D) / np.sqrt(C / 2.),
                bh=np.zeros((1, H)),
                by=np.zeros((1, D))
            )
            self.model.append(m)

    def train_step_fwd(self, X_train, h):
        """Forward pass over one sequence of character indices.

        Args:
            X_train: iterable of integer character indices.
            h: hidden state to start from.

        Returns:
            ``(ys, caches)``. ``ys`` holds one layer-1 output per time step.
            ``caches`` holds two entries per time step: ``caches[2*t]`` is the
            layer-0 cache and ``caches[2*t + 1]`` the layer-1 cache. The final
            hidden state is not returned.
        """
        ys, caches = [], []

        for X in X_train:
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            x = X_one_hot.reshape(1, -1)

            y, h, cache = self.forward(x, h, self.model[0])
            y, h, cache2 = self.forward(y, h, self.model[1])

            ys.append(y)
            caches.append(cache)
            caches.append(cache2)

        return ys, caches

    def train_step_bwd(self, y_train, ys, caches):
        """Compute the loss and back-propagate through time over both layers.

        Args:
            y_train: array of target character indices, one per time step.
            ys: per-step outputs from ``train_step_fwd``.
            caches: cache list from ``train_step_fwd`` (two entries per step).

        Returns:
            ``(grads, loss)`` where ``grads`` is a list parallel to
            ``self.model`` (one gradient dict per layer) and ``loss`` is the
            cross-entropy averaged over the time steps.
        """
        loss, dys = 0.0, []

        for y_pred, y in zip(ys, y_train):
            loss += loss_fun.cross_entropy(self.model, y_pred, y, lam=0) / y_train.shape[0]
            dys.append(loss_fun.dcross_entropy(y_pred, y))

        # One zero-initialised accumulator dict per layer.
        grads = [
            {key: np.zeros_like(val) for key, val in layer.items()}
            for layer in self.model
        ]
        dh = np.zeros((1, self.H))

        for t in reversed(range(len(dys))):
            # train_step_fwd appends (layer 0, layer 1) caches for every step.
            cache_l0, cache_l1 = caches[2 * t], caches[2 * t + 1]

            # Layer 1 ran last in the forward pass, so it is differentiated
            # first. Its returned dh is the gradient w.r.t. the hidden state
            # layer 0 handed to it, and its dX is the gradient w.r.t. layer 0's
            # output; both are fed into layer 0's backward step.
            dX, dh, grad_l1 = self.backward(dys[t], dh, cache_l1)
            dX, dh, grad_l0 = self.backward(dX, dh, cache_l0)

            for k in grad_l0.keys():
                grads[0][k] += grad_l0[k]
                grads[1][k] += grad_l1[k]

        return grads, loss
In [35]:
# Instantiate the network: input/output width equals the vocabulary size.
net = RNN(
    D=vocab_size,
    H=H,
    char2idx=char_to_idx,
    idx2char=idx_to_char,
)
In [36]:
import numpy as np
import impl.utils as util
import impl.constant as c
import copy
from sklearn.utils import shuffle as skshuffle
def get_minibatch(X, y, minibatch_size, shuffle=True):
    """Split ``X`` and ``y`` into consecutive, aligned minibatches.

    Args:
        X: indexable sequence of inputs (NumPy array or plain list).
        y: indexable sequence of targets aligned with ``X``.
        minibatch_size: maximum number of items per minibatch; must be >= 1.
        shuffle: when true, ``X`` and ``y`` are shuffled together (via
            ``sklearn.utils.shuffle``) before being split.

    Returns:
        A list of ``(X_mini, y_mini)`` tuples. The last minibatch is shorter
        when the data length is not a multiple of ``minibatch_size``.

    Raises:
        ValueError: if ``minibatch_size`` is smaller than 1.
    """
    if minibatch_size < 1:
        # A negative size would otherwise silently produce no minibatches.
        raise ValueError('minibatch_size must be a positive integer')

    if shuffle:
        X, y = skshuffle(X, y)

    # len() rather than .shape[0] so plain sequences work as well as arrays.
    return [
        (X[start:start + minibatch_size], y[start:start + minibatch_size])
        for start in range(0, len(X), minibatch_size)
    ]
def adam_rnn(nn, X_train, y_train, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    """Train a recurrent network in place with the Adam update rule.

    Args:
        nn: network exposing ``model``, ``initial_state()``,
            ``train_step_fwd(X, h)`` and ``train_step_bwd(y, ys, caches)``.
            ``nn.model`` may be one parameter dict or a list of per-layer
            dicts; the gradients returned by ``train_step_bwd`` may likewise
            be one dict or a list parallel to the model.
        X_train: sequence of input indices.
        y_train: sequence of target indices aligned with ``X_train``.
        alpha: learning rate.
        mb_size: number of consecutive items per minibatch.
        n_iter: number of minibatch updates to perform.
        print_after: print the smoothed loss every this many iterations.

    Returns:
        The same ``nn`` object, with its parameters updated.

    Raises:
        ValueError: if the training data yields no minibatches.
    """
    # Order matters for a sequence model, so the data is never shuffled.
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)
    if not minibatches:
        raise ValueError('adam_rnn needs at least one minibatch of training data')

    # Normalise to a list of layer dicts so single- and multi-layer models
    # share one code path. The dicts are the model's own objects, so the
    # in-place updates below modify nn.model directly.
    layers = nn.model if isinstance(nn.model, (list, tuple)) else [nn.model]

    idx = 0
    # NOTE: train_step_fwd does not return the final hidden state, so `state`
    # only ever holds the value of nn.initial_state().
    state = nn.initial_state()
    # Starting value: the loss of a uniform guess over the distinct symbols.
    smooth_loss = -np.log(1.0 / len(set(X_train)))

    # First (M) and second (R) moment estimates, one dict per layer.
    M = [{k: np.zeros_like(v) for k, v in layer.items()} for layer in layers]
    R = [{k: np.zeros_like(v) for k, v in layer.items()} for layer in layers]

    beta1 = .9
    beta2 = .999

    for t in range(1, n_iter + 1):
        # Wrap around to the start of the corpus and reset the hidden state.
        if idx >= len(minibatches):
            idx = 0
            state = nn.initial_state()

        X_mini, y_mini = minibatches[idx]
        idx += 1

        if t % print_after == 0:
            print('Iter-{} loss: {:.4f}'.format(t, smooth_loss))
            # Sampling is disabled until training has been validated:
            # sample = nn.test_step_fwd(X_mini[0], state)
            # print(sample)

        ys, caches = nn.train_step_fwd(X_mini, state)
        grads, loss = nn.train_step_bwd(y_mini, ys, caches)
        smooth_loss = 0.999 * smooth_loss + 0.001 * loss

        layer_grads = grads if isinstance(grads, (list, tuple)) else [grads]

        for layer, g, m, r in zip(layers, layer_grads, M, R):
            for k in g.keys():
                m[k] = util.exp_running_avg(m[k], g[k], beta1)
                r[k] = util.exp_running_avg(r[k], g[k]**2, beta2)

                # Bias-corrected moment estimates.
                m_k_hat = m[k] / (1. - beta1**t)
                r_k_hat = r[k] / (1. - beta2**t)

                layer[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)

    return nn
In [37]:
# Train the network; each minibatch covers `time_step` consecutive characters.
adam_rnn(
    nn=net,
    X_train=X,
    y_train=y,
    alpha=alpha,
    mb_size=time_step,
    n_iter=n_iter,
    print_after=print_after,
)
Out[37]:
In [ ]: