In [32]:
# Data
import numpy as np

with open('data/text_data/japan.txt', 'r') as f:
# with open('data/text_data/anna.txt', 'r') as f:

    txt = f.read()

    # Character-level vocabulary: map each unique character to an integer id and back
    chars = sorted(set(txt))  # sorted for a deterministic mapping across runs
    char_to_idx = {char: i for i, char in enumerate(chars)}
    idx_to_char = {i: char for i, char in enumerate(chars)}

    # Inputs are character ids; targets are the next character, with the last target padded with '.'
    X = np.array([char_to_idx[x] for x in txt])
    y = [char_to_idx[x] for x in txt[1:]]
    y.append(char_to_idx['.'])
    y = np.array(y)
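
A minimal sanity check of the data layout, assuming the cell above has been run: y is X shifted left by one character, so each input character's target is the character that follows it, and the final target is padded with '.'.

In [ ]:
# Sanity check of the next-character targets
assert np.array_equal(y[:-1], X[1:])            # y[t] is the character after X[t]
print(''.join(idx_to_char[i] for i in X[:40]))  # first characters of the corpus
print(''.join(idx_to_char[i] for i in y[:40]))  # the same text shifted by one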

In [39]:
# Model or Network
import impl.layer as l
from impl.loss import *

class GRU:
    def __init__(self, D, H, L, char2idx, idx2char):
        self.D = D
        self.H = H
        self.L = L
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        self.losses = {'train':[], 'smooth train':[], 'valid':[]}
        
        # Convolutional embedding parameters
        # Each step computes y_DxN = X_Dx2 @ W_2xN + b_1xN over [previous char, current char]
        # K: kernel size (a typical minimum is 3: one character before, the character itself, one after;
        #    here K = 2, i.e. the previous character and the current one)
        # N: number of kernels/filters/windows, e.g. K, N = 3, 10
        # For this character-sequence setup:
        K = 2 # kernel size
        N = 1 # number of filters
        self.N = N
        # Stride = 1: scan one character at a time
        # Pad = K // 2 = 1: one zero-padded character at the start of each minibatch
        self.model_conv = []
        for _ in range(self.L):
            # Create fresh parameters per layer; appending one shared dict would alias
            # the layers' weights and apply each update L times to the same arrays
            self.model_conv.append(dict(
                W=np.random.randn(K, N) / np.sqrt(K / 2.),
                b=np.random.randn(1, N)
            ))
            
        # Recurrent (GRU) model parameters
        Z = H + (D * N)  # gate input size: hidden state concatenated with the conv embedding
        self.model = []
        for _ in range(self.L):
            # Fresh parameters per layer (same aliasing caveat as above)
            self.model.append(dict(
                Wz=np.random.randn(Z, H) / np.sqrt(Z / 2.),
                Wr=np.random.randn(Z, H) / np.sqrt(Z / 2.),
                Wh=np.random.randn(Z, H) / np.sqrt(Z / 2.),
                Wy=np.random.randn(H, D) / np.sqrt(H / 2.),
                bz=np.zeros((1, H)),
                br=np.zeros((1, H)),
                bh=np.zeros((1, H)),
                by=np.zeros((1, D))
            ))
            
    def initial_state(self):
        return np.zeros((1, self.H))

    def forward(self, X, h, m):
        Wz, Wr, Wh, Wy = m['Wz'], m['Wr'], m['Wh'], m['Wy']
        bz, br, bh, by = m['bz'], m['br'], m['bh'], m['by']
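        # GRU cell as implemented below (with X = [h_in, X_in] column-stacked):
        #   hz = sigmoid(X @ Wz + bz)               update gate
        #   hr = sigmoid(X @ Wr + br)               reset gate
        #   hh = tanh([hr * h_in, X_in] @ Wh + bh)  candidate state
        #   h  = (1 - hz) * h_in + hz * hh          new hidden state
        #   y  = h @ Wy + by                        output logits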

        X_in = X.copy()
        h_in = h.copy()

        X = np.column_stack((h_in, X_in))

        hz, hz_cache = l.fc_forward(X, Wz, bz)
        hz, hz_sigm_cache = l.sigmoid_forward(hz)

        hr, hr_cache = l.fc_forward(X, Wr, br)
        hr, hr_sigm_cache = l.sigmoid_forward(hr)

        X = np.column_stack((hr * h_in, X_in))
        
        hh, hh_cache = l.fc_forward(X, Wh, bh)
        hh, hh_tanh_cache = l.tanh_forward(hh)

        # New hidden state: interpolate between the previous state and the candidate
        h = ((1. - hz) * h_in) + (hz * hh)
        # equivalently: h = h_in + hz * (hh - h_in)

        y, y_cache = l.fc_forward(h, Wy, by)

        cache = (h_in, hz, hz_cache, hz_sigm_cache, hr, hr_cache, hr_sigm_cache, hh, hh_cache, hh_tanh_cache, y_cache)

        return y, h, cache

    def backward(self, dy, dh, cache):
        h_in, hz, hz_cache, hz_sigm_cache, hr, hr_cache, hr_sigm_cache, hh, hh_cache, hh_tanh_cache, y_cache = cache
        
        dh_out = dh.copy()

        dh, dWy, dby = l.fc_backward(dy, y_cache)
        dh += dh_out

        dh_in1 = (1. - hz) * dh
        dhh = hz * dh
        dhz = (hh * dh) - (h_in * dh)
        # or
        # dhz = (hh - h_in) * dh

        dhh = l.tanh_backward(dhh, hh_tanh_cache)
        dXh, dWh, dbh = l.fc_backward(dhh, hh_cache)

        dh = dXh[:, :self.H]
        dX_in2 = dXh[:, self.H:]
        dh_in2 = hr * dh

        dhr = h_in * dh
        dhr = l.sigmoid_backward(dhr, hr_sigm_cache)
        dXr, dWr, dbr = l.fc_backward(dhr, hr_cache)

        dhz = l.sigmoid_backward(dhz, hz_sigm_cache)
        dXz, dWz, dbz = l.fc_backward(dhz, hz_cache)

        dX = dXr + dXz
        dh_in3 = dX[:, :self.H]
        dX_in1 = dX[:, self.H:]

        dh = dh_in1 + dh_in2 + dh_in3
        dX = dX_in1 + dX_in2

        grad = dict(Wz=dWz, Wr=dWr, Wh=dWh, Wy=dWy, bz=dbz, br=dbr, bh=dbh, by=dby)
        
        return dX, dh, grad

    def train_forward(self, X_train, h):
        ys, caches, caches_conv, X_prev = [], [], [], []
        h_init = h.copy()
        h = []
        X_zero = np.zeros((1, self.D)) # 1xD
        for _ in range(self.L):
            h.append(h_init)
            caches.append([])
            caches_conv.append([])
            X_prev.append(X_zero)
        
        # Embedding, Input layer, 1st layer
        for X in X_train:
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            X = X_one_hot.reshape(1, -1) # X_1xn
            for layer in range(self.L):
                X_pad = X_prev[layer].copy() # X_1xD
                X_prev[layer] = X.copy()
                X_conv = np.vstack((X_pad, X)).T # stack (1xD, 1xD) -> 2xD, transpose -> Dx2
                # y_DxN = X_Dx2 @ W_2xN + b_1xN, D: dim, N: neurons/unit
                y, cache = l.fc_forward(X_conv, self.model_conv[layer]['W'], self.model_conv[layer]['b'])
                caches_conv[layer].append(cache)
                #print('y.shape', y.shape)
                X = y.reshape(1, -1).copy() # X_1xD*N
                # print('X.shape', X.shape, X.dtype)
                y, h[layer], cache = self.forward(X, h[layer], self.model[layer])
                caches[layer].append(cache)
                X = y.copy() # X_1xD 
            ys.append(y)

        ys_caches = (caches, caches_conv)

        return ys, ys_caches
    
    def loss_function(self, y_train, ys):
        loss, dys = 0.0, []

        for y_pred, y in zip(ys, y_train):
            loss += cross_entropy(y_pred, y)
            dy = dcross_entropy(y_pred, y)
            dys.append(dy)
            
        return loss, dys
    
    def train_backward(self, dys, ys_caches):
        dh, grad, grads, grads_conv, dX_prev = [], [], [], [], []
        for layer in range(self.L):
            dh.append(np.zeros((1, self.H)))
            grad.append({key: np.zeros_like(val) for key, val in self.model[layer].items()})
            grads.append({key: np.zeros_like(val) for key, val in self.model[layer].items()})
            grads_conv.append({key: np.zeros_like(val) for key, val in self.model_conv[layer].items()})
            dX_prev.append(np.zeros((1, self.D))) # deferred gradient for this layer's input at the previous time step (via the conv padding)
        
        caches, caches_conv = ys_caches
        
        for t in reversed(range(len(dys))):
            dy = dys[t]
            for layer in reversed(range(self.L)):
                dX, dh[layer], grad[layer] = self.backward(dy, dh[layer], caches[layer][t])
                for k in grad[layer].keys():
                    grads[layer][k] += grad[layer][k]
                dy = dX.reshape(self.D, self.N).copy()
                dX_conv, dW, db = l.fc_backward(dy, caches_conv[layer][t])
                grads_conv[layer]['W'] += dW
                grads_conv[layer]['b'] += db
                dX = dX_conv.T # dX_2xD
                dy = dX[1].reshape(1, -1) + dX_prev[layer]
                dX_prev[layer] = dX[0].reshape(1, -1)
                
        grads_all = grads, grads_conv
        
        return grads_all
    
#     def test(self, X_seed, h, size):
#         chars = [self.idx2char[X_seed]]
#         idx_list = list(range(self.vocab_size))
#         X = X_seed
        
#         h_init = h.copy()
#         h = []
#         for _ in range(self.L):
#             h.append(h_init.copy())

#         ys = []
#         for _ in range(size):
#             X_one_hot = np.zeros(self.D)
#             X_one_hot[X] = 1.
#             X = X_one_hot.reshape(1, -1)
#             for layer in range(self.L):
#                 y, h[layer], _ = self.forward(X, h[layer], self.model[layer])
#                 X = y.copy()
                
#             prob = l.softmax(y)
#             idx = np.random.choice(idx_list, p=prob.ravel())
#             chars.append(self.idx2char[idx])
#             X = idx
#             #ys.append(prob) # entropy is the loss function

#         return ''.join(chars) #, ys
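
A minimal shape check before training, assuming the data cell and the impl.layer / impl.loss modules above: every output of train_forward should be a 1 x vocab_size row of logits.

In [ ]:
# Illustrative sanity check of the conv embedding + GRU wiring
D_check = len(char_to_idx)
net_check = GRU(D=D_check, H=16, L=1, char2idx=char_to_idx, idx2char=idx_to_char)
ys_check, _ = net_check.train_forward(X[:5], net_check.initial_state())
assert len(ys_check) == 5                  # one prediction per input character
assert ys_check[0].shape == (1, D_check)   # logits over the vocabulary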

In [40]:
def get_minibatch(X, y, minibatch_size, shuffle):
    minibatches = []

    for i in range(0, X.shape[0], minibatch_size):
#     for i in range(0, X.shape[0] - minibatch_size + 1, 1):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]
        minibatches.append((X_mini, y_mini))

    return minibatches
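
# Usage sketch, assuming the data cell above: with mb_size = 10 this yields consecutive,
# non-overlapping chunks of 10 characters (the last one may be shorter), e.g.
#   minibatches = get_minibatch(X, y, 10, shuffle=False)
#   X_mini, y_mini = minibatches[0]   # first 10 character ids and their next-char targets
# `shuffle` is accepted but unused here: the RNN relies on the chunks keeping their
# original temporal order.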

def adam_rnn(nn, X_train, y_train, alpha, mb_size, n_iter, print_after):
    M, R = [], []
    M_conv, R_conv = [], []
    for layer in range(nn.L):
        M.append({k: np.zeros_like(v) for k, v in nn.model[layer].items()})
        R.append({k: np.zeros_like(v) for k, v in nn.model[layer].items()})
        M_conv.append({k: np.zeros_like(v) for k, v in nn.model_conv[layer].items()})
        R_conv.append({k: np.zeros_like(v) for k, v in nn.model_conv[layer].items()})

    beta1 = .99
    beta2 = .999
    state = nn.initial_state()
    smooth_loss = 1.
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)
    
    for iter in range(1, n_iter + 1):
        for idx in range(len(minibatches)):
            # Training: forward pass, loss, and backward pass
            X_mini, y_mini = minibatches[idx]
            ys, caches = nn.train_forward(X_mini, state)
            loss, dys = nn.loss_function(y_mini, ys)
            grads_all = nn.train_backward(dys, caches)
            grads, grads_conv = grads_all
            
            nn.losses['train'].append(loss)
            smooth_loss = (0.999 * smooth_loss) + (0.001 * loss)
            nn.losses['smooth train'].append(smooth_loss)

            # Update the weights & biases or model
            for layer in range(nn.L):
                # Recurrent model
                for k in grads[layer].keys(): #key, value: items
                    M[layer][k] = l.exp_running_avg(M[layer][k], grads[layer][k], beta1)
                    R[layer][k] = l.exp_running_avg(R[layer][k], grads[layer][k]**2, beta2)
                    m_k_hat = M[layer][k] / (1. - (beta1**(iter)))
                    r_k_hat = R[layer][k] / (1. - (beta2**(iter)))
                    nn.model[layer][k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + l.eps)
                # ConvNet model
                for k in grads_conv[layer].keys(): #key, value: items
                    M_conv[layer][k] = l.exp_running_avg(M_conv[layer][k], grads_conv[layer][k], beta1)
                    R_conv[layer][k] = l.exp_running_avg(R_conv[layer][k], grads_conv[layer][k]**2, beta2)
                    m_k_hat = M_conv[layer][k] / (1. - (beta1**(iter)))
                    r_k_hat = R_conv[layer][k] / (1. - (beta2**(iter)))
                    nn.model_conv[layer][k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + l.eps)
                    
        # Print the training loss (text sampling via the test method is commented out)
        if iter % print_after == 0:
            print('Iter-{}, train loss: {:.4f}'.format(iter, loss))

    return nn
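
For reference, the per-parameter update in the loops above is plain Adam with bias correction; the sketch below isolates it for a single parameter, assuming l.exp_running_avg(old, new, beta) computes beta * old + (1 - beta) * new. Note that the bias correction above uses the outer epoch counter iter rather than a global step count.

In [ ]:
# Adam step for a single parameter, mirroring the update inside adam_rnn
def adam_step(W, dW, M_W, R_W, t, alpha=1e-3, beta1=.99, beta2=.999, eps=1e-8):
    M_W = beta1 * M_W + (1. - beta1) * dW        # first moment (mean of gradients)
    R_W = beta2 * R_W + (1. - beta2) * dW**2     # second moment (mean of squared gradients)
    m_hat = M_W / (1. - beta1**t)                # correct the bias from the zero init
    r_hat = R_W / (1. - beta2**t)
    W = W - alpha * m_hat / (np.sqrt(r_hat) + eps)
    return W, M_W, R_W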

In [38]:
# Hyper-parameters
time_step = 10 # sequence length per minibatch (BPTT window); also the minibatch size
num_layers = 1 # depth (number of stacked conv + GRU layers)
n_iter = 100 # epochs
alpha = 1e-3 # learning rate
print_after = 1 # e.g. n_iter//10; how often to print the training loss
num_hidden_units = 64 # hidden state size H
num_input_units = len(char_to_idx) # vocabulary size D

# Build the network
net = GRU(D=num_input_units, H=num_hidden_units, L=num_layers, char2idx=char_to_idx, idx2char=idx_to_char)

# Train with backpropagation through time and Adam
adam_rnn(nn=net, X_train=X, y_train=y, alpha=alpha, mb_size=time_step, n_iter=n_iter, print_after=print_after)

# Display the learning curve: raw and smoothed training loss
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(net.losses['train'], label='Train loss')
plt.plot(net.losses['smooth train'], label='Train smooth loss')
plt.legend()
plt.show()


Iter-1, train loss: 47.0896
Iter-2, train loss: 46.9066
Iter-3, train loss: 44.2837
Iter-4, train loss: 42.4659
Iter-5, train loss: 41.2715
Iter-6, train loss: 40.4115
Iter-7, train loss: 39.7401
Iter-8, train loss: 39.2772
Iter-9, train loss: 38.9953
Iter-10, train loss: 38.8389
Iter-11, train loss: 38.7500
Iter-12, train loss: 38.6994
Iter-13, train loss: 38.6640
Iter-14, train loss: 38.6366
Iter-15, train loss: 38.6140
Iter-16, train loss: 38.5946
Iter-17, train loss: 38.5776
Iter-18, train loss: 38.5625
Iter-19, train loss: 38.5490
Iter-20, train loss: 38.5368
Iter-21, train loss: 38.5258
Iter-22, train loss: 38.5157
Iter-23, train loss: 38.5065
Iter-24, train loss: 38.4981
Iter-25, train loss: 38.4904
Iter-26, train loss: 38.4832
Iter-27, train loss: 38.4765
Iter-28, train loss: 38.4703
Iter-29, train loss: 38.4646
Iter-30, train loss: 38.4592
Iter-31, train loss: 38.4541
Iter-32, train loss: 38.4494
Iter-33, train loss: 38.4449
Iter-34, train loss: 38.4407
Iter-35, train loss: 38.4367
Iter-36, train loss: 38.4330
Iter-37, train loss: 38.4294
Iter-38, train loss: 38.4265
Iter-39, train loss: 38.7407
Iter-40, train loss: 38.3781
Iter-41, train loss: 38.4179
Iter-42, train loss: 38.4128
Iter-43, train loss: 38.4106
Iter-44, train loss: 38.4084
Iter-45, train loss: 38.4061
Iter-46, train loss: 38.4040
Iter-47, train loss: 38.4019
Iter-48, train loss: 38.3998
Iter-49, train loss: 38.3978
Iter-50, train loss: 38.3959
Iter-51, train loss: 38.3940
Iter-52, train loss: 38.3923
Iter-53, train loss: 38.3906
Iter-54, train loss: 38.3889
Iter-55, train loss: 38.3873
Iter-56, train loss: 38.3858
Iter-57, train loss: 38.3843
Iter-58, train loss: 38.3829
Iter-59, train loss: 38.3816
Iter-60, train loss: 38.3803
Iter-61, train loss: 38.3790
Iter-62, train loss: 38.3778
Iter-63, train loss: 38.3766
Iter-64, train loss: 38.3755
Iter-65, train loss: 38.3744
Iter-66, train loss: 38.3733
Iter-67, train loss: 38.3723
Iter-68, train loss: 38.3713
Iter-69, train loss: 38.3704
Iter-70, train loss: 38.3695
Iter-71, train loss: 38.3686
Iter-72, train loss: 38.3678
Iter-73, train loss: 38.3669
Iter-74, train loss: 38.3662
Iter-75, train loss: 38.3654
Iter-76, train loss: 38.3647
Iter-77, train loss: 38.3639
Iter-78, train loss: 38.3633
Iter-79, train loss: 38.3626
Iter-80, train loss: 38.3620
Iter-81, train loss: 38.3613
Iter-82, train loss: 38.3607
Iter-83, train loss: 38.3602
Iter-84, train loss: 38.3596
Iter-85, train loss: 38.3591
Iter-86, train loss: 38.3586
Iter-87, train loss: 38.3581
Iter-88, train loss: 38.3576
Iter-89, train loss: 38.3571
Iter-90, train loss: 38.3567
Iter-91, train loss: 38.3562
Iter-92, train loss: 38.3558
Iter-93, train loss: 38.3554
Iter-94, train loss: 38.3550
Iter-95, train loss: 38.3547
Iter-96, train loss: 38.3543
Iter-97, train loss: 38.3540
Iter-98, train loss: 38.3536
Iter-99, train loss: 38.3533
Iter-100, train loss: 38.3530
