In [1]:
# Data
import numpy as np

with open('data/text_data/japan.txt', 'r') as f:
# with open('data/text_data/anna.txt', 'r') as f:

    txt = f.read()

    X = []
    y = []

    char_to_idx = {char: i for i, char in enumerate(set(txt))}
    idx_to_char = {i: char for i, char in enumerate(set(txt))}

    X = np.array([char_to_idx[x] for x in txt])
    y = [char_to_idx[x] for x in txt[1:]]
    y.append(char_to_idx['.'])
    y = np.array(y)
    
y.shape, y.dtype, X.shape, X.dtype, y[:10], X[:10]
# train & test/validation
# test-valid/train ~ 1/10 ~ 
# In this dataset = 329/3629 ~ 1/12
X_train = X[:3300]
Y_train = y[:3300]
X_valid = X[3300:]
Y_valid = y[3300:]
X_train.shape, Y_train.shape, X_valid.shape, Y_valid.shape

XY_train = (X_train, Y_train)
XY_valid = (X_valid, Y_valid)

In [37]:
# Model or Network
import impl.layer as l
from impl.loss import *

class GRU:        
    def __init__(self, D, H, p_dropout, lam, char2idx, idx2char):
        self.D = D
        self.H = H
        self.p_dropout = p_dropout
        self.lam = lam
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        self.losses = {'train':[], 'smooth train':[], 'valid': []}
        
        # Model params
        Z = H + D
        m = dict(
            Wz=np.random.randn(Z, H) / np.sqrt(Z / 2.),
            Wr=np.random.randn(Z, H) / np.sqrt(Z / 2.),
            Wh=np.random.randn(Z, H) / np.sqrt(Z / 2.),
            Wy=np.random.randn(H, D) / np.sqrt(H / 2.),
            bz=np.zeros((1, H)),
            br=np.zeros((1, H)),
            bh=np.zeros((1, H)),
            by=np.zeros((1, D))
        )
        self.model = m

    def initial_state(self):
        return np.zeros((1, self.H))

    def forward(self, X, h, m):
        Wz, Wr, Wh, Wy = m['Wz'], m['Wr'], m['Wh'], m['Wy']
        bz, br, bh, by = m['bz'], m['br'], m['bh'], m['by']

        X_in = X.copy()
        h_in = h.copy()

        X = np.column_stack((h_in, X_in))

        hz, hz_cache = l.fc_forward(X, Wz, bz)
        hz, hz_sigm_cache = l.sigmoid_forward(hz)

        hr, hr_cache = l.fc_forward(X, Wr, br)
        hr, hr_sigm_cache = l.sigmoid_forward(hr)

        X = np.column_stack((hr * h_in, X_in))
        
        hh, hh_cache = l.fc_forward(X, Wh, bh)
        hh, hh_tanh_cache = l.tanh_forward(hh)

        # h = (1. - hz) * h_old + hz * hh
        # or
        h = ((1. - hz) * h_in) + (hz * hh)
        # or
        # h = h_in + hz (hh - h_in)

        y, y_cache = l.fc_forward(h, Wy, by)
        
        cache = (h_in, hz, hz_cache, hz_sigm_cache, hr, hr_cache, hr_sigm_cache, hh, hh_cache, hh_tanh_cache, 
                 y_cache)

        return y, h, cache

    def backward(self, dy, dh, cache):
        h_in, hz, hz_cache, hz_sigm_cache, hr, hr_cache, hr_sigm_cache, hh, hh_cache, hh_tanh_cache, y_cache = cache
        
        dh_out = dh.copy()

        dh, dWy, dby = l.fc_backward(dy, y_cache)
        dh += dh_out

        dh_in1 = (1. - hz) * dh
        dhh = hz * dh
        dhz = (hh * dh) - (h_in * dh)
        # or
        # dhz = (hh - h_in) * dh

        dhh = l.tanh_backward(dhh, hh_tanh_cache)
        dXh, dWh, dbh = l.fc_backward(dhh, hh_cache)

        dh = dXh[:, :self.H]
        dX_in2 = dXh[:, self.H:]
        dh_in2 = hr * dh

        dhr = h_in * dh
        dhr = l.sigmoid_backward(dhr, hr_sigm_cache)
        dXr, dWr, dbr = l.fc_backward(dhr, hr_cache)

        dhz = l.sigmoid_backward(dhz, hz_sigm_cache)
        dXz, dWz, dbz = l.fc_backward(dhz, hz_cache)

        dX = dXr + dXz
        dh_in3 = dX[:, :self.H]
        dX_in1 = dX[:, self.H:]

        dh = dh_in1 + dh_in2 + dh_in3
        dX = dX_in1 + dX_in2

        grad = dict(Wz=dWz, Wr=dWr, Wh=dWh, Wy=dWy, bz=dbz, br=dbr, bh=dbh, by=dby)
        
        return dX, dh, grad

    def train_forward(self, X_train, h):
        ys, caches, do_caches = [], [], []

        for X in X_train:
            X = X.reshape(1, -1) # X_1xn
            y, h, cache = self.forward(X, h, self.model)
            y, do_cache = l.dropout_forward(y, self.p_dropout)
            caches.append(cache)
            do_caches.append(do_cache)
            ys.append(y)
        
        ys = np.array(ys, dtype=float).reshape(len(ys), -1) # ys_txn instead of ys_tx1xn
        
        return ys, caches, do_caches
                                
    def loss_function(self, y_pred, y_train): # , alpha alpha: learning rate
        loss, dys = 0.0, []

        for y, Y in zip(y_pred, y_train):
            loss += l2_regression_reg(model=self.model, y_pred=y, y_train=Y, lam=self.lam)
            dy = dl2_regression(y_pred=y, y_train=Y)
            dys.append(dy)
            
        return loss, dys
    
    def train_backward(self, dys, caches, do_caches):
        dh = np.zeros((1, self.H)) 
        grad = {key: np.zeros_like(val) for key, val in self.model.items()}
        grads= {key: np.zeros_like(val) for key, val in self.model.items()}

        for t in reversed(range(len(dys))):
            dy = dys[t].reshape(1, -1) # dy_1xn
            dy = l.dropout_backward(dy, do_caches[t])
            _, dh, grad = self.backward(dy, dh, caches[t])
            for key in grad.keys():
                grads[key] += grad[key]
                
        return grads
    
    def test(self, X_seed, h, size):
        ys = []
        X = X_seed.reshape(1, -1)
        for _ in range(size):
            y, h, _ = self.forward(X, h, self.model)
            X = y.copy() # previous out for the next input for prediction
            ys.append(y) # list array
        
        ys = np.array(ys, dtype=float).reshape(len(ys), -1) # ys_txn instead of ys_tx1xn
        return ys

In [38]:
def get_minibatch(X, y, minibatch_size, shuffle):
    minibatches = []

    for i in range(0, X.shape[0], minibatch_size):
#     for i in range(0, X.shape[0] - minibatch_size + 1, 1):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]
        minibatches.append((X_mini, y_mini))

    return minibatches

def adam_rnn(nn, XY_train, XY_valid, alpha, mb_size, n_iter, print_after):
    X_train, y_train = XY_train
    X_valid, y_valid = XY_valid
    
    M=({k: np.zeros_like(v) for k, v in nn.model.items()})
    R=({k: np.zeros_like(v) for k, v in nn.model.items()})
        
    beta1 = .99
    beta2 = .999
    state = nn.initial_state()
    smooth_loss = 1.
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)
    
    for iter in range(1, n_iter + 1):
        for idx in range(len(minibatches)):
            # Train the model
            X_mini, y_mini = minibatches[idx]
            ys, caches = nn.train_forward(X_mini, state)
            loss, dys = nn.loss_function(ys, y_mini)
            grads = nn.train_backward(dys, caches)
            nn.losses['train'].append(loss)
            smooth_loss = (0.999 * smooth_loss) + (0.001 * loss)
            nn.losses['smooth train'].append(smooth_loss)

            # Update the model
            for k in grads.keys(): #key, value: items
                M[k] = l.exp_running_avg(M[k], grads[k], beta1)
                R[k] = l.exp_running_avg(R[k], grads[k]**2, beta2)
                m_k_hat = M[k] / (1. - (beta1**(iter)))
                r_k_hat = R[k] / (1. - (beta2**(iter)))
                nn.model[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + l.eps)

            # Validate the model (pre-testing- testing)
            sample, ys = nn.test(X_valid[0], state, size=X_valid.shape[0])
            valid_loss, _ = nn.loss_function(ys, Y_valid)
            nn.losses['valid'].append(valid_loss)
            
        # Print the loss of the model
        if iter % print_after == 0:
            print('Iter-{}, train loss: {:.4f}, valid loss: {:.4f}'.format(iter, loss, valid_loss))
#             print(sample)

    return nn

In [39]:
# Hyper-parameters
time_step = 10 # minibatch size
n_iter = 100 # epochs
alpha = 1e-4 # learning_rate
print_after = 1 # print training loss, valid, and test
num_hidden_units = 4 # num_hidden_units in hidden layer
num_input_units = len(char_to_idx) # vocab_size = len(char_to_idx)

# Build the network and learning it or optimizing it using SGD
net = GRU(D=num_input_units, H=num_hidden_units, char2idx=char_to_idx, idx2char=idx_to_char)

# Start learning using BP-SGD-ADAM
adam_rnn(nn=net, XY_train=XY_train, XY_valid=XY_valid, alpha=alpha, mb_size=time_step, n_iter=n_iter, 
         print_after=print_after)

# # Display the learning curve and losses for training, validation, and testing
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(net.losses['train'], label='Train loss')
plt.plot(net.losses['smooth train'], label='Train smooth loss')
plt.plot(net.losses['valid'], label='Valid loss')
plt.legend()
plt.show()


Iter-1, train loss: 40.5620, valid loss: 1370.3431
Iter-2, train loss: 38.7266, valid loss: 1327.0482
Iter-3, train loss: 37.2446, valid loss: 1289.7858
Iter-4, train loss: 36.0040, valid loss: 1243.7824
Iter-5, train loss: 35.0144, valid loss: 1224.4204
Iter-6, train loss: 34.2512, valid loss: 1202.3408
Iter-7, train loss: 33.6452, valid loss: 1187.3373
Iter-8, train loss: 33.1416, valid loss: 1173.4911
Iter-9, train loss: 32.7105, valid loss: 1166.7576
Iter-10, train loss: 32.3343, valid loss: 1163.4091
Iter-11, train loss: 32.0016, valid loss: 1157.1030
Iter-12, train loss: 31.7044, valid loss: 1153.3501
Iter-13, train loss: 31.4369, valid loss: 1153.0590
Iter-14, train loss: 31.1947, valid loss: 1144.8496
Iter-15, train loss: 30.9742, valid loss: 1144.3736
Iter-16, train loss: 30.7728, valid loss: 1143.1373
Iter-17, train loss: 30.5881, valid loss: 1136.8339
Iter-18, train loss: 30.4181, valid loss: 1139.5942
Iter-19, train loss: 30.2612, valid loss: 1139.9630
Iter-20, train loss: 30.1158, valid loss: 1134.8840
Iter-21, train loss: 29.9808, valid loss: 1128.4408
Iter-22, train loss: 29.8551, valid loss: 1130.6755
Iter-23, train loss: 29.7377, valid loss: 1138.2948
Iter-24, train loss: 29.6277, valid loss: 1132.9693
Iter-25, train loss: 29.5244, valid loss: 1130.7435
Iter-26, train loss: 29.4272, valid loss: 1128.0523
Iter-27, train loss: 29.3354, valid loss: 1132.0186
Iter-28, train loss: 29.2487, valid loss: 1130.4296
Iter-29, train loss: 29.1664, valid loss: 1133.9877
Iter-30, train loss: 29.0882, valid loss: 1132.6329
Iter-31, train loss: 29.0138, valid loss: 1132.9061
Iter-32, train loss: 28.9428, valid loss: 1132.3236
Iter-33, train loss: 28.8749, valid loss: 1134.4027
Iter-34, train loss: 28.8099, valid loss: 1136.4701
Iter-35, train loss: 28.7475, valid loss: 1129.8692
Iter-36, train loss: 28.6875, valid loss: 1129.0062
Iter-37, train loss: 28.6298, valid loss: 1128.7604
Iter-38, train loss: 28.5741, valid loss: 1135.6810
Iter-39, train loss: 28.5203, valid loss: 1130.5058
Iter-40, train loss: 28.4684, valid loss: 1137.1270
Iter-41, train loss: 28.4181, valid loss: 1133.8508
Iter-42, train loss: 28.3693, valid loss: 1138.0935
Iter-43, train loss: 28.3220, valid loss: 1148.7020
Iter-44, train loss: 28.2760, valid loss: 1132.1882
Iter-45, train loss: 28.2313, valid loss: 1142.0160
Iter-46, train loss: 28.1878, valid loss: 1132.9986
Iter-47, train loss: 28.1454, valid loss: 1134.6971
Iter-48, train loss: 28.1041, valid loss: 1139.3216
Iter-49, train loss: 28.0638, valid loss: 1144.8469
Iter-50, train loss: 28.0245, valid loss: 1146.2431
Iter-51, train loss: 27.9860, valid loss: 1140.9675
Iter-52, train loss: 27.9484, valid loss: 1140.7137
Iter-53, train loss: 27.9117, valid loss: 1145.2144
Iter-54, train loss: 27.8757, valid loss: 1136.7424
Iter-55, train loss: 27.8405, valid loss: 1147.0338
Iter-56, train loss: 27.8060, valid loss: 1134.5640
Iter-57, train loss: 27.7722, valid loss: 1149.2180
Iter-58, train loss: 27.7390, valid loss: 1151.4375
Iter-59, train loss: 27.7065, valid loss: 1139.6119
Iter-60, train loss: 27.6745, valid loss: 1146.1002
Iter-61, train loss: 27.6431, valid loss: 1147.2983
Iter-62, train loss: 27.6123, valid loss: 1155.6324
Iter-63, train loss: 27.5820, valid loss: 1164.0052
Iter-64, train loss: 27.5522, valid loss: 1152.7702
Iter-65, train loss: 27.5228, valid loss: 1153.0251
Iter-66, train loss: 27.4939, valid loss: 1159.2303
Iter-67, train loss: 27.4655, valid loss: 1153.9311
Iter-68, train loss: 27.4375, valid loss: 1159.9378
Iter-69, train loss: 27.4099, valid loss: 1145.8762
Iter-70, train loss: 27.3827, valid loss: 1149.6112
Iter-71, train loss: 27.3559, valid loss: 1160.8266
Iter-72, train loss: 27.3294, valid loss: 1165.0944
Iter-73, train loss: 27.3034, valid loss: 1163.7457
Iter-74, train loss: 27.2777, valid loss: 1163.6895
Iter-75, train loss: 27.2523, valid loss: 1161.7378
Iter-76, train loss: 27.2273, valid loss: 1150.0585
Iter-77, train loss: 27.2027, valid loss: 1164.5431
Iter-78, train loss: 27.1784, valid loss: 1177.5418
Iter-79, train loss: 27.1544, valid loss: 1146.3072
Iter-80, train loss: 27.1307, valid loss: 1158.6979
Iter-81, train loss: 27.1074, valid loss: 1152.9464
Iter-82, train loss: 27.0843, valid loss: 1180.6589
Iter-83, train loss: 27.0616, valid loss: 1154.3271
Iter-84, train loss: 27.0392, valid loss: 1165.9303
Iter-85, train loss: 27.0171, valid loss: 1156.7709
Iter-86, train loss: 26.9952, valid loss: 1180.9761
Iter-87, train loss: 26.9737, valid loss: 1160.1811
Iter-88, train loss: 26.9524, valid loss: 1168.3535
Iter-89, train loss: 26.9315, valid loss: 1165.3183
Iter-90, train loss: 26.9108, valid loss: 1176.3823
Iter-91, train loss: 26.8904, valid loss: 1180.3783
Iter-92, train loss: 26.8702, valid loss: 1182.9036
Iter-93, train loss: 26.8503, valid loss: 1156.2201
Iter-94, train loss: 26.8307, valid loss: 1180.5460
Iter-95, train loss: 26.8113, valid loss: 1193.7134
Iter-96, train loss: 26.7922, valid loss: 1174.6471
Iter-97, train loss: 26.7733, valid loss: 1191.1686
Iter-98, train loss: 26.7547, valid loss: 1183.5174
Iter-99, train loss: 26.7363, valid loss: 1182.8950
Iter-100, train loss: 26.7182, valid loss: 1189.3540

In [40]:
# # Display the learning curve and losses for training, validation, and testing
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(net.losses['train'], label='Train loss')
plt.plot(net.losses['smooth train'], label='Train smooth loss')
plt.plot(net.losses['valid'], label='Valid loss')
plt.legend()
plt.show()



In [ ]: