In [1]:
# Data
import numpy as np

with open('data/text_data/japan.txt', 'r') as f:
    txt = f.read()

    X = []
    y = []

    char_to_idx = {char: i for i, char in enumerate(set(txt))}
    idx_to_char = {i: char for i, char in enumerate(set(txt))}

    X = np.array([char_to_idx[x] for x in txt])
    y = [char_to_idx[x] for x in txt[1:]]
    y.append(char_to_idx['.'])
    y = np.array(y)

In [3]:
# Model
import impl.loss as loss_fun
import impl.layer as l
import impl.utils as util
import impl.NN as nn

class GRU3(nn.NN):

    def __init__(self, D, H, L, char2idx, idx2char):
        self.D = D
        self.H = H
        self.L = L
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        self.losses = {'train':[], 'valid':[], 'test':[]}
        super().__init__(D, D, H, None, None, loss='cross_ent', nonlin='relu')

    def _init_model(self, D, C, H):
        Z = H + D

        m = dict(
            Wh=np.random.randn(Z, H) / np.sqrt(Z / 2.),
            Wy=np.random.randn(H, D) / np.sqrt(D / 2.),
            bh=np.zeros((1, H)),
            by=np.zeros((1, D))
        )
        self.model = []
        for l in range(self.L):
            self.model.append(m)
        
    def initial_state(self):
        return np.zeros((1, self.H))

    def forward(self, X, h, m):
        Wh, Wy = m['Wh'], m['Wy']
        bh, by = m['bh'], m['by']

        X_one_hot = X.copy()
        h_old = h.copy()
        
        # input: concat: [h, x]
        X = np.column_stack((h_old, X_one_hot))
        hh, hh_cache = l.fc_forward(X, Wh, bh)

        # gate: h_prob
        hz, hz_sigm_cache = l.sigmoid_forward(hh)

        # signal: h_pred
        hh, hh_tanh_cache = l.tanh_forward(hh)

        # output: h_next and y_pred
        h = h_old + hz * (hh - h_old)
        y, y_cache = l.fc_forward(h, Wy, by)

        cache = h_old, X, hh_cache, hz, hz_sigm_cache, hh, hh_tanh_cache, y_cache

        return y, h, cache

    def backward(self, dy, dh, cache):
        h_old, X, hh_cache, hz, hz_sigm_cache, hh, hh_tanh_cache, y_cache = cache
        dh_next = dh.copy()

        # output: h_next and y_pred
        dh, dWy, dby = l.fc_backward(dy, y_cache)
        dh += dh_next
        dh_old1 = (1. - hz) * dh

        # signal: h_pred
        dhh = hz * dh
        dhh = l.tanh_backward(dhh, hh_tanh_cache)

        # gate: h_prob
        dhz = (hh - h_old) * dh
        dhz = l.sigmoid_backward(dhz, hz_sigm_cache)

        # input
        dhh += dhz
        dX, dWh, dbh = l.fc_backward(dhh, hh_cache)
        dh_old2 = dX[:, :self.H]

        # concat: [h, x]
        dh = dh_old1 + dh_old2
        dX = dX[:, self.H:]

        grad = dict(Wh=dWh, Wy=dWy, bh=dbh, by=dby)
        
        return dX, dh, grad
        
    def train_forward(self, X_train, h):
        ys, caches = [], []
        h_init = h.copy()
        h = []
        for l in range(self.L):
            h.append(h_init.copy())
            caches.append([])
            
        for X in X_train:
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            y = X_one_hot.reshape(1, -1)
            for l in range(self.L):
                y, h[l], cache = self.forward(y, h[l], self.model[l])
                caches[l].append(cache)
                
            ys.append(y)
            
        return ys, caches

    def loss_function(self, y_train, ys):
        loss, dys = 0.0, []

        for y_pred, y in zip(ys, y_train):
            loss += loss_fun.cross_entropy(y_pred, y)/ y_train.shape[0]
            dy = loss_fun.dcross_entropy(y_pred, y)
            dys.append(dy)
            
        return loss, dys

    def train_backward(self, dys, caches):
        dh, grad, grads = [], [], []
        for l in range(self.L):
            dh.append(np.zeros((1, self.H)))
            grad.append({key: np.zeros_like(val) for key, val in self.model[0].items()})
            grads.append({key: np.zeros_like(val) for key, val in self.model[0].items()})
            
        for t in reversed(range(len(dys))):
            dX = dys[t]
            for l in reversed(range(self.L)):
                dX, dh[l], grad[l] = self.backward(dX, dh[l], caches[l][t])
                for k in grad[0].keys():
                    grads[l][k] += grad[l][k]
                
        return dX, grads
    
    def test(self, X_seed, h, size=100):
        chars = [self.idx2char[X_seed]]
        idx_list = list(range(self.vocab_size))
        X = X_seed
        
        h_init = h.copy()
        h = []
        for l in range(self.L):
            h.append(h_init.copy())

        for _ in range(size - 1):
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            y = X_one_hot.reshape(1, -1)
            for l in range(self.L):
                y, h[l], _ = self.forward(y, h[l], self.model[l])
                
            prob = util.softmax(y)
            idx = np.random.choice(idx_list, p=prob.ravel())
            chars.append(self.idx2char[idx])
            X = idx

        return ''.join(chars)

In [4]:
# Optimizer/solver
import impl.constant as c
import copy
from sklearn.utils import shuffle as skshuffle

def get_minibatch(X, y, minibatch_size, shuffle=True):
    minibatches = []

    if shuffle:
        X, y = skshuffle(X, y)

    for i in range(0, X.shape[0], minibatch_size):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]

        minibatches.append((X_mini, y_mini))

    return minibatches

def adam_rnn(nn, X_train, y_train, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)

    idx = 0
    state = nn.initial_state()
    loss = np.log(len(set(X_train)))
#     smooth_loss = -np.log(1.0 / len(set(X_train)))

    
    M, R = [], []
    for l in range(nn.L):
        M.append({k: np.zeros_like(v) for k, v in nn.model[0].items()})
        R.append({k: np.zeros_like(v) for k, v in nn.model[0].items()})
        
    beta1 = .9
    beta2 = .999

    for iter in range(1, n_iter + 1):

        if idx >= len(minibatches):
            idx = 0
            state = nn.initial_state()

        X_mini, y_mini = minibatches[idx]
        idx += 1

        # Print loss and test sample
        if iter % print_after == 0:
            print('Iter-{} loss: {:.4f}'.format(iter, loss))
#             print('Iter-{} loss: {:.4f}'.format(iter, smooth_loss))
            sample = nn.test(X_mini[0], state)
            print(sample)

        ys, caches = nn.train_forward(X_mini, state)
        loss, dys = nn.loss_function(y_mini, ys)
        dX, grads = nn.train_backward(dys, caches)
#         smooth_loss = 0.999 * smooth_loss + 0.001 * loss
        nn.losses['train'].append(loss)
        


        for l in range(nn.L):
            for k in grads[0].keys(): #key, value: items
                M[l][k] = util.exp_running_avg(M[l][k], grads[l][k], beta1)
                R[l][k] = util.exp_running_avg(R[l][k], grads[l][k]**2, beta2)

                m_k_hat = M[l][k] / (1. - beta1**(iter))
                r_k_hat = R[l][k] / (1. - beta2**(iter))

                nn.model[l][k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)
    
    return nn

In [8]:
# Hyper-parameters
vocab_size = len(char_to_idx)

# hyper parameters
time_step = 10 # width
num_layers = 2 # depth
n_iter = 13000 # epochs
alpha = 1e-3 # learning_rate
print_after = 1000 # print train loss, valid, and test
H = 64 # num_hidden_units in the hidden layer

In [9]:
# Runnign session/ the actual learning session
net = GRU3(D=vocab_size, H=H, L=num_layers, char2idx=char_to_idx, idx2char=idx_to_char)

adam_rnn(nn=net, X_train=X, y_train=y, alpha=alpha, mb_size=time_step, n_iter=n_iter, print_after=print_after)


Iter-1000 loss: 2.2717
 uptulao th ocsi pioee aa anDtaat, lsogtarl sia an thelbgam au lacad moe 1xzuratsid. .mthr i"tt eoal
Iter-2000 loss: 2.2271
onr. an sy ssaro s ar hs istelao the raolg ins tintaens koustera o8 frasros th'e as oralgesitesictyr
Iter-3000 loss: 2.2601
rsee. 6o-ke andest Freko -kar 1umtre th  inldinve Oipol ipe ort ory mar ope parcur A. Toocmeeinn ae 
Iter-4000 loss: 1.9157
uNthe Sax, stinimin, e1flhrts  wa th Rs le R mreon ivosatire 3athes cxensktl iitve meand pan's na mi
Iter-5000 loss: 1.1332
colcure, Huhhcan and eadventvesrciulsiegintokoangkagokok ge athe horacaviircolinbxiastte tEhislal Sh
Iter-6000 loss: 2.0589
ountyrr, esten Japle os tty meren, ssa neokaiy on-nnInan th. wis lecan.  thegsored 1Hochho hacolctar
Iter-7000 loss: 1.2219
an", and an.ictymipailiissi nn in isrep,kro keueve npred int  an ineofudghh a"g xpparcfe tun arl 196
Iter-8000 loss: 0.7615
ily 13t peet As.tenjve aCn in.leWowesg,hstke h  owit mt nanmelioni thiur-,uesu " s peeee venn9sskpe 
Iter-9000 loss: 0.8525
s of maten as palwo an tem it Th o cldern3 sithtion Eicaieloo, thatrar ociod demar e angioteteli2c c
Iter-10000 loss: 0.8657
 furean 57 War bef nctt tyin liintiin esiore fror ofle,tor by bop tr nfallen. r  eree erecere ringle
Iter-11000 loss: 1.1998
r bof nal the couured 18 wacuicn oro ical ili cyoepolim heroc hg gtanc mapelac d  aptet ter  ii c  S
Iter-12000 loss: 0.7133
 ceneety in cistearsc sainiinointroon"epareessy cupoev Nocbcielrothundiri. SiooJapapn eei日 Srei"suSr
Iter-13000 loss: 2.1800
 mrinc Weprse ti im lgsimopre Chihh wom pousl an is atcsof 1tiacisialuoowJarudsotsh lpough ha whe wo
Out[9]:
<__main__.GRU3 at 0x7f6aa7255b70>

In [10]:
# Display the learning curve and losses for training, validation, and testing
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt

plt.plot(net.losses['train'], label='Train loss')
# matplot.plot(losses_tx2['valid'], label='Valid loss')
plt.legend()
# _ = plt.ylim()


Out[10]:
<matplotlib.legend.Legend at 0x7f6a7ac39cc0>

In [ ]: