RNN training using numpy code. Based on the works of A Karpathy. Modified to use RELU instead of logistic function.


In [10]:
import numpy as np

# read full file
data = open( 'pg_essays.txt', 'rt' ).read()

# dictionary conversion. We will use n-gram coding for all the unique characters in the input data
dict_chars = list( set( data ) )
data_size = len( data )
dict_size = len( dict_chars )

# character encoding
char_to_x = { c:i for i, c in enumerate( dict_chars ) }
x_to_char = { i:c for i, c in enumerate( dict_chars ) }

print( 'data size: {0}, dictionary size: {1}'.format( data_size, dict_size ) )


data size: 1538444, dictionary size: 120

In [11]:
# net structure
hidden_nodes = 100
seq_len = 25

learning_rate = 1e-1

RNN equations with normalisation

$$ h' = f( W_{hh} h + W_{xh} x + b_h ) $$$$ y = W_{hy} h' + b_y $$$$ p = \left[ \frac{e^{y_i}}{\sum{ e^{y_i} } }, ... \right]^T $$

In [12]:
# model weights - drawn initially from guassian distribution
Whh = np.random.normal( 0., 2., (hidden_nodes, hidden_nodes) )
Wxh = np.random.normal( 0., 2., (hidden_nodes, dict_size ) )
Wyh = np.random.normal( 0., 2., (dict_size, hidden_nodes ) )

bh = np.zeros( (hidden_nodes, 1) )
by = np.zeros( (dict_size, 1 ) )

In [17]:
# calculate loss, model weights gradients and return hidden state for propagation
def loss( inputs, targets, hprev ):
    '''
    input and target have len of seq_len
    '''
    
    xs, hraw, hs, ys, ps = {}, {}, {}, {}, {}
    hs[ -1 ] = hprev
    loss = 0.
    
    # forward pass
    for i in range( len( inputs ) ):
        
        # encode input character
        xs[i] = np.zeros( ( dict_size, 1 ) )
        xs[i][inputs[i]] = 1.
        
        hraw[i] = np.dot( Wxh, xs[i] ) + np.dot( Whh, hs[i-1] ) + bh
        hs[i] = np.maximum( hraw[i], 0. )
        ys[i] = np.dot( Wyh, hs[i] )
        
        # normalise probabilities
        ps[i] = np.exp( ys[i] ) / np.sum( np.exp( ys[i] ) )
        
        # softmax (cross-entropy loss)
        loss += -np.log( ps[i][targets[i],0] )

    dWxh, dWhh, dWhy = np.zeros_like( Wxh ), np.zeros_like( Whh ), np.zeros_like( Why )
    dbh, dby = np.zeros_like( bh ), np.zeros_like( by )
    dhnext = np.zeros_like( hs[0] )
    
    # backward pass: start from the end
    for i in reverse( range( len( inputs ) ) ):
        dy = np.copy( ps[i] )
        dy[targets[i]] -= 1.0    # backprop into y.  In the target state the ps[ target[i] ] == 1.0

        # recover weights
        dWhy += np.dot( dy, hs[i].T )
        dby += dy
        
        dh = np.dot( Why.T, dy) + dhnext # backprop into h
        dhtemp = np.zeros_like( dhnext )
        dhtemp[ hraw[i] > 0. ] = 1.
        dhraw = dhtemp * dh
        
        dbh += dhraw
        dWxh += np.dot( dhraw, xs[t].T )
        dWhh += np.dot( dhraw, hs[t-1].T )
        dhnext = np.dot( Whh.T, dhraw )
        
        # clip to mitigate exploding gradients
        for dparam in [ dWxh, dWhh, dWhy, dbh, dby ]:
            np.clip( dparam, -5, 5, out=dparam )
            
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[ len(inputs)-1 ]

In [23]:
# gradient validation by A Karpathy
from random import uniform

def gradCheck(inputs, targets, hprev):
    global Wxh, Whh, Why, bh, by

    num_checks, delta = 10, 1e-5

    # calculate gradients using backprop
    _, dWxh, dWhh, dWhy, dbh, dby, _ = loss( inputs, targets, hprev )

    for param, dparam, name in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], ['Wxh', 'Whh', 'Why', 'bh', 'by']):
        s0 = dparam.shape
        s1 = param.shape
        assert s0 == s1, 'Error dims dont match: %s and %s.' % (`s0`, `s1`)
        print( name )

        for i in xrange(num_checks):
            ri = int( uniform(0,param.size) )
            
            # evaluate cost at [x + delta] and [x - delta]
            old_val = param.flat[ri]
            param.flat[ri] = old_val + delta
            cg0, _, _, _, _, _, _ = lossFun( inputs, targets, hprev )
            param.flat[ri] = old_val - delta
            cg1, _, _, _, _, _, _ = lossFun( inputs, targets, hprev )
            param.flat[ri] = old_val # reset old value for this parameter
 
            # fetch both numerical and analytic gradient
            grad_analytic = dparam.flat[ri]
            grad_numerical = (cg0 - cg1) / ( 2 * delta )
            rel_error = abs( grad_analytic - grad_numerical ) / abs( grad_numerical + grad_analytic )

            print( '%f, %f => %e ' % ( grad_numerical, grad_analytic, rel_error) )
            # rel_error should be on order of 1e-7 or less

In [24]:
p = 0
inputs = [ char_to_x[ c ] for c in data[ p:p+seq_len ] ]
targets = [ char_to_x[ c ] for c in data[ p + 1:p+seq_len + 1 ] ]

hprev = np.zeros_like( bh )

print( data[ p:p+seq_len ], inputs )
print( data[ p + 1:p+seq_len + 1 ], targets )

# run gradianet check
gradCheck( inputs, target, hprev )


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-fcefbc1045a5> in <module>()
      9 
     10 # run gradianet check
---> 11 gradCheck( inputs, target, hprev )

<ipython-input-23-80920421cc0a> in gradCheck(inputs, targets, hprev)
      8 
      9     # calculate gradients using backprop
---> 10     _, dWxh, dWhh, dWhy, dbh, dby, _ = loss( inputs, targets, hprev )
     11 
     12     for param, dparam, name in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], ['Wxh', 'Whh', 'Why', 'bh', 'by']):

<ipython-input-17-451a4f3f97d5> in loss(inputs, targets, hprev)
     25 
     26         # softmax (cross-entropy loss)
---> 27         loss += -np.log( ps[i][targets[i],0] )
     28 
     29     dWxh, dWhh, dWhy = np.zeros_like( Wxh ), np.zeros_like( Whh ), np.zeros_like( Why )

ValueError: invalid literal for long() with base 10: 'r'
('Programming Bottom-Up\r\n\r\n', [13, 85, 53, 50, 85, 108, 113, 113, 111, 83, 50, 1, 67, 53, 26, 26, 53, 113, 93, 105, 24, 89, 59, 89, 59])
('rogramming Bottom-Up\r\n\r\n(', [85, 53, 50, 85, 108, 113, 113, 111, 83, 50, 1, 67, 53, 26, 26, 53, 113, 93, 105, 24, 89, 59, 89, 59, 3])

In [15]: