Character-level RNN training in pure NumPy, based on the work of A. Karpathy (min-char-rnn), modified to use ReLU instead of the logistic activation function.
In [10]:
import numpy as np
# read full file
data = open( 'pg_essays.txt', 'rt' ).read()
# build the dictionary: every unique character in the input data gets an integer index (used for one-hot coding later)
dict_chars = list( set( data ) )
data_size = len( data )
dict_size = len( dict_chars )
# character encoding
char_to_x = { c:i for i, c in enumerate( dict_chars ) }
x_to_char = { i:c for i, c in enumerate( dict_chars ) }
print( 'data size: {0}, dictionary size: {1}'.format( data_size, dict_size ) )
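A quick sanity check of the two lookup tables is to round-trip a short snippet of the data; a minimal sketch (the slice length 20 is an arbitrary choice, not from the original):
snippet = data[ :20 ]
encoded = [ char_to_x[ c ] for c in snippet ]
decoded = ''.join( x_to_char[ i ] for i in encoded )
print( encoded )
assert decoded == snippet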
In [11]:
# net structure
hidden_nodes = 100
seq_len = 25
learning_rate = 1e-1
RNN forward equations with softmax normalisation, where $f$ is the ReLU activation, $f(z) = \max(z, 0)$:
$$ h' = f( W_{hh} h + W_{xh} x + b_h ) $$
$$ y = W_{hy} h' + b_y $$
$$ p = \left[ \frac{e^{y_1}}{\sum_i e^{y_i}}, \dots, \frac{e^{y_n}}{\sum_i e^{y_i}} \right]^T $$
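The quantity minimised during training is the cross-entropy of the target character under $p$, summed over the sequence (this matches the `loss +=` line in the code below):
$$ L = -\sum_{t=1}^{T} \log p_t[\, \mathrm{target}_t \,] $$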
In [12]:
# model weights - drawn initially from a Gaussian distribution
Whh = np.random.normal( 0., 2., (hidden_nodes, hidden_nodes) )
Wxh = np.random.normal( 0., 2., (hidden_nodes, dict_size ) )
Why = np.random.normal( 0., 2., (dict_size, hidden_nodes ) )
bh = np.zeros( (hidden_nodes, 1) )
by = np.zeros( (dict_size, 1 ) )
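For reference, a single forward step corresponding to the equations above can be written as follows; a minimal sketch (the function name is illustrative, and the max-shift inside the softmax is an extra numerical-stability trick, not part of the loss function that follows):
def forward_step( x, h ):
    # h' = ReLU( Whh.h + Wxh.x + bh )
    h_new = np.maximum( np.dot( Whh, h ) + np.dot( Wxh, x ) + bh, 0. )
    # y = Why.h' + by
    y = np.dot( Why, h_new ) + by
    # p = softmax( y ); shifting by max( y ) avoids overflow in exp
    e = np.exp( y - np.max( y ) )
    return h_new, e / np.sum( e )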
In [17]:
# calculate the loss and weight gradients, and return the last hidden state for propagation
def loss( inputs, targets, hprev ):
    '''
    inputs and targets are lists of character indices of length seq_len;
    hprev is the hidden state carried over from the previous chunk
    '''
    xs, hraw, hs, ys, ps = {}, {}, {}, {}, {}
    hs[ -1 ] = hprev
    loss = 0.
    # forward pass
    for i in range( len( inputs ) ):
        # encode input character as a one-hot column vector
        xs[i] = np.zeros( ( dict_size, 1 ) )
        xs[i][inputs[i]] = 1.
        hraw[i] = np.dot( Wxh, xs[i] ) + np.dot( Whh, hs[i-1] ) + bh
        hs[i] = np.maximum( hraw[i], 0. )  # ReLU activation
        ys[i] = np.dot( Why, hs[i] ) + by
        # softmax: normalise scores into probabilities
        ps[i] = np.exp( ys[i] ) / np.sum( np.exp( ys[i] ) )
        # cross-entropy loss of the target character
        loss += -np.log( ps[i][targets[i],0] )
    dWxh, dWhh, dWhy = np.zeros_like( Wxh ), np.zeros_like( Whh ), np.zeros_like( Why )
    dbh, dby = np.zeros_like( bh ), np.zeros_like( by )
    dhnext = np.zeros_like( hs[0] )
    # backward pass: start from the end
    for i in reversed( range( len( inputs ) ) ):
        dy = np.copy( ps[i] )
        dy[targets[i]] -= 1.0  # backprop into y: the target probability should be 1.0
        # output-layer gradients
        dWhy += np.dot( dy, hs[i].T )
        dby += dy
        dh = np.dot( Why.T, dy ) + dhnext  # backprop into h
        # backprop through the ReLU: gradient passes only where hraw > 0
        dhtemp = np.zeros_like( dhnext )
        dhtemp[ hraw[i] > 0. ] = 1.
        dhraw = dhtemp * dh
        dbh += dhraw
        dWxh += np.dot( dhraw, xs[i].T )
        dWhh += np.dot( dhraw, hs[i-1].T )
        dhnext = np.dot( Whh.T, dhraw )
    # clip to mitigate exploding gradients
    for dparam in [ dWxh, dWhh, dWhy, dbh, dby ]:
        np.clip( dparam, -5, 5, out=dparam )
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[ len( inputs ) - 1 ]
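Once trained, the model can generate text by repeatedly sampling the next character from p and feeding it back in as the next input. A minimal sketch adapted from Karpathy's sample() routine, using the ReLU hidden update from this notebook (the function name and argument order are illustrative):
def sample( h, seed_ix, n ):
    ''' sample n character indices starting from seed_ix, carrying hidden state h '''
    x = np.zeros( ( dict_size, 1 ) )
    x[ seed_ix ] = 1.
    ixes = []
    for _ in range( n ):
        h = np.maximum( np.dot( Wxh, x ) + np.dot( Whh, h ) + bh, 0. )
        y = np.dot( Why, h ) + by
        p = np.exp( y ) / np.sum( np.exp( y ) )
        ix = np.random.choice( range( dict_size ), p=p.ravel() )
        x = np.zeros( ( dict_size, 1 ) )
        x[ ix ] = 1.
        ixes.append( ix )
    return ''.join( x_to_char[ i ] for i in ixes )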
In [23]:
# gradient validation by A Karpathy
from random import uniform
def gradCheck( inputs, targets, hprev ):
    global Wxh, Whh, Why, bh, by
    num_checks, delta = 10, 1e-5
    # calculate gradients using backprop
    _, dWxh, dWhh, dWhy, dbh, dby, _ = loss( inputs, targets, hprev )
    for param, dparam, name in zip( [Wxh, Whh, Why, bh, by],
                                    [dWxh, dWhh, dWhy, dbh, dby],
                                    ['Wxh', 'Whh', 'Why', 'bh', 'by'] ):
        s0 = dparam.shape
        s1 = param.shape
        assert s0 == s1, 'Error: dims do not match: %s and %s.' % ( s0, s1 )
        print( name )
        for i in range( num_checks ):
            ri = int( uniform( 0, param.size ) )
            # evaluate cost at [x + delta] and [x - delta]
            old_val = param.flat[ri]
            param.flat[ri] = old_val + delta
            cg0, _, _, _, _, _, _ = loss( inputs, targets, hprev )
            param.flat[ri] = old_val - delta
            cg1, _, _, _, _, _, _ = loss( inputs, targets, hprev )
            param.flat[ri] = old_val  # reset old value for this parameter
            # compare numerical and analytic gradients
            grad_analytic = dparam.flat[ri]
            grad_numerical = ( cg0 - cg1 ) / ( 2 * delta )
            rel_error = abs( grad_analytic - grad_numerical ) / abs( grad_numerical + grad_analytic )
            print( '%f, %f => %e ' % ( grad_numerical, grad_analytic, rel_error ) )
            # rel_error should be on the order of 1e-7 or less
In [24]:
p = 0
inputs = [ char_to_x[ c ] for c in data[ p:p+seq_len ] ]
targets = [ char_to_x[ c ] for c in data[ p + 1:p+seq_len + 1 ] ]
hprev = np.zeros_like( bh )
print( data[ p:p+seq_len ], inputs )
print( data[ p + 1:p+seq_len + 1 ], targets )
# run gradient check
gradCheck( inputs, targets, hprev )
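The remaining piece is the training loop itself. A minimal sketch of how it could look, using the Adagrad update from Karpathy's min-char-rnn (the iteration count, loss-smoothing factor and print interval are illustrative choices, not from the original):
# memory variables for Adagrad
mWxh, mWhh, mWhy = np.zeros_like( Wxh ), np.zeros_like( Whh ), np.zeros_like( Why )
mbh, mby = np.zeros_like( bh ), np.zeros_like( by )
smooth_loss = -np.log( 1.0 / dict_size ) * seq_len  # expected loss at iteration 0
n, p = 0, 0
while n < 100000:
    # reset the hidden state and the data pointer at the start of each pass over the data
    if p + seq_len + 1 >= data_size or n == 0:
        hprev = np.zeros( ( hidden_nodes, 1 ) )
        p = 0
    inputs = [ char_to_x[ c ] for c in data[ p:p + seq_len ] ]
    targets = [ char_to_x[ c ] for c in data[ p + 1:p + seq_len + 1 ] ]
    cur_loss, dWxh, dWhh, dWhy, dbh, dby, hprev = loss( inputs, targets, hprev )
    smooth_loss = smooth_loss * 0.999 + cur_loss * 0.001
    if n % 1000 == 0:
        print( 'iter {0}, loss: {1:.4f}'.format( n, smooth_loss ) )
    # Adagrad parameter update
    for param, dparam, mem in zip( [ Wxh, Whh, Why, bh, by ],
                                   [ dWxh, dWhh, dWhy, dbh, dby ],
                                   [ mWxh, mWhh, mWhy, mbh, mby ] ):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt( mem + 1e-8 )
    p += seq_len
    n += 1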
In [15]: