In [1]:
# Add tools
# NOTE: This should only be needed if you do not store the notebook on the lxmls root
import sys
sys.path.append('../../')
from pdb import set_trace
In [2]:
# Location of Part-of-Speech WSJ Data
WSJ_TRAIN = "../../data/train-02-21.conll"
WSJ_TEST = "../../data/test-23.conll"
WSJ_DEV = "../../data/dev-22.conll"
In [3]:
# Load Part-of-Speech data
import lxmls.readers.pos_corpus as pcc
corpus = pcc.PostagCorpus()
train_seq = corpus.read_sequence_list_conll(WSJ_TRAIN, max_sent_len=15, max_nr_sent=1000)
test_seq = corpus.read_sequence_list_conll(WSJ_TEST, max_sent_len=15, max_nr_sent=1000)
dev_seq = corpus.read_sequence_list_conll(WSJ_DEV, max_sent_len=15, max_nr_sent=1000)
# Redo indices so that they are consecutive. Also cast all data to numpy arrays
# of int32 for compatibility with GPUs and theano and add reverse index
train_seq, test_seq, dev_seq = pcc.compacify(train_seq, test_seq, dev_seq, theano=True)
# Get number of words and tags in the corpus
nr_words = len(train_seq.x_dict)
nr_tags = len(train_seq.y_dict)
Model configuration
In [4]:
import lxmls.deep_learning.rnn as rnns
reload(rnns)
Out[4]:
In [5]:
# RNN configuration
SEED = 1234 # Random seed to initialize weights
emb_size = 50 # Size of word embeddings
hidden_size = 20 # Size of hidden layer
In [6]:
np_rnn = rnns.NumpyRNN(nr_words, emb_size, hidden_size, nr_tags, seed=SEED)
In [7]:
x0 = train_seq[0].x
y0 = train_seq[0].y
In [8]:
# Forward pass
p_y, y_rnn, h, z1, x = np_rnn.forward(x0, all_outputs=True)
# Compute gradients
numpy_rnn_gradients = np_rnn.grads(x0, y0)
Understand the basics of scan with the following examples. Scan allows you to build computation graphs with a variable number of nodes: it acts like a Python "for" loop, but it is symbolic. The first example should help you understand the basic scan functionality. It generates a sequence of a given length (here, the squares of the first nr_steps integers). Run it and modify it; try to provoke an error.
In [9]:
import numpy as np
import theano
import theano.tensor as T
theano.config.optimizer='None'
In [10]:
def square(x):
    return x**2

# Python
def np_square_n_steps(nr_steps):
    out = []
    for n in np.arange(nr_steps):
        out.append(square(n))
    return np.array(out)
# Theano
nr_steps = T.lscalar('nr_steps')
h, _ = theano.scan(fn=square, sequences=T.arange(nr_steps))
th_square_n_steps = theano.function([nr_steps], h)
In [11]:
print np_square_n_steps(10)
print th_square_n_steps(10)
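As a small optional variation on the example above (a sketch, not part of the original exercise): scan can also iterate over the entries of an explicit input vector instead of T.arange. The names v, h2 and th_square_vector below are only used for this illustration.
In [ ]:
# Optional sketch: scan over an explicit input vector
v = T.dvector('v')
h2, _ = theano.scan(fn=square, sequences=v)
th_square_vector = theano.function([v], h2)
print th_square_vector(np.array([1., 2., 3., 4.]))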
The following example should help you understand matrix multiplications and passing values from one iteration to the next. At each step we multiply the output of the previous step by a matrix A, starting from an initial vector s0. The matrix and vector are random, but normalized so that they define a Markov chain.
In [12]:
# Configuration
nr_states = 3
nr_steps = 5
# Transition matrix
A = np.abs(np.random.randn(nr_states, nr_states))
A = A/A.sum(0, keepdims=True)
# Initial state
s0 = np.zeros(nr_states)
s0[0] = 1
In [13]:
# Numpy version
def np_markov_step(s_tm1):
    s_t = np.dot(s_tm1, A.T)
    return s_t

def np_markov_chain(nr_steps, A, s0):
    # Pre-allocate space
    s = np.zeros((nr_steps+1, nr_states))
    s[0, :] = s0
    for t in np.arange(nr_steps):
        s[t+1, :] = np_markov_step(s[t, :])
    return s
In [14]:
np_markov_chain(nr_steps, A, s0)
Out[14]:
In [15]:
# Theano version
# Store variables as shared variables
th_A = theano.shared(A, name='A', borrow=True)
th_s0 = theano.shared(s0, name='s0', borrow=True)
# Symbolic variable for the number of steps
th_nr_steps = T.lscalar('nr_steps')
def th_markov_step(s_tm1):
    s_t = T.dot(s_tm1, th_A.T)
    # Remember to name variables
    s_t.name = 's_t'
    return s_t

s, _ = theano.scan(th_markov_step,
                   outputs_info=[dict(initial=th_s0)],
                   n_steps=th_nr_steps)
th_markov_chain = theano.function([th_nr_steps], T.concatenate((th_s0[None, :], s), 0))
In [16]:
th_markov_chain(nr_steps)
Out[16]:
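A quick optional sanity check (assuming both versions above were run): the numpy and Theano chains should produce the same sequence of state distributions.
In [ ]:
# Optional check: the two implementations should agree up to numerical precision
assert np.allclose(np_markov_chain(nr_steps, A, s0), th_markov_chain(nr_steps)), \
    "Numpy and Theano Markov chains differ!"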
In [17]:
rnn = rnns.RNN(nr_words, emb_size, hidden_size, nr_tags, seed=SEED)
In [18]:
# Compile theano function
x = T.ivector('x')
th_forward = theano.function([x], rnn._forward(x).T)
When working with Theano it is more difficult to localize the source of errors. It is therefore extremely important to work step by step and test the code frequently. To debug, we suggest implementing and compiling the forward pass first. You can use the following code for testing; if it raises no error, you are good to go.
In [19]:
assert np.allclose(th_forward(x0), np_rnn.forward(x0)), \
    "Numpy and Theano forward pass differ!"
Once you are confident the forward pass is working, you can test the gradients.
In [20]:
# Compile function returning the list of gradients
x = T.ivector('x') # Input words
y = T.ivector('y') # gold tags
p_y = rnn._forward(x)
cost = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])
grads_fun = theano.function([x, y], [T.grad(cost, par) for par in rnn.param])
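Two optional, hedged sketches before comparing against the numpy gradients. First, the cost above is the average negative log-probability of the gold tags, so you can compile it on its own and print its value on the first sentence. Second, you can compare one analytic gradient entry against a central finite-difference estimate; this assumes rnn.param[0] is a Theano shared variable (as it is used elsewhere in this notebook), and the names cost_fun, eps and the entry (0, 0) are arbitrary choices made only for this illustration.
In [ ]:
# Optional sketch 1: evaluate the cost on the first training sentence
cost_fun = theano.function([x, y], cost)
print cost_fun(x0, y0)

# Optional sketch 2: finite-difference check of a single gradient entry
par = rnn.param[0]              # first parameter (a shared variable)
W = par.get_value()             # get_value() returns a copy by default
eps = 1e-4
idx = (0, 0)                    # arbitrary entry to check
analytic = grads_fun(x0, y0)[0][idx]
W_pert = W.copy()
W_pert[idx] += eps
par.set_value(W_pert)
cost_plus = cost_fun(x0, y0)
W_pert[idx] -= 2*eps
par.set_value(W_pert)
cost_minus = cost_fun(x0, y0)
par.set_value(W)                # restore the original parameter value
print analytic, (cost_plus - cost_minus) / (2*eps)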
In [21]:
# Compare numpy and theano gradients
theano_rnn_gradients = grads_fun(x0, y0)
for n in range(len(theano_rnn_gradients)):
    assert np.allclose(numpy_rnn_gradients[n], theano_rnn_gradients[n]), \
        "Numpy and Theano gradients differ for parameter %d" % n
Finally, it's time to test our network! For this, let's first compile a function that makes predictions.
In [22]:
rnn_prediction = theano.function([x], T.argmax(p_y, 1))
# Let's test the predictions
def test_model(sample_seq, rnn_prediction):
    words = [train_seq.word_dict[wrd] for wrd in sample_seq.x]
    tags = [train_seq.tag_dict[pred] for pred in rnn_prediction(sample_seq.x)]
    print ["/".join([word, tag]) for word, tag in zip(words, tags)]
In [23]:
test_model(train_seq[0], rnn_prediction)
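The prediction function is not tied to the training sentences. As an optional usage example (assuming, as the evaluation loop below does, that the development data shares the same dictionaries after compacify), you can tag a development sentence as well; the model is still untrained, so expect mostly wrong tags.
In [ ]:
# Optional: tag the first development sentence with the (still untrained) model
test_model(dev_seq[0], rnn_prediction)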
Now let's define the optimization parameters and compile a batch update function.
In [24]:
lrate = 0.5
n_iter = 5
In [25]:
# Get list of SGD batch update rule for each parameter
updates = [(par, par - lrate*T.grad(cost, par)) for par in rnn.param]
# compile
rnn_batch_update = theano.function([x, y], cost, updates=updates)
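Each entry of updates pairs a parameter with its SGD update rule, par - lrate * grad. As an optional sketch (assuming rnn.param is a list of Theano shared variables, as used above), you can list the parameters that will be updated together with their shapes.
In [ ]:
# Optional: list the parameters touched by the SGD updates
# (a parameter's name may print as None if it was not named)
for par in rnn.param:
    print par.name, par.get_value().shape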
Finally, it is time to run SGD. You can use the following code for this purpose.
In [26]:
nr_train_words = sum([len(seq.x) for seq in train_seq])
nr_dev_words = sum([len(seq.x) for seq in dev_seq])
for i in range(n_iter):
    # Training
    cost = 0
    errors = 0
    for n, seq in enumerate(train_seq):
        cost += rnn_batch_update(seq.x, seq.y)
        errors += sum(rnn_prediction(seq.x) != seq.y)
    acc_train = 100*(1 - errors*1./nr_train_words)
    print "Epoch %d: Train cost %2.2f Acc %2.2f %%" % (i+1, cost, acc_train),
    # Evaluation
    errors = 0
    for n, seq in enumerate(dev_seq):
        errors += sum(rnn_prediction(seq.x) != seq.y)
    acc_dev = 100*(1 - errors*1./nr_dev_words)
    print " Devel Acc %2.2f %%" % acc_dev
    sys.stdout.flush()
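After training, it is instructive to re-run the prediction test from above: the tags for the example sentence should now look far more reasonable than they did before training.
In [ ]:
# Re-tag the first training sentence with the trained model
test_model(train_seq[0], rnn_prediction)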
Test the effect of using pre-trained embeddings. Run the following code to download the embeddings, reset the model parameters, and initialize the embedding layer with the pre-trained values. Then re-run the training code above.
In [27]:
# Embeddings Path
EMBEDDINGS = "../../data/senna_50"
import lxmls.deep_learning.embeddings as emb
import os
reload(emb)
if not os.path.isfile(EMBEDDINGS):
    emb.download_embeddings('senna_50', EMBEDDINGS)
E = emb.extract_embeddings(EMBEDDINGS, train_seq.x_dict)
# Reset model to remove the effect of training
rnn = rnns.reset_model(rnn, seed=SEED)
# Set the embedding layer to the pre-trained values
rnn.param[0].set_value(E.astype(theano.config.floatX))
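An optional, hedged check: the extracted embedding matrix should cover the whole vocabulary with emb_size-dimensional vectors (the exact orientation of E depends on extract_embeddings), so printing its shape next to the vocabulary size and embedding size is a quick sanity check.
In [ ]:
# Optional: compare the embedding matrix shape with the model configuration
print E.shape, len(train_seq.x_dict), emb_size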
In [28]:
lstm = rnns.LSTM(nr_words, emb_size, hidden_size, nr_tags)
lstm_prediction = theano.function([x], T.argmax(lstm._forward(x), 1))
lstm_cost = -T.mean(T.log(lstm._forward(x))[T.arange(y.shape[0]), y])
In [29]:
# Get list of SGD batch update rule for each parameter
lstm_updates = [(par, par - lrate*T.grad(lstm_cost, par)) for par in lstm.param]
# compile
lstm_batch_update = theano.function([x, y], lstm_cost, updates=lstm_updates)
In [30]:
nr_train_words = sum([len(seq.x) for seq in train_seq])
nr_dev_words = sum([len(seq.x) for seq in dev_seq])
for i in range(n_iter):
    # Training
    cost = 0
    errors = 0
    for n, seq in enumerate(train_seq):
        cost += lstm_batch_update(seq.x, seq.y)
        errors += sum(lstm_prediction(seq.x) != seq.y)
    acc_train = 100*(1 - errors*1./nr_train_words)
    print "Epoch %d: Train cost %2.2f Acc %2.2f %%" % (i+1, cost, acc_train),
    # Evaluation
    errors = 0
    for n, seq in enumerate(dev_seq):
        errors += sum(lstm_prediction(seq.x) != seq.y)
    acc_dev = 100*(1 - errors*1./nr_dev_words)
    print " Devel Acc %2.2f %%" % acc_dev
    sys.stdout.flush()
In [ ]: