``````

In [ ]:

import numpy as np
import theano
import theano.tensor as T
import lasagne
import os

#thanks Muammar

``````

# Problem & Dataset

• We solve the problem of transcribing English words.
• word (sequence of letters) -> transcript (sequence of phonemes)
• The problem is that some letters correspond to several phonemes while others correspond to none.
• We solve it with encoder-decoder recurrent neural networks.
• This architecture is generally about converting ANY sequence into ANY other sequence. It could even become president one day.
``````

In [ ]:

with open("./train.csv") as fin:
ids,words,transcripts = zip(*[line.split(',') for line in list(fin)[1:]])
words = [word+"@" for word in words]
transcripts = [["START"]+ts[:-2].split()+["END"] for ts in transcripts]

``````
``````

In [ ]:

for word, trans in zip(words[:5], transcripts[:5]):
    print word, ':', trans

``````

# Tokenization

• Same as before, only now we do this separately for words and transcripts
``````

In [ ]:

phonemes = list(set([token for ts in transcripts for token in ts]))
phoneme_to_ix = {ph:i for i,ph in enumerate(phonemes)}

``````
``````

In [ ]:

letters = list(set([token for word in words for token in word]))
letter_to_ix = {l:i for i,l in enumerate(letters)}

``````
``````

In [ ]:

import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(list(map(len, transcripts)), bins=25);

# truncate transcripts longer than MAX_LEN tokens.
MAX_LEN = min([60,max(list(map(len,transcripts)))])
#ADJUST IF YOU ARE UP TO SOMETHING SERIOUS

``````

### Cast everything from symbols into a matrix of int32. Pad with -1

``````

In [ ]:

def as_matrix(sequences, token_to_i, max_len=None):
    """Convert a list of token sequences into an int32 matrix, padded with -1."""
    max_len = max_len or max(map(len, sequences))

    matrix = np.zeros((len(sequences), max_len), dtype='int32') - 1
    for i, seq in enumerate(sequences):
        row_ix = map(token_to_i.get, seq)[:max_len]
        matrix[i, :len(row_ix)] = row_ix

    return matrix

``````
``````

In [ ]:

print as_matrix(words[:10],letter_to_ix)

``````

# Input variables

``````

In [ ]:

input_sequence = T.matrix('token sequence','int32')
target_phonemes = T.matrix('target phonemes','int32')

``````

# Build NN

You will be building a model that takes a token sequence and predicts the next token:

• input sequence
• one-hot / embedding
• recurrent layer(s)
• output layer(s) that predict output probabilities
``````

In [ ]:

from lasagne.layers import InputLayer,DenseLayer,EmbeddingLayer
from lasagne.layers import RecurrentLayer,LSTMLayer,GRULayer,CustomRecurrentLayer

``````
``````

In [ ]:

##ENCODER
l_in = lasagne.layers.InputLayer(shape=(None, None), input_var=input_sequence)
l_emb = lasagne.layers.EmbeddingLayer(l_in, len(letters), 40)
l_rnn = <encoder recurrent layer that summarizes the whole word>

##DECODER
transc_in = lasagne.layers.InputLayer(shape=(None, None), input_var=target_phonemes)
transc_emb = lasagne.layers.EmbeddingLayer(transc_in, len(phonemes), 50)
transc_rnn = <decoder recurrent layer, initialized with the encoder's final state>

#flatten batch and time to be compatible with feedforward layers (will un-flatten later)
transc_rnn_flat = lasagne.layers.reshape(transc_rnn, (-1, transc_rnn.output_shape[-1]))

l_out = lasagne.layers.DenseLayer(transc_rnn_flat, len(phonemes), nonlinearity=lasagne.nonlinearities.softmax)

``````
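If you get stuck on the two placeholders above, here is a minimal sketch of one possible wiring, assuming a GRU on both sides with 128 hidden units (the unit count is an illustrative choice, not a requirement):

``````

#one possible wiring (a sketch, not the only correct answer):
#the encoder GRU reads the word and returns only its final hidden state,
#which then becomes the initial hidden state of the decoder GRU.
l_rnn = lasagne.layers.GRULayer(l_emb, num_units=128,       #128 units: arbitrary choice
                                only_return_final=True)

transc_rnn = lasagne.layers.GRULayer(transc_emb, num_units=128,
                                     hid_init=l_rnn)

``````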
``````

In [ ]:

# Model weights
weights = lasagne.layers.get_all_params(l_out,trainable=True)
print weights

``````
``````

In [ ]:

network_output = lasagne.layers.get_output(l_out)
network_output = <reshape to [batch_i, time_tick, number_of_phonemes] symbolically>
#If you use dropout do not forget to create deterministic version for evaluation

``````
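One possible way to do that reshape, assuming (as in the architecture above) that the dense layer was applied to the flattened decoder states, so the time axis has the same length as `target_phonemes`:

``````

network_output = lasagne.layers.get_output(l_out)
#un-flatten from [batch*time, n_phonemes] back to [batch, time, n_phonemes]
network_output = network_output.reshape([target_phonemes.shape[0],
                                         target_phonemes.shape[1],
                                         len(phonemes)])

``````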
``````

In [ ]:

predictions_flat = network_output[:,:-1,:].reshape([-1,len(phonemes)])
targets_flat = target_phonemes[:,1:].ravel()

#do not count loss for '-1' tokens

``````
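A sketch of one way to get a masked loss, assuming `-1` is the padding value produced by `as_matrix`:

``````

#mask out padding: positions where the target is -1 must not contribute to the loss
mask = T.neq(targets_flat, -1).astype(theano.config.floatX)

#clip targets to a valid index; masked positions are zeroed out anyway
loss_per_token = lasagne.objectives.categorical_crossentropy(
    predictions_flat, T.maximum(targets_flat, 0))

loss = (loss_per_token * mask).sum() / mask.sum()

``````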

# Compiling it

``````

In [ ]:

#training
train = <a function that computes the loss AND updates the weights>

#computing loss without training
compute_cost = theano.function([input_sequence, target_phonemes], loss, allow_input_downcast=True)

``````
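A possible way to compile the training step, assuming the `loss` and `weights` defined above; the choice of Adam and the learning rate are arbitrary here:

``````

updates = lasagne.updates.adam(loss, weights, learning_rate=1e-3)  #optimizer choice is illustrative

#computes the loss AND updates the weights in a single call
train = theano.function([input_sequence, target_phonemes], loss,
                        updates=updates, allow_input_downcast=True)

``````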

# generation

Simple:

• get the initial context (seed),
• predict next token probabilities,
• sample the next token,
• add it to the context,
• repeat from step 2.

You'll get more detailed info on how this works in the homework section.

``````

In [ ]:

#compile the function that computes probabilities for next token given previous text.

network_output = <network output reshaped to [batch,tick,phoneme] format>

last_word_probas = <a matrix [batch_i, n_phonemes], counting all phonemes>

probs = <a function that predicts probabilities coming after the last token>

``````
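A minimal sketch of how the three placeholders above could be filled, reusing the reshape trick from before; `last_word_probas` holds the probabilities of the phoneme that follows the last one already in the transcript:

``````

network_output = lasagne.layers.get_output(l_out).reshape(
    [target_phonemes.shape[0], target_phonemes.shape[1], len(phonemes)])

#probabilities of the phoneme that comes after the last given one
last_word_probas = network_output[:, -1]

probs = theano.function([input_sequence, target_phonemes], last_word_probas,
                        allow_input_downcast=True)

``````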
``````

In [ ]:

def generate_transcript(word, transcript_prefix=("START",), END_phoneme="END",
                        temperature=1, sample=True):

    transcript = list(transcript_prefix)
    while True:
        next_phoneme_probs = <a vector of probabilities of the next token>
        next_phoneme_probs = <maybe apply temperature>

        if sample:
            next_phoneme = <phoneme sampled with these probabilities (string character)>
        else:
            next_phoneme = <most likely phoneme>

        transcript.append(next_phoneme)

        if next_phoneme == END_phoneme:
            break
    return transcript

``````
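In case you want a reference point, here is one possible completion of the function above; it assumes the `probs` function, the `as_matrix` helper and the `phonemes` / `phoneme_to_ix` / `letter_to_ix` mappings from the previous cells:

``````

def generate_transcript(word, transcript_prefix=("START",), END_phoneme="END",
                        temperature=1, sample=True):

    transcript = list(transcript_prefix)
    while True:
        #probabilities of the next phoneme given the word and the prefix so far
        next_phoneme_probs = probs(as_matrix([word], letter_to_ix),
                                   as_matrix([transcript], phoneme_to_ix))[0]

        #temperature: <1 sharpens the distribution, >1 flattens it
        next_phoneme_probs = next_phoneme_probs ** (1. / temperature)
        next_phoneme_probs /= next_phoneme_probs.sum()

        if sample:
            next_phoneme = np.random.choice(phonemes, p=next_phoneme_probs)
        else:
            next_phoneme = phonemes[next_phoneme_probs.argmax()]

        transcript.append(next_phoneme)

        if next_phoneme == END_phoneme:
            break
    return transcript

``````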

# Model training

Here you can tweak parameters or insert your generation function.

Once something transcript-like starts generating, try increasing MAX_LEN.

``````

In [ ]:

words = np.array(words)
transcripts = np.array(transcripts)

``````
``````

In [ ]:

def sample_batch(words, transcripts, batch_size):

    <sample random batch of words and transcripts>
    <convert both into network-edible format (as_matrix)>

    return words_batch, transcripts_batch

``````
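A possible implementation, assuming `words` and `transcripts` are the numpy arrays built above and `as_matrix` is the padding helper:

``````

def sample_batch(words, transcripts, batch_size):
    #pick batch_size random examples
    batch_ix = np.random.randint(0, len(words), size=batch_size)

    #convert them into padded integer matrices
    words_batch = as_matrix(words[batch_ix], letter_to_ix)
    transcripts_batch = as_matrix(transcripts[batch_ix], phoneme_to_ix)

    return words_batch, transcripts_batch

``````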
``````

In [ ]:

from tqdm import tqdm

print("Training ...")

#total N iterations
n_epochs=100

# how many minibatches are there in the epoch
batches_per_epoch = 500

#how many training sequences are processed in a single function call
batch_size=10

for epoch in range(n_epochs):

    avg_cost = 0

    for _ in tqdm(range(batches_per_epoch)):
        x, y = sample_batch(words, transcripts, batch_size)
        avg_cost += train(x, y).mean()

    print("Epoch {} average loss = {}".format(epoch, avg_cost / batches_per_epoch))
    for i in range(5):
        ind = np.random.randint(len(words))
        print words[ind], ':', ' '.join(generate_transcript(words[ind], sample=False)[1:-1])

``````
``````

In [ ]:

``````

# And now,

• try LSTM/GRU
• try several layers
• try MTG cards
• try your own dataset of any kind
``````

In [ ]:

``````