In [ ]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
import os
#thanks @keskarnitish
In [ ]:
start_token = " "
with open("names") as f:
    names = f.read()[:-1].split('\n')
names = [start_token+name for name in names]
In [ ]:
print ('n samples = ',len(names))
for x in names[::1000]:
    print(x)
In [ ]:
#all unique characters go here
tokens = <all unique characters in the dataset>
tokens = list(tokens)
print ('n_tokens = ',len(tokens))
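In [ ]:
# One possible way to fill the blank above (a sketch, not the reference solution):
# take every distinct character across all names, including the whitespace start token.
tokens = list(set(''.join(names)))
print('n_tokens =', len(tokens))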
In [ ]:
#!token_to_id = <dictionary of symbol -> its identifier (index in tokens list)>
token_to_id = {t:i for i,t in enumerate(tokens) }
#!id_to_token = < dictionary of symbol identifier -> symbol itself>
id_to_token = {i:t for i,t in enumerate(tokens)}
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(list(map(len,names)),bins=25);
# truncate names longer than roughly the 80th percentile of lengths
MAX_LEN = ?!
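In [ ]:
# A sketch of one way to choose MAX_LEN for the blank above: roughly the 80th
# percentile of name lengths (the exact cutoff is a judgment call).
MAX_LEN = int(np.percentile(list(map(len, names)), 80))
print('MAX_LEN =', MAX_LEN)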
In [ ]:
names_ix = list(map(lambda name: list(map(token_to_id.get,name)),names))
#crop long names and pad short ones
for i in range(len(names_ix)):
    names_ix[i] = names_ix[i][:MAX_LEN] #crop too long
    if len(names_ix[i]) < MAX_LEN:
        names_ix[i] += [token_to_id[" "]]*(MAX_LEN - len(names_ix[i])) #pad too short
assert len(set(map(len,names_ix)))==1
names_ix = np.array(names_ix)
In [ ]:
input_sequence = T.matrix('token sequence','int32')
target_values = T.matrix('actual next token','int32')
In [ ]:
from lasagne.layers import InputLayer,DenseLayer,EmbeddingLayer
from lasagne.layers import RecurrentLayer,LSTMLayer,GRULayer,CustomRecurrentLayer
In [ ]:
l_in = lasagne.layers.InputLayer(shape=(None, None),input_var=input_sequence)
#!<Your neural network>
l_emb = <embedding layer or one-hot encoding>
l_rnn = <some recurrent layer(or several such layers)>
#flatten batch and time to be compatible with feedforward layers (will un-flatten later)
l_rnn_flat = lasagne.layers.reshape(l_rnn, (-1,l_rnn.output_shape[-1]))
l_out = <last dense layer (or several layers), returning probabilities for all possible next tokens>
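In [ ]:
# A minimal sketch of one architecture that fits the template above; the layer
# sizes (16-dim embedding, 128 LSTM units) are arbitrary choices, not prescribed.
l_in = lasagne.layers.InputLayer(shape=(None, None), input_var=input_sequence)
l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=len(tokens), output_size=16)
l_rnn = lasagne.layers.LSTMLayer(l_emb, num_units=128)
# flatten (batch, time, units) -> (batch*time, units) for the dense softmax head
l_rnn_flat = lasagne.layers.reshape(l_rnn, (-1, l_rnn.output_shape[-1]))
l_out = lasagne.layers.DenseLayer(l_rnn_flat, num_units=len(tokens),
                                  nonlinearity=lasagne.nonlinearities.softmax)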
In [1]:
# Model weights
weights = lasagne.layers.get_all_params(l_out,trainable=True)
print( weights)
In [ ]:
network_output = <NN output via lasagne>
#If you use dropout do not forget to create deterministic version for evaluation
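In [ ]:
# Sketch for the blank above: the symbolic output of the network. If dropout
# were used, an evaluation version via get_output(l_out, deterministic=True)
# would be needed as well.
network_output = lasagne.layers.get_output(l_out)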
In [ ]:
predicted_probabilities_flat = network_output
correct_answers_flat = target_values.ravel()
loss = <Loss function - a simple categorical crossentropy will do, maybe add some regularizer>
updates = <your favorite optimizer>
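In [ ]:
# One possible loss/optimizer pair for the blanks above (a sketch; rmsprop,
# adagrad or plain sgd would work too).
loss = lasagne.objectives.categorical_crossentropy(
    predicted_probabilities_flat, correct_answers_flat).mean()
updates = lasagne.updates.adam(loss, weights)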
In [ ]:
#training
train = theano.function([input_sequence, target_values], loss, updates=updates, allow_input_downcast=True)
#computing loss without training
compute_cost = theano.function([input_sequence, target_values], loss, allow_input_downcast=True)
In [ ]:
#compile the function that computes probabilities for next token given previous text.
#reshape back into original shape
next_word_probas = network_output.reshape((input_sequence.shape[0],input_sequence.shape[1],len(tokens)))
#predictions for next tokens (after sequence end)
last_word_probas = next_word_probas[:,-1]
probs = theano.function([input_sequence],last_word_probas,allow_input_downcast=True)
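In [ ]:
# Quick sanity check (optional sketch): feed just the start token and make sure
# probs returns one probability per token in the vocabulary.
test_probs = probs(np.array([[token_to_id[start_token]]]))
assert test_probs.shape == (1, len(tokens))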
In [ ]:
def generate_sample(seed_phrase=None,N=MAX_LEN,t=1,n_snippets=1):
    '''
    Generates several text snippets starting from seed_phrase.
    parameters:
        seed_phrase - the initial phrase; defaults to start_token
        N - number of characters to generate after the seed
        t - temperature exponent applied to the predicted probabilities:
            higher t makes sampling greedier, lower t makes it more random
        n_snippets - how many independent samples to generate
    '''
    if seed_phrase is None:
        seed_phrase = start_token
    if len(seed_phrase) > MAX_LEN:
        seed_phrase = seed_phrase[-MAX_LEN:]
    assert type(seed_phrase) is str

    snippets = []
    for _ in range(n_snippets):
        sample_ix = []
        x = [token_to_id.get(c,0) for c in seed_phrase]
        x = np.array([x])

        for i in range(N):
            # Sample the next character from the predicted distribution,
            # sharpened (or flattened) by the temperature exponent t
            p = probs(x).ravel()
            p = p**t / np.sum(p**t)
            ix = np.random.choice(np.arange(len(tokens)),p=p)
            sample_ix.append(ix)
            # keep only the last MAX_LEN-1 tokens and append the new one
            x = np.hstack((x[:,-MAX_LEN+1:],[[ix]]))

        random_snippet = seed_phrase + ''.join(id_to_token[ix] for ix in sample_ix)
        snippets.append(random_snippet)

    print("----\n %s \n----" % '; '.join(snippets))
In [ ]:
def sample_batch(data, batch_size):
    # pick batch_size random names; the network input is each sequence without
    # its last token, the target is the same sequence shifted one step ahead
    rows = data[np.random.randint(0,len(data),size=batch_size)]
    return rows[:,:-1],rows[:,1:]
In [ ]:
print("Training ...")
#total N iterations
n_epochs=100
# how many minibatches are there in the epoch
batches_per_epoch = 500
#how many training sequences are processed in a single function call
batch_size=10
for epoch in range(n_epochs):
    print("Generated names")
    generate_sample(n_snippets=10)

    avg_cost = 0
    for _ in range(batches_per_epoch):
        x,y = sample_batch(names_ix,batch_size)
        avg_cost += train(x, y)

    print("Epoch {} average loss = {}".format(epoch, avg_cost / batches_per_epoch))
In [ ]:
generate_sample(n_snippets=100)
In [ ]:
generate_sample(seed_phrase=" A")