In [1]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
import os
In [2]:
start_token = " "

with open("names") as f:
    names = f.read()[:-1].split('\n')

names = [start_token + name for name in names]
In [3]:
print('n samples =', len(names))
for x in names[::1000]:
    print(x)
In [4]:
# all unique characters go here
token_set = set()
for name in names:
    for letter in name:
        token_set.add(letter)

tokens = list(token_set)
print('n_tokens =', len(tokens))
In [5]:
# token_to_id: symbol -> its identifier (index in the tokens list)
token_to_id = {t: i for i, t in enumerate(tokens)}

# id_to_token: symbol identifier -> the symbol itself
id_to_token = {i: t for i, t in enumerate(tokens)}
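As a quick check of the two mappings, a name can be encoded into ids and decoded back; the cell below is a minimal sketch that only assumes the variables defined above.
In [ ]:
# round-trip one name through the dictionaries (illustrative check)
example = names[0]
encoded = [token_to_id[c] for c in example]
decoded = ''.join(id_to_token[i] for i in encoded)
print(encoded)
assert decoded == example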
In [6]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.hist(list(map(len, names)), bins=25);

# crop names longer than MAX_LEN characters
# (adjust if you are up to something serious)
MAX_LEN = min(60, max(map(len, names)))
In [7]:
names_ix = list(map(lambda name: list(map(token_to_id.get, name)), names))

# crop long names and pad short ones
for i in range(len(names_ix)):
    names_ix[i] = names_ix[i][:MAX_LEN]  # crop too long
    if len(names_ix[i]) < MAX_LEN:
        names_ix[i] += [token_to_id[" "]] * (MAX_LEN - len(names_ix[i]))  # pad too short

assert len(set(map(len, names_ix))) == 1
names_ix = np.array(names_ix)
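A quick look at the result (a minimal illustrative cell, assuming the variables above): every row is one name, cropped or padded with the id of the space token to exactly MAX_LEN columns.
In [ ]:
# the matrix should be (n_names, MAX_LEN); decoding a row gives the name plus trailing spaces
print(names_ix.shape)
print(''.join(id_to_token[i] for i in names_ix[0]))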
In [8]:
from agentnet import Recurrence
from lasagne.layers import *
from agentnet.memory import *
from agentnet.resolver import ProbabilisticResolver
In [9]:
sequence = T.matrix('token sequence', 'int64')

# the network learns to predict the next character: inputs are all but the last
# token of each sequence, targets are the same sequence shifted one step left
inputs = sequence[:, :-1]
targets = sequence[:, 1:]

l_input_sequence = InputLayer(shape=(None, None), input_var=inputs)
In [10]:
### One step of the RNN
class step:
    # inputs
    inp = InputLayer((None,), name='current character')
    h_prev = InputLayer((None, 10), name='previous rnn state')

    # recurrent part
    emb = EmbeddingLayer(inp, len(tokens), 30, name='emb')
    h_new = RNNCell(h_prev, emb, name="rnn")  # just concat -> DenseLayer
    next_token_probas = DenseLayer(h_new, len(tokens), nonlinearity=T.nnet.softmax)

    # pick the next token by sampling from the predicted probabilities
    next_token = ProbabilisticResolver(next_token_probas)
In [11]:
training_loop = Recurrence(
    state_variables={step.h_new: step.h_prev},
    input_sequences={step.inp: l_input_sequence},
    tracked_outputs=[step.next_token_probas],
    unroll_scan=False,
)
In [12]:
# Model weights
weights = lasagne.layers.get_all_params(training_loop, trainable=True)
print(weights)
In [13]:
predicted_probabilities = lasagne.layers.get_output(training_loop[step.next_token_probas])
# If you use dropout, do not forget to create a deterministic version for evaluation.
In [15]:
# loss function: a simple categorical crossentropy over the flattened predictions will do
# (maybe add some regularizer)
loss = lasagne.objectives.categorical_crossentropy(
    predicted_probabilities.reshape((-1, len(tokens))),
    targets.reshape((-1,))).mean()

updates = lasagne.updates.adam(loss, weights)
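If a regularizer is wanted, lasagne's weight penalties can be added on top of the crossentropy; the cell below is one possible sketch (the L2 coefficient 1e-5 is an arbitrary choice, not a tuned value).
In [ ]:
# optional: L2 penalty on all regularizable parameters of the network
l2_penalty = lasagne.regularization.regularize_network_params(training_loop,
                                                              lasagne.regularization.l2)
reg_loss = loss + 1e-5 * l2_penalty
# updates = lasagne.updates.adam(reg_loss, weights)  # use instead of the plain loss if desired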
In [20]:
#training
train_step = theano.function([sequence], loss,
                             updates=training_loop.get_automatic_updates() + updates)
In [21]:
n_steps = T.scalar(dtype='int32')

# at generation time the sampled token is fed back in as the next input
feedback_loop = Recurrence(
    state_variables={step.h_new: step.h_prev,
                     step.next_token: step.inp},
    tracked_outputs=[step.next_token_probas],
    batch_size=theano.shared(1),
    n_steps=n_steps,
    unroll_scan=False,
)
In [22]:
generated_tokens = get_output(feedback_loop[step.next_token])
In [23]:
generate_sample = theano.function([n_steps],generated_tokens,updates=feedback_loop.get_automatic_updates())
In [24]:
def generate_string(length=MAX_LEN):
    output_indices = generate_sample(length)[0]
    return ''.join(tokens[i] for i in output_indices)
In [25]:
generate_string()
Out[25]:
In [26]:
def sample_batch(data, batch_size):
    # pick batch_size random rows (with replacement)
    rows = data[np.random.randint(0, len(data), size=batch_size)]
    return rows
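Usage is straightforward: sample_batch just picks batch_size random rows of the padded id matrix, e.g.
In [ ]:
print(sample_batch(names_ix, 3).shape)  # -> (3, MAX_LEN)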
In [27]:
print("Training ...")
#total N iterations
n_epochs=100
# how many minibatches are there in the epoch
batches_per_epoch = 500
#how many training sequences are processed in a single function call
batch_size=10
for epoch in xrange(n_epochs):
avg_cost = 0;
for _ in range(batches_per_epoch):
avg_cost += train_step(sample_batch(names_ix,batch_size))
print("\n\nEpoch {} average loss = {}".format(epoch, avg_cost / batches_per_epoch))
print "Generated names"
for i in range(10):
print generate_string(),
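Once training looks good, it may be convenient to keep the learned parameters; the cell below is a minimal sketch using lasagne's helpers (the file name is an arbitrary choice).
In [ ]:
# save / restore all parameters of the recurrence
np.savez("char_rnn_names.npz", *lasagne.layers.get_all_param_values(training_loop))

# with np.load("char_rnn_names.npz") as f:
#     lasagne.layers.set_all_param_values(training_loop,
#                                         [f['arr_%d' % i] for i in range(len(f.files))])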