In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
# load the dataset
import os
with open('sharespeare-kaparthy.txt') as f:
    raw = f.read()
print raw[:1000] # print the first 1000 characters
In [3]:
# convert raw data into indices
import numpy as np
from pystacks.utils.text.vocab import Vocab
vocab = Vocab(unk=False)
data = np.array([vocab.add(c) for c in raw])
print data[:100] # print the indices of the first 100 characters
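The Vocab object is used here purely as a character-to-index mapping. For readers without pystacks, a minimal stand-in exposing the same interface this notebook relies on (add, indexing by character, len, index2word) might look like the sketch below; it is an illustration, not pystacks' implementation.
In [ ]:
# minimal stand-in for the character vocabulary (illustration only, not pystacks' Vocab)
class CharVocab(object):
    def __init__(self):
        self.word2index = {}
        self.index2word = []
    def add(self, c):
        # insert c if unseen, then return its integer index
        if c not in self.word2index:
            self.word2index[c] = len(self.index2word)
            self.index2word.append(c)
        return self.word2index[c]
    def __getitem__(self, c):
        return self.word2index[c]
    def __len__(self):
        return len(self.index2word)
toy = CharVocab()
print [toy.add(c) for c in 'hello'] # [0, 1, 2, 2, 3]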
In [4]:
def get_batch(ngrams=10, batch_size=100):
    # sample batch_size random windows of length ngrams; the target Y is X shifted forward by one character (next-character prediction)
    X, Y = [], []
    for i in np.random.randint(len(data)-ngrams-1, size=batch_size):
        x = data[i:i+ngrams]
        y = data[i+1:i+ngrams+1]
        X.append(x)
        Y.append(y)
    X = np.array(X, dtype='int32').reshape(batch_size, ngrams, 1)
    Y = np.array(Y, dtype='int32')
    return X, Y
X, Y = get_batch()
print X.shape # dim0 is batch size, dim1 is time steps, dim2 is feature size
print Y.shape # dim0 is batch size, dim1 is time steps
def one_hot(y):
    # convert a (batch, time) matrix of indices into a (batch, time, vocab) one-hot tensor
    Y = np.zeros(list(y.shape) + [len(vocab)])
    for batch in xrange(Y.shape[0]):
        for time in xrange(Y.shape[1]):
            Y[batch, time, y[batch, time]] = 1
    return Y.astype('float32')
print one_hot(Y).shape
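As a quick sanity check on the target encoding, the same one-hot conversion applied to a toy index matrix with an assumed vocabulary of size 4 (sizes chosen for readability only):
In [ ]:
# toy illustration of the one-hot target encoding (vocabulary size 4 assumed)
toy_y = np.array([[0, 2, 3],
                  [1, 1, 0]], dtype='int32') # (batch=2, time=3) character indices
toy_Y = np.zeros(list(toy_y.shape) + [4], dtype='float32')
for b in xrange(toy_y.shape[0]):
    for t in xrange(toy_y.shape[1]):
        toy_Y[b, t, toy_y[b, t]] = 1
print toy_Y.shape # (2, 3, 4)
print toy_Y[0, 1] # index 2 is the hot entry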
In [5]:
# make a character model
from theano import tensor as T, function
from pystacks.layers.container import Recurrent
from pystacks.layers.memory import GRUMemoryLayer, LSTMMemoryLayer
from pystacks.layers.lookup import LookupTable
from pystacks.layers.common import LinearLayer, Tanh, Softmax, Dropout
from pystacks.criteria import cross_entropy_loss
from pystacks.transformer import UnitNorm
emb_size = 20
h1_size = 500
h2_size = 500
net = Recurrent([
    LookupTable(len(vocab), emb_size, E_transform=UnitNorm()),
    LSTMMemoryLayer(emb_size, h1_size),
    LSTMMemoryLayer(h1_size, h2_size),
    LinearLayer(h2_size, len(vocab)),
    Softmax()])
sym_X = T.itensor3()
sym_prob = net.forward(sym_X, return_sequence=True, truncate_grad=50) # emit a distribution at every time step; truncate backprop through time at 50 steps
sym_pred = sym_prob.argmax(axis=-1) # most likely character at each time step
f_pred = function([sym_X], [sym_prob, sym_pred])
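For intuition about what each LSTMMemoryLayer computes per time step, below is one step of a standard LSTM cell written out in numpy. This is a generic sketch under the usual gate definitions; the weight layout and names are assumptions, not pystacks' internals.
In [ ]:
# one step of a standard LSTM cell (illustrative sketch, not pystacks internals)
def sigmoid(z):
    return 1. / (1. + np.exp(-z))
def lstm_step(x, h_prev, c_prev, W, U, b):
    # W: (4*hidden, input), U: (4*hidden, hidden), b: (4*hidden,)
    z = W.dot(x) + U.dot(h_prev) + b
    H = h_prev.shape[0]
    i = sigmoid(z[0*H:1*H]) # input gate
    f = sigmoid(z[1*H:2*H]) # forget gate
    o = sigmoid(z[2*H:3*H]) # output gate
    g = np.tanh(z[3*H:4*H]) # candidate cell update
    c = f * c_prev + i * g  # new cell state
    h = o * np.tanh(c)      # new hidden state
    return h, c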
In [6]:
original_weights = {name: param.var.get_value() for name, param in net.params.items()} # snapshot the initial parameters
def reset_weights():
    # restore the snapshot so training can be restarted from scratch
    for name, param in net.params.items():
        param.var.set_value(original_weights[name])
In [ ]:
prob, pred = f_pred(X)
print prob.shape # (batch_size, time_step, probabilities)
print pred.shape # (batch_size, time_steps); the predicted class index at each time step
In [ ]:
from pystacks.optimizer import RMSProp
from pystacks.gradient_transformer import ClipGradientNorm
optimizer = RMSProp()
sym_Y = T.ftensor3()
sym_loss = cross_entropy_loss(sym_prob, sym_Y)
sym_acc = T.mean(T.eq(sym_pred, sym_Y.argmax(-1)))
sym_lr = T.fscalar()
updates = net.grad_updates(sym_loss, lr=sym_lr, optimizer=optimizer, default_grad_transformer=ClipGradientNorm(20.))
train = function([sym_X, sym_Y, sym_lr], [sym_loss, sym_acc], updates=updates)
test = function([sym_X, sym_Y], [sym_loss, sym_acc])
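The objective is the average per-character cross-entropy between the predicted distribution and the one-hot target, and gradients are rescaled when their norm exceeds 20. The numpy sketch below illustrates both ideas; it is not the pystacks implementation of cross_entropy_loss or ClipGradientNorm.
In [ ]:
# illustrative numpy versions of the loss and the gradient clipping used above
def sequence_cross_entropy(prob, onehot_targets, eps=1e-8):
    # prob, onehot_targets: (batch, time, vocab); mean negative log-likelihood of the target character
    return -np.mean(np.sum(onehot_targets * np.log(prob + eps), axis=-1))
def clip_by_norm(grad, max_norm=20.):
    # rescale the gradient when its L2 norm exceeds max_norm (the idea behind ClipGradientNorm(20.))
    norm = np.sqrt(np.sum(grad ** 2))
    return grad * (max_norm / norm) if norm > max_norm else grad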
In [ ]:
ngrams = 100
batch_size = 64
num_batches = 5000
print_every = 100
decay_rate = 1e-3
lr = 1e-2
from time import time
reset_weights()
start = time()
for i in xrange(num_batches):
    X, Y = get_batch(ngrams, batch_size)
    loss, acc = train(X, one_hot(Y), lr)
    lr *= 1. / (1. + decay_rate) # multiplicative learning-rate decay
    if i % print_every == 0:
        print 'iteration', i, 'loss', loss, 'acc', acc, 'elapsed', time() - start
        start = time()
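With this decay the learning rate after i updates is lr_0 / (1 + decay_rate)^i, so over the 5000 batches it falls from 1e-2 to roughly 6.8e-5:
In [ ]:
# effective learning-rate schedule implied by the multiplicative decay above
print 1e-2 / (1. + 1e-3) ** 5000 # final learning rate after 5000 updates, roughly 6.8e-05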
In [ ]:
chars = ['I']
for i in xrange(1000):
    in_ind = [vocab[c] for c in chars]
    # feed at most the last ngrams characters as a (1, time, 1) int batch
    prob, pred = f_pred(np.array(in_ind[-ngrams:], dtype='int32').reshape(1, -1, 1))
    # sample the next character from the model's distribution at the last time step
    char = np.random.choice(vocab.index2word, p=prob[0, -1].flatten())
    chars.append(char)
print ''.join(chars)
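A common variant of the sampling loop above is to rescale the distribution with a temperature before each draw (lower temperature gives more conservative text). A sketch of that idea, reusing prob and vocab from the cells above; the temperature parameter is an addition for illustration, not something the model requires.
In [ ]:
# temperature-scaled sampling (illustrative variant of the loop above)
def sample_with_temperature(p, temperature=0.8):
    # raise the distribution to the power 1/temperature and renormalize:
    # temperature < 1 sharpens the distribution, > 1 flattens it
    logits = np.log(p + 1e-8) / temperature
    p_T = np.exp(logits - logits.max())
    p_T /= p_T.sum()
    return np.random.choice(vocab.index2word, p=p_T)
print sample_with_temperature(prob[0, -1].flatten(), temperature=0.8)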
In [ ]: