OVERVIEW
Deep LSTM language model trained on a large dataset of tweets by predicting, at each timestep, the next character of a string.
The input is a sequence of class labels representing the characters (cf. the Twitter preprocessing notebook), which the first layer transforms into dense embeddings. Variable-length sequences are masked via the zero index. The embeddings are then fed into five stacked LSTM layers. No regularization such as dropout or batch normalization is applied: the dataset contains over 420 million tweets, so overfitting is not a practical concern.
The final layer consists of one neuron per symbol of the alphabet, followed by a softmax activation. For every position i in the string, this yields a distribution over the character at position i+1 given the characters up to position i. We can sample from this distribution directly and thus generate novel tweets.
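Equivalently, the network models the probability of a whole tweet autoregressively via the chain rule (the start symbol $c_1$ is fixed, so $P(c_1) = 1$):

$P(c_1, \dots, c_T) = \prod_{i=1}^{T-1} P(c_{i+1} \mid c_1, \dots, c_i)$

where each factor on the right is read off the softmax output at position $i$.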
For future experiments it could be interesting to examine more closely whether the hidden representations produced by comparable symbol-level networks are useful in related tasks, either as standalone embeddings or within joint neural architectures. The embedding size is also chosen rather arbitrarily; various sizes should be evaluated against plain one-hot vectors (a sketch of that variant follows the model definition below), although I would not expect a large impact either way, as the current size is already on the large side given the alphabet size.
In [38]:
import linereader
import numpy as np
import random
import string
In [39]:
datafile = 'dataset-twitter/cleaned_tweets.txt' # File with 1 "document", i.e. tweet, per line.
n_lines = 420012443 # Set to -1 to recount.
# Endlessly yields random tweets from the data file, each prefixed with the
# start symbol '\t'. The first test_size lines are held out as a fixed test set.
class DataGenerator(object):
def __init__(self, f, test_size=512, cache_size=100000, n_lines=-1):
self.f = linereader.dopen(f)
if n_lines < 1:
self.n_lines = 0
for _ in self.f:
self.n_lines += 1
else:
self.n_lines = n_lines
assert self.n_lines > cache_size
self.cache = []
self.cache_size = cache_size
self.test_size = test_size
self.test_set = self.f.getlines(1, test_size)
self.test_set = map(lambda s: '\t' + s, self.test_set) # Prefix start symbol.
self.__cache()
def __del__(self):
self.f.close()
def __iter__(self):
return self
    def __cache(self):
        # Refill the cache with a random contiguous chunk of lines (skipping the
        # held-out test set), then shuffle it.
linenum = random.randint(self.test_size + 1, self.n_lines - self.cache_size)
lines = self.f.getlines(linenum, linenum + self.cache_size)
self.cache.extend(lines)
random.shuffle(self.cache)
def next(self):
if len(self.cache) == 0:
self.__cache()
return '\t' + self.cache.pop()
gen = DataGenerator(datafile, n_lines=n_lines)
In [41]:
tweet = gen.next()
print tweet
print len(gen.cache)
In [42]:
alphabet = set(string.printable) - set(string.ascii_uppercase) - set(string.whitespace) - set(['`'])
alphabet = list(alphabet) + [' ', '\n', '\t'] # Space, EOL ('\n'), and SOL ('\t').
alphabet.sort()
alphabet = ['ZEROVECTOR'] + alphabet # The zero vector is used for masking sequences.
print alphabet, len(alphabet)
index_to_char = alphabet
char_to_index = {}
for i in xrange(len(alphabet)):
char_to_index[alphabet[i]] = i
print char_to_index
In [43]:
# string -> one hot vectors
def encode_vectors(tweet):
arr = np.zeros((len(tweet), len(alphabet)), dtype='uint8')
for i in xrange(len(tweet)):
arr[i, char_to_index[tweet[i]]] = 1
return arr
# string -> vocabulary (=alphabet) indices
def encode_indices(tweet):
arr = np.zeros(len(tweet), dtype='uint8')
for i in xrange(len(tweet)):
arr[i] = char_to_index[tweet[i]]
return arr
# index -> char (the mask index 0 decodes to the empty string)
def _decode(idx):
if idx != 0:
return index_to_char[idx]
else:
return ''
# indices -> string
def decode_indices(arr):
return ''.join(map(_decode, arr))
# one hot vectors -> string
def decode_vectors(arr):
string = []
    for i in xrange(arr.shape[0]):
        idx = np.argmax(arr[i])
        if idx == 0:
            continue
        c = index_to_char[idx]
if c == '\n':
# We only use this function to visualize predictions during training.
# So we put a special symbol here to keep it optically aligned with the target.
c = 'Ç'
string.append(c)
return ''.join(string)
tweet = gen.next()
print tweet, len(tweet)
arr = encode_indices(tweet)
tweet = decode_indices(arr)
print tweet, len(tweet)
arr = encode_vectors(tweet)
tweet = decode_vectors(arr)
print tweet, len(tweet)
In [6]:
import keras
from keras.layers import Dense, Activation, LSTM, Embedding
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential, load_model
from keras.optimizers import RMSprop
import bisect
In [7]:
load = False
# It would probably not hurt to go even wider and maybe deeper, but training already takes quite a while
# and we are not trying to beat state-of-the-art results on prominent datasets here.
n_lstm_cells = 512
# Chosen fairly arbitrarily. Should be investigated more closely, although I suspect it is rather on the (too?) large side.
embedding_size = 32
max_tweet_len = 162 # longest tweet is 161 + 1 for start symbol
if load:
model = load_model("filename")
else:
model = Sequential()
model.add(Embedding(len(alphabet), embedding_size, mask_zero=True))
    n_lstm_layers = 5  # Depth parameterized (resolving the earlier TODO).
    for _ in xrange(n_lstm_layers):
        model.add(LSTM(n_lstm_cells, return_sequences=True))
# TimeDistributed as we want to predict the next char at each step.
model.add(TimeDistributed(Dense(len(alphabet))))
model.add(Activation('softmax'))
opt = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
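As an aside, the one-hot baseline mentioned in the overview could look roughly as follows. This is only an illustrative sketch (onehot_model is a hypothetical name; this variant was not trained here): the Embedding layer is swapped for a Masking layer over the one-hot output of encode_vectors, where all-zero padding rows act as the mask, and the rest of the stack stays identical.
from keras.layers import Masking
# Hypothetical one-hot variant; inputs would come from encode_vectors() instead of encode_indices().
onehot_model = Sequential()
onehot_model.add(Masking(mask_value=0.0, input_shape=(None, len(alphabet))))
for _ in xrange(5):  # Same depth as the model above.
    onehot_model.add(LSTM(n_lstm_cells, return_sequences=True))
onehot_model.add(TimeDistributed(Dense(len(alphabet))))
onehot_model.add(Activation('softmax'))
onehot_model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])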
In [44]:
batch_size = 32
def make_batch(gen=gen, batch_size=batch_size, sample_len=max_tweet_len):
xs = np.zeros((batch_size, sample_len), dtype='uint8')
ys = np.zeros((batch_size, sample_len, len(alphabet)), dtype='uint8')
for i in xrange(batch_size):
tweet = gen.next()
length = len(tweet) - 1
xs[i,:length] = encode_indices(tweet[:-1])
ys[i,:length] = encode_vectors(tweet[1:])
return xs,ys
def make_testset(s=gen.test_set, sample_len=max_tweet_len):
ns = len(s)
xs = np.zeros((ns, sample_len), dtype='uint8')
ys = np.zeros((ns, sample_len, len(alphabet)), dtype='uint8')
for i in xrange(ns):
tweet = s[i]
# TODO: refactor common code
length = len(tweet) - 1
xs[i,:length] = encode_indices(tweet[:-1])
ys[i,:length] = encode_vectors(tweet[1:])
###
return xs,ys
bx,by = make_batch(batch_size=1)
print bx
bx = bx[0]
by = by[0]
dx = decode_indices(bx)
dy = decode_vectors(by)
print len(dx), len(dy)
print dx
print dy
test_set = make_testset()
print test_set[0].shape
print test_set[1].shape
print decode_indices(test_set[0][0])
print decode_vectors(test_set[1][0])
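To make the one-step shift in make_batch explicit, here is a toy illustration (my own example string, not from the dataset): the input at step i is character i, and the target at step i is character i+1.
toy = '\tab\n'                    # Hypothetical "tweet": start symbol, two chars, EOL.
x_toy = encode_indices(toy[:-1])  # Inputs:  '\t', 'a', 'b'
y_toy = encode_vectors(toy[1:])   # Targets: 'a', 'b', '\n' (one-hot)
print x_toy.shape, y_toy.shape    # -> (3,) (3, len(alphabet))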
In [25]:
output_every_n = 500 # Give status update and eval on test data every n batches.
n_batches = 20000
n_examples = 4
# Note that I do not keep track of total seen samples in the code.
# The model producing the following output actually trained on 30k batches of 32 samples each, i.e. 960k samples.
mu_acc_train = 0.0
for i in xrange(n_batches):
xs,ys = make_batch()
_,acc_train = model.train_on_batch(xs, ys)
mu_acc_train += acc_train
if (i+1) % output_every_n == 0:
print "Batch %d of %d" % (i+1, n_batches)
_,acc_test = model.evaluate(test_set[0], test_set[1], batch_size=batch_size)
        # Take some random examples from the test set and show the net output.
        # At each prediction step the net sees the string only up to that point.
        # As expected, predicting the start of a new word is much harder than
        # completing the remainder of one.
print "Example outputs:"
        examples = np.zeros((n_examples, max_tweet_len), dtype='uint8')
        for j in xrange(n_examples):  # j, so the outer batch counter i is not shadowed.
            example = random.randint(0, test_set[0].shape[0] - 1)
            examples[j] = test_set[0][example]
        preds = model.predict(examples, batch_size=n_examples)
        for j in xrange(n_examples):
            print 'Target:     ' + decode_indices(examples[j])[1:]
            print 'Prediction: ' + decode_vectors(preds[j])
print 'Train accuracy: %.3f' % (mu_acc_train / (1. * output_every_n))
print 'Test accuracy: %.3f' % (acc_test)
print
mu_acc_train = 0.0
It should be noted that accuracy on Twitter data should not be compared to accuracies on datasets of more structured or domain-specific text. Since tweets exhibit many unusual characteristics, e.g. tags, URLs, misspellings, noise (such as non-English content), smileys, and colloquial language, one can expect a significantly lower result. Many of these (and other) errors are also irrelevant from a generation perspective, e.g. picking the wrong one of several roughly equally plausible follow-ups.
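For cross-corpus comparisons, bits-per-character would be the more standard figure. A minimal sketch, assuming the loss returned by evaluate() is the mean per-character cross-entropy in nats:
# Rough bits-per-character estimate on the test set (sketch; see the assumption above).
loss_test, acc_test = model.evaluate(test_set[0], test_set[1], batch_size=batch_size)
print 'Test BPC: %.3f' % (loss_test / np.log(2))  # nats -> bits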
In [47]:
text_len = 500 # Upper bound; the net should emit the end token much earlier, as the longest training sequence has length 162 (incl. start symbol).
n_samples = 5
# Draws a random sample from the net's softmax output and returns the corresponding char.
# beta can, in analogy to statistical mechanics, be seen as an inverse temperature:
# the output distribution is reweighted as p_i -> p_i**beta / sum_j p_j**beta.
# Setting it higher (> 1.0), i.e. lower temperature/entropy, tilts the sampling towards the more likely options.
# Setting it lower (< 1.0), i.e. higher entropy, gives relatively more weight to less likely options.
# The theoretical extremes are beta -> inf, which reduces to argmax (greedy decoding: always pick the most
# likely next character), and beta = 0, which samples each character uniformly at random.
def tochar_prob(output, beta):
    assert beta >= 0
if beta != 1.0:
output = output**beta
output /= sum(output)
summed = np.cumsum(output)
i = bisect.bisect(summed, random.random())
if i == 0:
        return 'Ž' # Sentinel so we can spot it in the output; the net should never sample the mask index.
return index_to_char[i]
# Sample tweets by feeding the net output back into itself.
# The net can also be seeded with an arbitrary string and continue from there.
# TODO: Support batch processing.
def sample_tweet(length, seed='\t', model=model, beta=1.5):
assert length - len(seed) > 0
seq = np.zeros((1, length), dtype='uint8')
    for i, char in enumerate(seed):
        seq[:,i] = char_to_index[char]
string = list(seed)
for i in xrange(len(seed), length):
c = model.predict(seq[:,:i])[0,-1]
c = tochar_prob(c, beta=beta)
if c == '\n':
break
string.append(c)
seq[0,i] = char_to_index[c]
return ''.join(string).strip()
# 5 general samples.
for i in xrange(n_samples):
print sample_tweet(text_len)
print
# 5 samples given a seed.
for i in xrange(n_samples):
print sample_tweet(text_len, "\tyo yo yo")
print
# Feel free to experiment with different betas.
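# A small sweep over beta (values chosen ad hoc) shows how the temperature shapes
# the samples: lower beta drifts towards noise, higher beta towards repetitive,
# "safe" strings.
for beta in [0.5, 1.0, 1.5, 2.0]:
    print 'beta = %.1f:' % beta
    print sample_tweet(text_len, beta=beta)
    print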
In [48]:
model.save('model-charlevel-twitter/32emb-5x512lstm-1M.h5')