This code implements a Recurrent Neural Network with LSTM/RNN units for training and sampling from character-level language models. In other words, the model takes a text file as input and trains an RNN that learns to predict the next character in a sequence.
The RNN can then be used to generate text, character by character, that looks like the original training data.
This code is based on this blog, and is a step-by-step implementation of the character-level model.
In [1]:
import tensorflow as tf
import time
In [2]:
print('TensorFlow version:', tf.__version__)
In [3]:
import codecs
import os
import collections
from six.moves import cPickle
import numpy as np
class TextLoader():
    def __init__(self, data_dir, batch_size, seq_length, encoding='utf-8'):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.encoding = encoding

        input_file = os.path.join(data_dir, "input.txt")
        vocab_file = os.path.join(data_dir, "vocab.pkl")
        tensor_file = os.path.join(data_dir, "data.npy")

        if not (os.path.exists(vocab_file) and os.path.exists(tensor_file)):
            print("reading text file")
            self.preprocess(input_file, vocab_file, tensor_file)
        else:
            print("loading preprocessed files")
            self.load_preprocessed(vocab_file, tensor_file)
        self.create_batches()
        self.reset_batch_pointer()

    def preprocess(self, input_file, vocab_file, tensor_file):
        with codecs.open(input_file, "r", encoding=self.encoding) as f:
            data = f.read()
        counter = collections.Counter(data)
        count_pairs = sorted(counter.items(), key=lambda x: -x[1])
        self.chars, _ = zip(*count_pairs)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        with open(vocab_file, 'wb') as f:
            cPickle.dump(self.chars, f)
        self.tensor = np.array(list(map(self.vocab.get, data)))
        np.save(tensor_file, self.tensor)

    def load_preprocessed(self, vocab_file, tensor_file):
        with open(vocab_file, 'rb') as f:
            self.chars = cPickle.load(f)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        self.tensor = np.load(tensor_file)
        self.num_batches = int(self.tensor.size / (self.batch_size * self.seq_length))

    def create_batches(self):
        self.num_batches = int(self.tensor.size / (self.batch_size * self.seq_length))
        # When the data (tensor) is too small, give a clearer error message
        if self.num_batches == 0:
            assert False, "Not enough data. Make seq_length and batch_size smaller."
        self.tensor = self.tensor[:self.num_batches * self.batch_size * self.seq_length]
        xdata = self.tensor
        ydata = np.copy(self.tensor)
        ydata[:-1] = xdata[1:]
        ydata[-1] = xdata[0]
        self.x_batches = np.split(xdata.reshape(self.batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(ydata.reshape(self.batch_size, -1), self.num_batches, 1)

    def next_batch(self):
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

    def reset_batch_pointer(self):
        self.pointer = 0
What are a batch, num_batches, batch_size and seq_length in the character-level example?
Let's assume the input is 'here is an example'; the sketch below walks through how the batches are built from it.
So, what are our actual parameters?
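For intuition, here is a small standalone sketch that mimics what TextLoader does with 'here is an example'. The batch_size and seq_length values are hypothetical, chosen so the numbers work out for this 18-character string, and the vocabulary is built with a simple sorted set rather than the frequency-sorted Counter used above:

import numpy as np

data = 'here is an example'
chars = sorted(set(data))                        # unique characters (simplified ordering)
vocab = {c: i for i, c in enumerate(chars)}      # char -> integer id
tensor = np.array([vocab[c] for c in data])      # the whole text as ids

batch_size, seq_length = 3, 2                    # hypothetical small values
num_batches = tensor.size // (batch_size * seq_length)   # 18 // 6 = 3
tensor = tensor[:num_batches * batch_size * seq_length]

xdata = tensor
ydata = np.copy(tensor)
ydata[:-1] = xdata[1:]                           # y is x shifted one character to the left
ydata[-1] = xdata[0]

x_batches = np.split(xdata.reshape(batch_size, -1), num_batches, 1)
y_batches = np.split(ydata.reshape(batch_size, -1), num_batches, 1)
print(x_batches[0].shape)                        # (3, 2): batch_size x seq_length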
In [4]:
batch_size = 60 # minibatch size, i.e. number of sequences processed together in one training step
seq_length = 50 #RNN sequence length
num_epochs = 25 # change it to 50 if you want to see relatively good results
learning_rate = 0.002
decay_rate = 0.97
rnn_size = 128 #size of RNN hidden state
num_layers = 2 #number of layers in the RNN
In [5]:
!mkdir -p ../../data/character_model
!wget -nv -O ../../data/character_model/input.txt https://ibm.box.com/shared/static/a3f9e9mbpup09toq35ut7ke3l3lf03hg.txt
In [6]:
data_loader = TextLoader('../../data/character_model/', batch_size, seq_length)
vocab_size = data_loader.vocab_size
data_loader.vocab_size
Out[6]:
In [7]:
data_loader.num_batches
Out[7]:
In [8]:
x,y = data_loader.next_batch()
In [9]:
x
Out[9]:
In [10]:
x.shape #batch_size=60, seq_length=50
Out[10]:
In [11]:
y
Out[11]:
In [12]:
print('Vocabulary size:', data_loader.vocab_size)
In [13]:
print(", ".join(sorted(list(data_loader.chars))))
In [14]:
data_loader.vocab['t']
Out[14]:
BasicRNNCell is the most basic RNN cell.
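For reference, BasicRNNCell implements the vanilla recurrence output = new_state = tanh(W*input + U*state + b). Below is a rough numpy sketch of one step with dummy shapes matching this notebook; it is an illustration of the recurrence, not the TensorFlow implementation:

import numpy as np

def basic_rnn_step(x_t, h_prev, W, U, b):
    # x_t: [batch, input_dim], h_prev: [batch, rnn_size]
    # for a basic RNN cell, the output and the new state are the same tensor
    h_new = np.tanh(x_t @ W + h_prev @ U + b)
    return h_new, h_new

rng = np.random.RandomState(0)
x_t = rng.randn(60, 128)            # one time step of embedded input
h_prev = np.zeros((60, 128))        # zero initial state
W, U, b = rng.randn(128, 128), rng.randn(128, 128), np.zeros(128)
output, state = basic_rnn_step(x_t, h_prev, W, U, b)
print(output.shape)                 # (60, 128)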
In [15]:
# a two layer cell
with tf.variable_scope('multi_rnn_cell'):
    stacked_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicRNNCell(rnn_size) for _ in range(num_layers)])
In [16]:
# hidden state size
stacked_cell.output_size
Out[16]:
In [17]:
stacked_cell.state_size
Out[17]:
In [18]:
input_data = tf.placeholder(tf.int32, [batch_size, seq_length])# a 60x50
targets = tf.placeholder(tf.int32, [batch_size, seq_length]) # a 60x50
The memory state of the network is initialized with a vector of zeros and gets updated after reading each character.
BasicRNNCell.zero_state(batch_size, dtype) Return zero-filled state tensor(s).
Args:
batch_size: int, float, or unit Tensor representing the batch size.
dtype: the data type to use for the state.
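For the two-layer stack built above, zero_state(batch_size, tf.float32) amounts to a tuple with one zero matrix of shape [batch_size x rnn_size] per layer. A plain numpy sketch of that structure:

import numpy as np
# What the zero state of the 2-layer stacked BasicRNNCell looks like (sketch)
zero_state = tuple(np.zeros((60, 128), dtype=np.float32) for _ in range(2))
print([s.shape for s in zero_state])  # [(60, 128), (60, 128)]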
In [19]:
initial_state = stacked_cell.zero_state(batch_size, tf.float32)  # a tuple of num_layers zero tensors, each batch_size x rnn_size (60x128)
In [20]:
input_data
Out[20]:
In [21]:
session = tf.Session()
In [22]:
feed_dict={input_data:x, targets:y}
In [23]:
session.run(input_data, feed_dict)
Out[23]:
In [24]:
with tf.variable_scope('rnnlm', reuse=False):
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])  # 128x65
    softmax_b = tf.get_variable("softmax_b", [vocab_size])  # 1x65
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])  # 65x128
        # input_data is a 60x50 matrix and embedding is a 65x128 lookup table with one row per character
        # embedding_lookup goes through each row of input_data and, for each character id, picks the corresponding row of embedding
        # it creates a 60x50x128 tensor
        # so the first element of em is a 50x128 matrix, where each row is the vector representing that character
        em = tf.nn.embedding_lookup(embedding, input_data)  # em is 60x50x128
        # split: splits a tensor into sub-tensors
        # syntax: tf.split(value, num_or_size_splits, axis, name='split')
        # it splits the 60x50x128 tensor into 50 tensors of shape 60x1x128
        inputs = tf.split(em, seq_length, 1)
        # squeeze converts the list into 50 matrices of shape 60x128
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
In [25]:
session.run(tf.global_variables_initializer())
session.run(embedding)
Out[25]:
In [26]:
em = tf.nn.embedding_lookup(embedding, input_data)
em
Out[26]:
In [27]:
emp = session.run(em,feed_dict={input_data:x})
emp.shape
Out[27]:
In [28]:
emp[0]
Out[28]:
In [29]:
inputs = tf.split(em, seq_length, 1)
inputs[0:5]
Out[29]:
In [30]:
inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
inputs[0:5]
Out[30]:
The parallelism is only for efficiency. Each character in a batch is handled in parallel, but the network sees one character of a sequence at a time and does the computations accordingly. All the computations involving the characters of all sequences in a batch at a given time step are done in parallel.
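Conceptually, the decoder used in the next cell just loops over the 50 per-time-step tensors, feeding each one to the stacked cell together with the state from the previous step. The sketch below is a simplification of what tf.contrib.legacy_seq2seq.rnn_decoder does when loop_function is None (variable scoping omitted):

# Simplified sketch of the decoder loop (not the library implementation)
def simple_rnn_decoder(inputs, initial_state, cell):
    state = initial_state
    outputs = []
    for inp in inputs:                    # inp is one [batch_size x rnn_size] tensor, i.e. one time step
        output, state = cell(inp, state)  # all sequences in the batch advance one character together
        outputs.append(output)
    return outputs, state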
In [31]:
session.run(inputs[0],feed_dict={input_data:x})
Out[31]:
In [32]:
stacked_cell.state_size
Out[32]:
In [33]:
#outputs is 50x[60*128]
outputs, last_state = tf.contrib.legacy_seq2seq.rnn_decoder(inputs, initial_state, stacked_cell, loop_function=None, scope='rnnlm')
In [34]:
outputs[0:5]
Out[34]:
In [35]:
test = outputs[0]
test
Out[35]:
In [36]:
session.run(tf.global_variables_initializer())
session.run(test,feed_dict={input_data:x})
Out[36]:
outputs is a list of 50 tensors of shape [60x128]. We concatenate them and reshape to [3000x128], one row per character position (batch_size x seq_length = 3000 rows). Then we can calculate the softmax:
softmax_w is [rnn_size, vocab_size], i.e. [128x65]
logits = [3000x128] x [128x65] + [1x65] = [3000x65]
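A quick shape check with dummy numpy arrays (a sketch mirroring the graph operations below, shapes only):

import numpy as np

outputs_np = [np.zeros((60, 128)) for _ in range(50)]            # 50 tensors of [60x128]
output_np = np.concatenate(outputs_np, axis=1).reshape(-1, 128)  # [3000x128]
softmax_w_np = np.zeros((128, 65))
softmax_b_np = np.zeros(65)
logits_np = output_np @ softmax_w_np + softmax_b_np              # [3000x65]
print(output_np.shape, logits_np.shape)                          # (3000, 128) (3000, 65)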
In [37]:
output = tf.reshape(tf.concat(outputs, 1), [-1, rnn_size])
output
Out[37]:
In [38]:
logits = tf.matmul(output, softmax_w) + softmax_b
logits
Out[38]:
In [39]:
probs = tf.nn.softmax(logits)
probs
Out[39]:
In [40]:
session.run(tf.global_variables_initializer())
session.run(probs,feed_dict={input_data:x})
Out[40]:
In [41]:
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits],
[tf.reshape(targets, [-1])],
[tf.ones([batch_size * seq_length])],
vocab_size)
In [42]:
cost = tf.reduce_sum(loss) / batch_size / seq_length
cost
Out[42]:
In [43]:
final_state = last_state
final_state
Out[43]:
In [44]:
lr = tf.Variable(0.0, trainable=False)
In [45]:
grad_clip = 5.
tvars = tf.trainable_variables()
In [46]:
tvars
Out[46]:
In [47]:
session.run(tf.global_variables_initializer())
[v.name for v in tf.global_variables()]
Out[47]:
In [48]:
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
grads
Out[48]:
In [49]:
session.run(grads, feed_dict)[0]
Out[49]:
In [50]:
optimizer = tf.train.AdamOptimizer(lr)
train_op = optimizer.apply_gradients(zip(grads, tvars))
In [51]:
class LSTMModel():
    def __init__(self, sample=False):
        rnn_size = 128   # size of RNN hidden state vector
        batch_size = 60  # minibatch size, i.e. number of sequences per training step
        seq_length = 50  # RNN sequence length
        num_layers = 2   # number of layers in the RNN
        vocab_size = 65
        grad_clip = 5.
        if sample:
            print("sample mode")
            batch_size = 1
            seq_length = 1

        # model.stacked_cell.state_size is (128, 128)
        with tf.variable_scope('lstm_model_cell'):
            reuse = tf.get_variable_scope().reuse
            self.stacked_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicRNNCell(rnn_size, reuse=reuse)
                                                             for _ in range(num_layers)])

        self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
        # Initial state of the network's memory.
        # It is initialized with a vector of zeros and gets updated after reading each character.
        self.initial_state = self.stacked_cell.zero_state(batch_size, tf.float32)

        with tf.variable_scope('rnnlm_class1'):
            softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])  # 128x65
            softmax_b = tf.get_variable("softmax_b", [vocab_size])  # 1x65
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [vocab_size, rnn_size])  # 65x128
                inputs = tf.split(tf.nn.embedding_lookup(embedding, self.input_data), seq_length, 1)
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        # The value of state is updated after processing each batch of characters.
        outputs, last_state = tf.contrib.legacy_seq2seq.rnn_decoder(inputs, self.initial_state, self.stacked_cell, loop_function=None, scope='rnnlm_class1')
        output = tf.reshape(tf.concat(outputs, 1), [-1, rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([self.logits],
                                                                  [tf.reshape(self.targets, [-1])],
                                                                  [tf.ones([batch_size * seq_length])],
                                                                  vocab_size)
        self.cost = tf.reduce_sum(loss) / batch_size / seq_length
        self.final_state = last_state
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    def sample(self, sess, chars, vocab, num=200, prime='The ', sampling_type=1):
        state = sess.run(self.stacked_cell.zero_state(1, tf.float32))
        # feed the priming text through the network, character by character, to warm up the state
        for char in prime[:-1]:
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed = {self.input_data: x, self.initial_state: state}
            [state] = sess.run([self.final_state], feed)

        def weighted_pick(weights):
            # sample an index in proportion to the given probabilities
            t = np.cumsum(weights)
            s = np.sum(weights)
            return int(np.searchsorted(t, np.random.rand(1) * s))

        ret = prime
        char = prime[-1]
        for n in range(num):
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed = {self.input_data: x, self.initial_state: state}
            [probs, state] = sess.run([self.probs, self.final_state], feed)
            p = probs[0]
            if sampling_type == 0:
                sample = np.argmax(p)
            elif sampling_type == 2:
                if char == ' ':
                    sample = weighted_pick(p)
                else:
                    sample = np.argmax(p)
            else:  # sampling_type == 1, the default
                sample = weighted_pick(p)
            pred = chars[sample]
            ret += pred
            char = pred
        return ret
The input is always a matrix of shape [n x m], where n is the batch size and m is the feature size. In our case each batch fed to input_data is [60 x 50], and after the embedding lookup each time step presents a [60 x 128] matrix to the cell.
The data size is 1,113,000 characters, the number of batches is 371, the batch size is 60 and the sequence length is 50, so 50 x 60 x 371 = 1,113,000.
We train for 50 epochs; each input matrix (batch) is used once per epoch and produces one parameter update, so each epoch performs 371 updates.
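A quick sanity check of that arithmetic:

# Verify the batch arithmetic stated above
batch_size, seq_length, num_batches = 60, 50, 371
print(batch_size * seq_length * num_batches)   # 1113000, the number of characters used
print(1113000 // (batch_size * seq_length))    # 371 batches, i.e. updates per epoch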
In [52]:
with tf.variable_scope("rnn"):
model = LSTMModel()
In [53]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
e=1
sess.run(tf.assign(model.lr, learning_rate * (decay_rate ** e)))
data_loader.reset_batch_pointer()
state = sess.run(model.initial_state)
state
Out[53]:
In [54]:
x, y = data_loader.next_batch()
feed = {model.input_data: x, model.targets: y, model.initial_state:state}
In [55]:
train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
train_loss
Out[55]:
In [56]:
state
Out[56]:
In [57]:
initial_lr = 0.01
num_epochs = 50
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for e in range(num_epochs):
        # decay the learning rate exponentially with the epoch number
        current_lr = initial_lr * (decay_rate ** e)
        sess.run(tf.assign(model.lr, current_lr))
        print('Epoch {} ({} / {} batches, lr={:.4f})'.format(
            e + 1,
            (e + 1) * data_loader.num_batches,
            num_epochs * data_loader.num_batches,
            current_lr
        ))
        data_loader.reset_batch_pointer()
        state = sess.run(model.initial_state)  # (2 x [60x128])
        for b in range(data_loader.num_batches):  # for each batch
            start = time.time()
            x, y = data_loader.next_batch()
            feed = {model.input_data: x, model.targets: y, model.initial_state: state}
            train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
            end = time.time()
            print("Train_loss={:.3f} Time/Batch={:.3f} ms".format(
                train_loss,
                (end - start) * 1000
            ))
        print()
        # model.sample(sess, data_loader.chars, data_loader.vocab, num=200, prime='The ', sampling_type=1)
In [ ]:
sess = tf.InteractiveSession()
with tf.variable_scope("sample_test"):
sess.run(tf.global_variables_initializer())
m = LSTMModel(sample=True)
In [ ]:
prime = 'The '
num = 200
sampling_type = 1
vocab = data_loader.vocab
chars = data_loader.chars
In [ ]:
sess.run(m.initial_state)
In [ ]:
# print state
sess.run(tf.global_variables_initializer())
state = sess.run(m.initial_state)
for char in prime[:-1]:
    x = np.zeros((1, 1))
    x[0, 0] = vocab[char]
    feed = {m.input_data: x, m.initial_state: state}
    [state] = sess.run([m.final_state], feed)
In [ ]:
state
In [ ]:
def weighted_pick(weights):
    t = np.cumsum(weights)
    s = np.sum(weights)
    return int(np.searchsorted(t, np.random.rand(1) * s))

ret = prime
char = prime[-1]
for n in range(num):
    x = np.zeros((1, 1))
    x[0, 0] = vocab[char]
    feed = {m.input_data: x, m.initial_state: state}
    [probs, state] = sess.run([m.probs, m.final_state], feed)
    p = probs[0]
    if sampling_type == 0:
        sample = np.argmax(p)
    elif sampling_type == 2:
        if char == ' ':
            sample = weighted_pick(p)
        else:
            sample = np.argmax(p)
    else:  # sampling_type == 1, the default
        sample = weighted_pick(p)
    pred = chars[sample]
    ret += pred
    char = pred
In [ ]:
ret
In [66]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
state=sess.run(m.initial_state)
m.sample(sess, data_loader.chars , data_loader.vocab, num=200, prime='The ', sampling_type=1)
Out[66]:
In [ ]: