This notebook implements a Recurrent Neural Network (RNN/LSTM) for training and sampling from character-level language models. In other words, the model takes a text file as input and trains an RNN that learns to predict the next character in a sequence.
The trained RNN can then generate text character by character that resembles the original training data.
This code is based on this blog, and is a step-by-step implementation of the character-level model described there.
In [1]:
    
import tensorflow as tf
import time
    
In [2]:
    
print('TensorFlow version:', tf.__version__)
    
    
In [3]:
    
import codecs
import os
import collections
from six.moves import cPickle
import numpy as np
class TextLoader():
    def __init__(self, data_dir, batch_size, seq_length, encoding='utf-8'):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.encoding = encoding
        input_file = os.path.join(data_dir, "input.txt")
        vocab_file = os.path.join(data_dir, "vocab.pkl")
        tensor_file = os.path.join(data_dir, "data.npy")
        if not (os.path.exists(vocab_file) and os.path.exists(tensor_file)):
            print("reading text file")
            self.preprocess(input_file, vocab_file, tensor_file)
        else:
            print("loading preprocessed files")
            self.load_preprocessed(vocab_file, tensor_file)
        self.create_batches()
        self.reset_batch_pointer()
    def preprocess(self, input_file, vocab_file, tensor_file):
        with codecs.open(input_file, "r", encoding=self.encoding) as f:
            data = f.read()
        counter = collections.Counter(data)
        count_pairs = sorted(counter.items(), key=lambda x: -x[1])
        self.chars, _ = zip(*count_pairs)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        with open(vocab_file, 'wb') as f:
            cPickle.dump(self.chars, f)
        self.tensor = np.array(list(map(self.vocab.get, data)))
        np.save(tensor_file, self.tensor)
    def load_preprocessed(self, vocab_file, tensor_file):
        with open(vocab_file, 'rb') as f:
            self.chars = cPickle.load(f)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        self.tensor = np.load(tensor_file)
        self.num_batches = int(self.tensor.size / (self.batch_size *
                                                   self.seq_length))
    def create_batches(self):
        self.num_batches = int(self.tensor.size / (self.batch_size *
                                                   self.seq_length))
        # When the data (tensor) is too small, give a clearer error message
        if self.num_batches == 0:
            assert False, "Not enough data. Make seq_length and batch_size smaller."
        self.tensor = self.tensor[:self.num_batches * self.batch_size * self.seq_length]
        xdata = self.tensor
        ydata = np.copy(self.tensor)
        ydata[:-1] = xdata[1:]
        ydata[-1] = xdata[0]
        self.x_batches = np.split(xdata.reshape(self.batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(ydata.reshape(self.batch_size, -1), self.num_batches, 1)
    def next_batch(self):
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y
    def reset_batch_pointer(self):
        self.pointer = 0
    
What are batch, num_batches, batch_size and seq_length in the character-level example?
Let's assume the input is 'here is an example'. Then:
seq_length is the number of characters the network reads as one sequence (e.g. a sequence of length 4 is 'here').
batch_size is the number of such sequences fed to the network in parallel in one training step.
num_batches is the number of [batch_size x seq_length] chunks the whole text provides per epoch.
A toy sketch of this slicing is shown below.
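To make this concrete, here is a toy sketch (not part of the notebook's pipeline) that slices 'here is an example' the same way TextLoader.create_batches does. The tiny batch_size of 2, seq_length of 4, and the alphabetically ordered vocabulary are assumptions for illustration only.

import numpy as np

text = 'here is an example'
vocab = {c: i for i, c in enumerate(sorted(set(text)))}   # toy vocabulary (TextLoader orders by frequency)
tensor = np.array([vocab[c] for c in text])               # 18 character ids

batch_size, seq_length = 2, 4                             # toy values for illustration
num_batches = tensor.size // (batch_size * seq_length)    # 18 // 8 = 2
tensor = tensor[:num_batches * batch_size * seq_length]   # drop the 2 leftover characters

xdata = tensor
ydata = np.copy(tensor)
ydata[:-1] = xdata[1:]                                    # target = input shifted left by one character
ydata[-1] = xdata[0]

x_batches = np.split(xdata.reshape(batch_size, -1), num_batches, 1)
y_batches = np.split(ydata.reshape(batch_size, -1), num_batches, 1)
print(x_batches[0].shape)                                 # (2, 4): batch_size x seq_length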
So, what are our actual parameters?
In [4]:
    
batch_size = 60 # minibatch size: number of sequences processed in parallel per training step
seq_length = 50 # RNN sequence length: number of characters in each training sequence
num_epochs = 25 # increase this (e.g. to 50) to get relatively good results
learning_rate = 0.002
decay_rate = 0.97
rnn_size = 128 # size of the RNN hidden state
num_layers = 2 # number of layers in the RNN
    
In [5]:
    
!mkdir -p ../../data/character_model
!wget -nv -O ../../data/character_model/input.txt https://ibm.box.com/shared/static/a3f9e9mbpup09toq35ut7ke3l3lf03hg.txt
    
    
In [6]:
    
data_loader = TextLoader('../../data/character_model/', batch_size, seq_length)
vocab_size = data_loader.vocab_size
data_loader.vocab_size
    
    
    Out[6]:
In [7]:
    
data_loader.num_batches
    
    Out[7]:
In [8]:
    
x,y = data_loader.next_batch()
    
In [9]:
    
x
    
    Out[9]:
In [10]:
    
x.shape  #batch_size=60, seq_length=50
    
    Out[10]:
In [11]:
    
y
    
    Out[11]:
In [12]:
    
print('Vocabulary size:', data_loader.vocab_size)
    
    
In [13]:
    
print(", ".join(sorted(list(data_loader.chars))))
    
    
In [14]:
    
data_loader.vocab['t']
    
    Out[14]:
BasicRNNCell is the most basic RNN cell: at each time step it computes output = new_state = tanh([input, state] @ W + b).
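A minimal numpy sketch of that update for a single time step (a simplification; the 128-dimensional input stands in for a character embedding, and the random weights are placeholders for the cell's learned kernel):

import numpy as np

rnn_size = 128
x = np.random.randn(1, rnn_size)                     # one embedded character
h_prev = np.zeros((1, rnn_size))                     # previous hidden state
W = 0.01 * np.random.randn(2 * rnn_size, rnn_size)   # kernel acting on [x, h_prev]
b = np.zeros(rnn_size)

h_new = np.tanh(np.concatenate([x, h_prev], axis=1) @ W + b)
print(h_new.shape)                                   # (1, 128): the output and the new state are the same tensor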
In [15]:
    
# a two layer cell
with tf.variable_scope('multi_rnn_cell'):
    stacked_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicRNNCell(rnn_size) for _ in range(num_layers)])
    
In [16]:
    
# hidden state size
stacked_cell.output_size
    
    Out[16]:
In [17]:
    
stacked_cell.state_size
    
    Out[17]:
In [18]:
    
input_data = tf.placeholder(tf.int32, [batch_size, seq_length])# a 60x50
targets = tf.placeholder(tf.int32, [batch_size, seq_length]) # a 60x50
    
The memory state of the network is initialized with a vector of zeros and gets updated after reading each character.
BasicRNNCell.zero_state(batch_size, dtype) returns zero-filled state tensor(s).
Args:
batch_size: int, float, or unit Tensor representing the batch size.
dtype: the data type to use for the state.
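For the two-layer stacked cell defined above, zero_state gives one zero tensor per layer. A quick sketch of the shapes, assuming batch_size = 60 and rnn_size = 128 as set earlier:

zeros = stacked_cell.zero_state(batch_size, tf.float32)
print(len(zeros))             # 2: one state tensor per layer
print(zeros[0].get_shape())   # (60, 128)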
In [19]:
    
initial_state = stacked_cell.zero_state(batch_size, tf.float32) # one state row per sequence in the batch: two layers, each [60x128]
    
In [20]:
    
input_data
    
    Out[20]:
In [21]:
    
session = tf.Session()
    
In [22]:
    
feed_dict={input_data:x, targets:y}
    
In [23]:
    
session.run(input_data, feed_dict)
    
    Out[23]:
In [24]:
    
with tf.variable_scope('rnnlm', reuse=False):
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size]) # 128x65
    softmax_b = tf.get_variable("softmax_b", [vocab_size]) # 1x65
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])  # 65x128
        # input_data is a 60x50 matrix of character ids; embedding is a 65x128 table with one row per character.
        # embedding_lookup replaces each id in input_data with its 128-dimensional vector,
        # producing a 60x50x128 tensor; e.g. em[0] is a 50x128 matrix whose rows represent
        # the 50 characters of the first sequence in the batch.
        em = tf.nn.embedding_lookup(embedding, input_data) # em is 60x50x128
        # tf.split(value, num_or_size_splits, axis) splits a tensor into sub-tensors;
        # here it splits the 60x50x128 tensor along the time axis into 50 tensors of 60x1x128.
        inputs = tf.split(em, seq_length, 1)
        # squeeze removes the middle dimension, giving a list of 50 matrices of [60x128]
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
    
In [25]:
    
session.run(tf.global_variables_initializer())
session.run(embedding)
    
    Out[25]:
In [26]:
    
em = tf.nn.embedding_lookup(embedding, input_data)
em
    
    Out[26]:
In [27]:
    
emp = session.run(em,feed_dict={input_data:x})
emp.shape
    
    Out[27]:
In [28]:
    
emp[0]
    
    Out[28]:
In [29]:
    
inputs = tf.split(em, seq_length, 1)
inputs[0:5]
    
    Out[29]:
In [30]:
    
inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
inputs[0:5]
    
    Out[30]:
The parallelism is only for efficiency. The network still sees one character of each sequence at a time and does the computations accordingly; at a given time step, the computation for that character position is carried out for all sequences in the batch in parallel.
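To make "parallel across the batch, sequential across time" concrete, here is a self-contained numpy sketch with the same shapes as this notebook (60 sequences, 50 steps, hidden size 128); the random weights and tanh update are only stand-ins for the stacked cell:

import numpy as np

steps = [np.random.randn(60, 128) for _ in range(50)]   # stand-in for `inputs`: one [60x128] matrix per time step
state = np.zeros((60, 128))                             # one hidden state per sequence in the batch
W = 0.01 * np.random.randn(256, 128)                    # toy recurrent weights acting on [input, state]

for step in steps:                                      # sequential over the 50 characters
    state = np.tanh(np.concatenate([step, state], axis=1) @ W)  # parallel over the 60 sequences

print(state.shape)                                      # (60, 128)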
In [31]:
    
session.run(inputs[0],feed_dict={input_data:x})
    
    Out[31]:
In [32]:
    
stacked_cell.state_size
    
    Out[32]:
In [33]:
    
# outputs is a list of 50 tensors, each of shape [60x128]
outputs, last_state = tf.contrib.legacy_seq2seq.rnn_decoder(inputs, initial_state, stacked_cell, loop_function=None, scope='rnnlm')
    
In [34]:
    
outputs[0:5]
    
    Out[34]:
In [35]:
    
test = outputs[0]
test
    
    Out[35]:
In [36]:
    
session.run(tf.global_variables_initializer())
session.run(test,feed_dict={input_data:x})
    
    Out[36]:
outputs is a list of 50 tensors of shape [60x128]. We concatenate them and reshape the result to [3000x128], one row per character position (batch_size * seq_length = 3000). Then we can calculate the softmax:
softmax_w is [rnn_size, vocab_size], i.e. [128x65]
[3000x128] x [128x65] + [65] = [3000x65] logits
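As a quick sanity check of those shapes, a small numpy sketch (stand-in arrays only, not the graph tensors built below):

import numpy as np

fake_outputs = [np.zeros((60, 128)) for _ in range(50)]            # stand-in for `outputs`
flat = np.concatenate(fake_outputs, axis=1).reshape(-1, 128)       # mirrors tf.reshape(tf.concat(outputs, 1), [-1, rnn_size])
print(flat.shape)                                                  # (3000, 128)
print((flat @ np.zeros((128, 65)) + np.zeros(65)).shape)           # (3000, 65): one row of logits per character position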
In [37]:
    
output = tf.reshape(tf.concat(outputs, 1), [-1, rnn_size])
output
    
    Out[37]:
In [38]:
    
logits = tf.matmul(output, softmax_w) + softmax_b
logits
    
    Out[38]:
In [39]:
    
probs = tf.nn.softmax(logits)
probs
    
    Out[39]:
In [40]:
    
session.run(tf.global_variables_initializer())
session.run(probs,feed_dict={input_data:x})
    
    Out[40]:
In [41]:
    
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits],
                [tf.reshape(targets, [-1])],
                [tf.ones([batch_size * seq_length])],
                vocab_size)
    
In [42]:
    
cost = tf.reduce_sum(loss) / batch_size / seq_length
cost
    
    Out[42]:
In [43]:
    
final_state = last_state
final_state
    
    Out[43]:
In [44]:
    
lr = tf.Variable(0.0, trainable=False)
    
In [45]:
    
grad_clip = 5.
tvars = tf.trainable_variables()
    
In [46]:
    
tvars
    
    Out[46]:
In [47]:
    
session.run(tf.global_variables_initializer())
[v.name for v in tf.global_variables()]
    
    Out[47]:
In [48]:
    
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
grads
    
    Out[48]:
In [49]:
    
session.run(grads, feed_dict)[0]
    
    Out[49]:
In [50]:
    
optimizer = tf.train.AdamOptimizer(lr)
train_op = optimizer.apply_gradients(zip(grads, tvars))
    
In [51]:
    
class LSTMModel():
    def __init__(self,sample=False):
        rnn_size = 128 # size of RNN hidden state vector
        batch_size = 60 # minibatch size: number of sequences processed in parallel per training step
        seq_length = 50 # RNN sequence length
        num_layers = 2 # number of layers in the RNN
        vocab_size = 65
        grad_clip = 5.
        if sample:
            print("sample mode")
            batch_size = 1
            seq_length = 1
        # model.cell.state_size is (128, 128)
        with tf.variable_scope('lstm_model_cell'):
            reuse = tf.get_variable_scope().reuse
            self.stacked_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicRNNCell(rnn_size, reuse=reuse) 
                                                         for _ in range(num_layers)])
        self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
        # Initial state of the LSTM memory.
        # The memory state of the network is initialized with a vector of zeros and gets updated after reading each char. 
        self.initial_state = self.stacked_cell.zero_state(batch_size, tf.float32) # one zero state per sequence in the batch
        with tf.variable_scope('rnnlm_class1'):
            softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size]) #128x65
            softmax_b = tf.get_variable("softmax_b", [vocab_size]) # 1x65
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [vocab_size, rnn_size])  #65x128
                inputs = tf.split(tf.nn.embedding_lookup(embedding, self.input_data), seq_length, 1)
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
                #inputs = tf.split(em, seq_length, 1)
        # The value of state is updated after processing each batch of chars.
        outputs, last_state = tf.contrib.legacy_seq2seq.rnn_decoder(inputs, self.initial_state, self.stacked_cell, loop_function=None, scope='rnnlm_class1')
        output = tf.reshape(tf.concat(outputs,1), [-1, rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([self.logits],
                [tf.reshape(self.targets, [-1])],
                [tf.ones([batch_size * seq_length])],
                vocab_size)
        self.cost = tf.reduce_sum(loss) / batch_size / seq_length
        self.final_state = last_state
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        
    def sample(self, sess, chars, vocab, num=200, prime='The ', sampling_type=1):
        state = sess.run(self.stacked_cell.zero_state(1, tf.float32))
        for char in prime[:-1]:
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed = {self.input_data: x, self.initial_state:state}
            [state] = sess.run([self.final_state], feed)
        def weighted_pick(weights):
            t = np.cumsum(weights)
            s = np.sum(weights)
            return(int(np.searchsorted(t, np.random.rand(1)*s)))
        ret = prime
        char = prime[-1]
        for n in range(num):
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed = {self.input_data: x, self.initial_state:state}
            [probs, state] = sess.run([self.probs, self.final_state], feed)
            p = probs[0]
            if sampling_type == 0:
                sample = np.argmax(p)
            elif sampling_type == 2:
                if char == ' ':
                    sample = weighted_pick(p)
                else:
                    sample = np.argmax(p)
            else: # sampling_type == 1 default:
                sample = weighted_pick(p)
            pred = chars[sample]
            ret += pred
            char = pred
        return ret
    
The input to the model is always a matrix of shape [n x m], where n is the batch size and m is the sequence length. In our case, each input batch is [60 x 50] character ids (each character then becomes a 128-dimensional embedding).
The size of the data is 1,113,000 characters and the number of batches is 371, with batch size 60 and sequence length 50, so 60 x 50 x 371 = 1,113,000.
We train for 50 epochs; each [60 x 50] batch produces one parameter update, so each epoch performs 371 updates.
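A quick arithmetic check of those numbers (a sketch using the data size stated above):

data_size = 1113000
updates_per_epoch = data_size // (batch_size * seq_length)    # each batch holds 60 * 50 = 3000 characters
print(updates_per_epoch)                                      # 371 parameter updates per epoch
print(updates_per_epoch * batch_size * seq_length)            # 1113000 characters consumed per epoch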
In [52]:
    
with tf.variable_scope("rnn"):
    model = LSTMModel()
    
In [53]:
    
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
e=1
sess.run(tf.assign(model.lr, learning_rate * (decay_rate ** e)))
data_loader.reset_batch_pointer()
state = sess.run(model.initial_state)
state
    
    Out[53]:
In [54]:
    
x, y = data_loader.next_batch()
feed = {model.input_data: x, model.targets: y, model.initial_state:state}
    
In [55]:
    
train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
train_loss
    
    Out[55]:
In [56]:
    
state
    
    Out[56]:
In [57]:
    
initial_lr = 0.01
num_epochs = 50
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for e in range(num_epochs): # num_epochs is set to 50 above; reduce it for a quick test
        current_lr = initial_lr * (decay_rate ** e)
        sess.run(tf.assign(model.lr, current_lr))
        print('Epoch {} ({} / {} batches, lr={:.4f})'.format(
            e+1,
            (e+1) * data_loader.num_batches, 
            num_epochs * data_loader.num_batches,
            current_lr
        ))
        data_loader.reset_batch_pointer()
        state = sess.run(model.initial_state) # (2x[60x128])
        for b in range(data_loader.num_batches): #for each batch
            start = time.time()
            x, y = data_loader.next_batch()
            feed = {model.input_data: x, model.targets: y, model.initial_state:state}
            train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
            end = time.time()
        print("Train_loss={:.3f}   Time/Batch={:.3f} ms".format(
            train_loss, 
            (end - start) * 1000
        ))
        print()
        #model.sample(sess, data_loader.chars , data_loader.vocab, num=200, prime='The ', sampling_type=1)
    
    
    
In [ ]:
    
sess = tf.InteractiveSession()
with tf.variable_scope("sample_test"):
    sess.run(tf.global_variables_initializer())
    m = LSTMModel(sample=True)
    
In [ ]:
    
prime='The '
num=200
sampling_type=1
vocab=data_loader.vocab
chars=data_loader.chars
    
In [ ]:
    
sess.run(m.initial_state)
    
In [ ]:
    
#print state
sess.run(tf.global_variables_initializer())
state=sess.run(m.initial_state)
for char in prime[:-1]:
    x = np.zeros((1, 1))
    x[0, 0] = vocab[char]
    feed = {m.input_data: x, m.initial_state:state}
    [state] = sess.run([m.final_state], feed)
    
In [ ]:
    
state
    
In [ ]:
    
def weighted_pick(weights):
    t = np.cumsum(weights)
    s = np.sum(weights)
    return(int(np.searchsorted(t, np.random.rand(1)*s)))
ret = prime
char = prime[-1]
for n in range(num):
    x = np.zeros((1, 1))
    x[0, 0] = vocab[char]
    feed = {m.input_data: x, m.initial_state:state}
    [probs, state] = sess.run([m.probs, m.final_state], feed)
    p = probs[0]
    if sampling_type == 0:
        sample = np.argmax(p)
    elif sampling_type == 2:
        if char == ' ':
            sample = weighted_pick(p)
        else:
            sample = np.argmax(p)
    else: # sampling_type == 1 default:
        sample = weighted_pick(p)
    pred = chars[sample]
    ret += pred
    char = pred
    
In [ ]:
    
ret
    
In [66]:
    
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
state=sess.run(m.initial_state)
m.sample(sess, data_loader.chars , data_loader.vocab, num=200, prime='The ', sampling_type=1)
    
    Out[66]:
In [ ]: