Create a RNN model to text generation

  • RNN model at character level
    • Input: n character previous
    • Output: next character
    • Model LSTM
  • Use 'El Quijote' to train the generator

In [1]:
# Header
import numpy as np

path = '/home/ubuntu/data/training/keras/'
path = '/Users/jorge/data/training/keras/'

In [2]:
#Read book
text = open(path + "pg2000.txt").read().lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('Chars list: ', chars)
print('total chars:', len(chars))

#Dictionaries to convert char to num & num to char
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


corpus length: 2117498
Chars list:  ['\n', ' ', '!', '"', '#', '$', '%', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '«', '»', '¿', 'à', 'á', 'é', 'í', 'ï', 'ñ', 'ó', 'ù', 'ú', 'ü', '\ufeff']
total chars: 72

In [3]:
# cut the text in semi-redundant sequences of maxlen characters
# One sentence of length 20 for each 3 characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(300, len(text) - maxlen, step): #Start in line 30 to exclude Gutenberg header.
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print(sentences[4996], '-', next_chars[4996])


nb sequences: 705726
tregará a medea; si  - d

In [ ]:


In [4]:
'''
X: One row by sentence
    in each row a matrix of bool 0/1 of dim length_sentence x num_chars coding the sentence. Dummy variables
y: One row by sentence
    in each row a vector of bool of lengt num_chars with 1 in the next char position
'''

print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.float32)
y = np.zeros((len(sentences), len(chars)), dtype=np.int64)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

print('X shape: ',X.shape)
print('y shape: ',y.shape)


Vectorization...
X shape:  (705726, 20, 72)
y shape:  (705726, 72)

In [5]:
# Define the tensorflow graph
import tensorflow as tf

batch_size = 512
dim_lstm = 512

sess = tf.InteractiveSession()


x_input = tf.placeholder(tf.float32, shape=[batch_size, 20, 72])
y_input = tf.placeholder(tf.int64, shape=[batch_size, 72])
#word_lens = tf.placeholder(tf.int64, shape=[batch_size])

keep_prob = tf.placeholder(tf.float32)
learning_rate = tf.placeholder(tf.float32, name='learning_rate')

cell_1 = tf.nn.rnn_cell.LSTMCell(dim_lstm, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123))
cell_1 = tf.nn.rnn_cell.DropoutWrapper(cell_1, output_keep_prob=keep_prob)
#x_seq = [tf.squeeze(xi) for xi in tf.split(1, 20, x_input)]
lstm_outputs_1, _ = tf.nn.dynamic_rnn(cell_1, x_input, dtype=tf.float32, scope='rnn1')
 
cell_2 = tf.nn.rnn_cell.LSTMCell(dim_lstm, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123))
cell_2 = tf.nn.rnn_cell.DropoutWrapper(cell_2, output_keep_prob=keep_prob)
lstm_outputs_2, _ = tf.nn.dynamic_rnn(cell_2,lstm_outputs_1 , dtype=tf.float32, scope='rnn2')


'''
x_seq = [tf.squeeze(xi) for xi in tf.split(1, 20, x_input)]
cell_fw = tf.nn.rnn_cell.LSTMCell(dim_lstm, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123))
cell_bw = tf.nn.rnn_cell.LSTMCell(dim_lstm, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113))
outputs, output_state_fw, output_state_bw = tf.nn.bidirectional_rnn(cell_fw, cell_bw, x_seq,
                                    dtype=tf.float32, sequence_length=word_lens)
print(outputs) 
'''

W_dense = tf.Variable(tf.truncated_normal([dim_lstm, 72], stddev=0.1), name='W_dense')
b_dense = tf.Variable(tf.constant(0.1, shape=[72]), name='b_dense')
dense_output = tf.nn.relu(tf.matmul(lstm_outputs_2[:,-1,:], W_dense) + b_dense)
print(dense_output)

#Prediction
y_pred = tf.nn.softmax(dense_output)

# Loss function
global_step = tf.Variable(0, trainable=False)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(dense_output, y_input, name='cross_entropy')
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,momentum=0.9)
train_op = optimizer.minimize(cross_entropy, global_step=global_step, name='trainer')
    
    
#Accuracy
correct_prediction = tf.equal(tf.argmax(dense_output,1), tf.argmax(y_input,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


Tensor("Relu:0", shape=(512, 72), dtype=float32)

In [6]:
print(lstm_outputs_2)


Tensor("rnn2/transpose:0", shape=(512, 20, 512), dtype=float32)

In [18]:
#batch generator
def batch_generator(x, y, batch_size=32):
    from sklearn.utils import shuffle
    x_shuffle, y_shuffle = shuffle(x, y, random_state=0)
    for i in range(0, x.shape[0]-batch_size, batch_size):
        x_batch = x_shuffle[i:i+batch_size,:]
        y_batch = y_shuffle[i:i+batch_size]
        yield x_batch, y_batch
    
seq = batch_generator(X, y, batch_size=20)
print(next(seq))


(array([[[ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.]],

       ..., 
       [[ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]], dtype=float32), array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]))

In [19]:
num_cases=0
batch_list = batch_generator(X[:600000], y[:600000], batch_size=batch_size)
for x_batch, y_batch in batch_list:
    num_cases += x_batch.shape[0]
print(num_cases)


599552

In [8]:
#Inicialization.
print('Initializing')
sess.run(tf.initialize_all_variables())


Initializing

In [20]:
# Execute the graph to train a network
nEpochs = 50

for epoch in range(nEpochs):
    ce_c=[]
    acc_c=[]
    ce_c_tst=[]
    acc_c_tst=[]
        
    batch_list = batch_generator(X[:600000], y[:600000], batch_size=batch_size)
    for i, batch in enumerate(batch_list):
        feedDict = {x_input: batch[0], y_input: batch[1], 
                    keep_prob: 0.3, learning_rate: 0.001} 
        _, ce, acc = sess.run([train_op, cross_entropy, accuracy], feed_dict=feedDict)
        ce_c += [ce]
        acc_c += [acc]
        if i%10==0:
            print(epoch, np.mean(ce_c), np.mean(acc_c))
            
    batch_list_tst = batch_generator(X[600000:], y[600000:], batch_size=batch_size)
    for x_batch, y_batch in batch_list_tst:
        feedDict = {x_input: x_batch, y_input: y_batch,
                    keep_prob: 1}
        ce_tst, acc_tst = sess.run([cross_entropy, accuracy], feed_dict=feedDict)
        ce_c_tst += [ce_tst]
        acc_c_tst += [acc_tst]

    print(epoch, np.mean(ce_c), np.mean(acc_c), np.mean(ce_c_tst), np.mean(acc_c_tst))


0 3.0669 0.158203
0 3.0985 0.150746
0 3.09113 0.146019
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-20-52201fba9582> in <module>()
     13         feedDict = {x_input: batch[0], y_input: batch[1], 
     14                     keep_prob: 0.3, learning_rate: 0.001} 
---> 15         _, ce, acc = sess.run([train_op, cross_entropy, accuracy], feed_dict=feedDict)
     16         ce_c += [ce]
     17         acc_c += [acc]

/Users/jorge/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    715     try:
    716       result = self._run(None, fetches, feed_dict, options_ptr,
--> 717                          run_metadata_ptr)
    718       if run_metadata:
    719         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/Users/jorge/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    913     if final_fetches or final_targets:
    914       results = self._do_run(handle, final_targets, final_fetches,
--> 915                              feed_dict_string, options, run_metadata)
    916     else:
    917       results = []

/Users/jorge/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
    963     if handle is None:
    964       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
--> 965                            target_list, options, run_metadata)
    966     else:
    967       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/Users/jorge/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
    970   def _do_call(self, fn, *args):
    971     try:
--> 972       return fn(*args)
    973     except errors.OpError as e:
    974       message = compat.as_text(e.message)

/Users/jorge/anaconda3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
    952         return tf_session.TF_Run(session, options,
    953                                  feed_dict, fetch_list, target_list,
--> 954                                  status, run_metadata)
    955 
    956     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [ ]:


In [15]:
batch_list_tst = batch_generator(X[600000:], y[600000:], batch_size=batch_size)
x_batch, y_batch = next(batch_list_tst)
feedDict = {x_input: x_batch, y_input: y_batch, keep_prob: 1}
pred = y_pred.eval(feed_dict=feedDict)
print(pred)


[[ 0.01745436  0.17834605  0.0010904  ...,  0.0010904   0.0010904
   0.0010904 ]
 [ 0.0174567   0.17807572  0.00109281 ...,  0.00109281  0.00109281
   0.00109281]
 [ 0.01742481  0.17907928  0.00108694 ...,  0.00108694  0.00108694
   0.00108694]
 ..., 
 [ 0.01763684  0.17448832  0.00113155 ...,  0.00113155  0.00113155
   0.00113155]
 [ 0.01750312  0.17818648  0.00109263 ...,  0.00109263  0.00109263
   0.00109263]
 [ 0.01763884  0.17448185  0.00113203 ...,  0.00113203  0.00113203
   0.00113203]]

In [10]:
maxlen = 20


def sample(a, diversity=1.0):
    '''
    helper function to sample an index from a probability array
    - Diversity control the level of randomless
    '''
    a = np.log(a) / diversity
    a = np.exp(a) / np.sum(np.exp(a), axis=0)
    a /= np.sum(a+0.0000001) #Precission error
    return np.argmax(np.random.multinomial(1, a, 1))


def generate_text(sentence, diversity, current_model, num_char=400):
    sentence_init = sentence
    generated = ''
    for i in range(400):
        x = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.
        preds = current_model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        generated += next_char
        sentence = sentence[1:] + next_char
    print()
    print('DIVERSITY: ',diversity)
    print(sentence_init + generated)

In [11]:
sentence = 'mire vuestra merced '
generate_text(sentence, 0.2, model1)
generate_text(sentence, 0.5, model1)
generate_text(sentence, 1,   model1)
generate_text(sentence, 1.2, model1)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-ea1745f98db2> in <module>()
      1 sentence = 'mire vuestra merced '
----> 2 generate_text(sentence, 0.2, model1)
      3 generate_text(sentence, 0.5, model1)
      4 generate_text(sentence, 1,   model1)
      5 generate_text(sentence, 1.2, model1)

NameError: name 'model1' is not defined