In [1]:
# Import Packages
import numpy as np
import tensorflow as tf
import collections
import argparse
import time
import os
from six.moves import cPickle
print ("Packages Imported")


Packages Imported

In [2]:
# Load text
data_dir    = "data/linux_kernel"
save_dir    = "data/linux_kernel"
input_file  = os.path.join(data_dir, "input.txt")
with open(input_file, "r") as f:
    data = f.read()
print ("Text loaded from '%s'" % (input_file))


Text loaded from 'data/linux_kernel/input.txt'
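
Before preprocessing, it can help to peek at what was just loaded. A minimal sanity check (not part of the original notebook; output omitted):

print ("Length of 'data' is %d characters" % len(data))
print (data[:100])  # the first 100 characters of the concatenated kernel source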

In [3]:
# Preprocess Text
# First, count the occurrences of each character
counter = collections.Counter(data)
count_pairs = sorted(counter.items(), key=lambda x: -x[1]) # <= Sort
print ("Type of 'counter.items()' is %s and length is %d" 
       % (type(counter.items()), len(counter.items()))) 
for i in range(5):
    print ("[%d/%d]" % (i, 3)), # <= This comma remove '\n'
    print (list(counter.items())[i])

print (" ")
print ("Type of 'count_pairs' is %s and length is %d" 
       % (type(count_pairs), len(count_pairs))) 
for i in range(5):
    print ("[%d/%d]" % (i, 3)), # <= This comma remove '\n'
    print (count_pairs[i])


Type of 'counter.items()' is <type 'list'> and length is 99
[0/3] (' ', 171222)
[1/3] ('$', 61)
[2/3] ('(', 23412)
[3/3] (',', 17025)
[4/3] ('0', 4322)
 
Type of 'count_pairs' is <type 'list'> and length is 99
[0/3] (' ', 171222)
[1/3] ('e', 113021)
[2/3] ('t', 102154)
[3/3] ('r', 76185)
[4/3] ('i', 75486)
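
As a quick illustration of what collections.Counter plus the sort does, here is the same recipe on a tiny string (standalone sketch, not part of the notebook):

toy_counter = collections.Counter("hello world")                  # char -> count
toy_pairs   = sorted(toy_counter.items(), key=lambda x: -x[1])    # most frequent first
print (toy_pairs)   # e.g. [('l', 3), ('o', 2), ('h', 1), ...]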

In [4]:
# Let's make a dictionary
chars, counts = zip(*count_pairs)
vocab = dict(zip(chars, range(len(chars))))
print ("Type of 'chars' is %s and length is %d" 
    % (type(chars), len(chars))) 
for i in range(5):
    print ("[%d/%d]" % (i, 3)), # <= This comma remove '\n'
    print ("chars[%d] is '%s'" % (i, chars[i]))
    
print ("")

print ("Type of 'vocab' is %s and length is %d" 
    % (type(vocab), len(vocab))) 
for i in range(5):
    print ("[%d/%d]" % (i, 3)), # <= This comma remove '\n'
    print ("vocab['%s'] is %s" % (chars[i], vocab[chars[i]]))
    
# Save chars and vocab
with open(os.path.join(save_dir, 'chars_vocab.pkl'), 'wb') as f:
    cPickle.dump((chars, vocab), f)


Type of 'chars' is <type 'tuple'> and length is 99
[0/3] chars[0] is ' '
[1/3] chars[1] is 'e'
[2/3] chars[2] is 't'
[3/3] chars[3] is 'r'
[4/3] chars[4] is 'i'

Type of 'vocab' is <type 'dict'> and length is 99
[0/3] vocab[' '] is 0
[1/3] vocab['e'] is 1
[2/3] vocab['t'] is 2
[3/3] vocab['r'] is 3
[4/3] vocab['i'] is 4

chars[0] converts an index to a character

vocab['a'] converts a character to an index
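
The two mappings are inverses of each other; a quick round-trip check (illustrative, not part of the original notebook):

idx = vocab['a']              # char -> index
print (chars[idx] == 'a')     # index -> char recovers the original: True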


In [5]:
# Now convert all text to indices using 'vocab'!
corpus = np.array(list(map(vocab.get, data)))
print ("Type of 'corpus' is %s, shape is %s, and length is %d" 
    % (type(corpus), corpus.shape, len(corpus)))

check_len = 10
print ("\n'corpus' looks like %s" % (corpus[0:check_len]))
for i in range(check_len):
    _wordidx = corpus[i]
    print ("[%d/%d] chars[%02d] corresponds to '%s'" 
           % (i, check_len, _wordidx, chars[_wordidx]))


Type of 'corpus' is <type 'numpy.ndarray'>, shape is (1708871,), and length is 1708871

'corpus' looks like [36 22  7  0 22  0  0 13  4  8]
[0/10] chars[36] corresponds to '/'
[1/10] chars[22] corresponds to '*'
[2/10] chars[07] corresponds to '
'
[3/10] chars[00] corresponds to ' '
[4/10] chars[22] corresponds to '*'
[5/10] chars[00] corresponds to ' '
[6/10] chars[00] corresponds to ' '
[7/10] chars[13] corresponds to 'l'
[8/10] chars[04] corresponds to 'i'
[9/10] chars[08] corresponds to 'n'
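
Because 'corpus' is just a sequence of indices, the original text can be recovered by mapping each index back through 'chars'. A small sanity check (sketch, not in the original notebook):

recovered = ''.join(chars[idx] for idx in corpus[:40])
print (recovered == data[:40])   # True: the char <-> index conversion is lossless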

In [6]:
# Generate batch data 
batch_size  = 50
seq_length  = 200
num_batches = int(corpus.size / (batch_size * seq_length))
# First, trim the corpus so it splits evenly into batches
corpus_reduced = corpus[:(num_batches*batch_size*seq_length)]
xdata = corpus_reduced
ydata = np.copy(xdata)
ydata[:-1] = xdata[1:]
ydata[-1]  = xdata[0]
print ('xdata is ... %s and length is %d' % (xdata, xdata.size))
print ('ydata is ... %s and length is %d' % (ydata, ydata.size))
print ("")

# Second, make batches
xbatches = np.split(xdata.reshape(batch_size, -1), num_batches, 1)
ybatches = np.split(ydata.reshape(batch_size, -1), num_batches, 1)
print ("Type of 'xbatches' is %s and length is %d" 
    % (type(xbatches), len(xbatches)))
print ("Type of 'ybatches' is %s and length is %d" 
    % (type(ybatches), len(ybatches)))
print ("")

# How can we access xbatches?
nbatch = 5
temp = xbatches[0:nbatch]
print ("Type of 'temp' is %s and length is %d" 
    % (type(temp), len(temp)))
for i in range(nbatch):
    temp2 = temp[i]
    print ("Type of 'temp[%d]' is %s and shape is %s" % (i, type(temp2), temp2.shape,))


xdata is ... [36 22  7 ..., 11 25  3] and length is 1700000
ydata is ... [22  7  0 ..., 25  3 36] and length is 1700000

Type of 'xbatches' is <type 'list'> and length is 170
Type of 'ybatches' is <type 'list'> and length is 170

Type of 'temp' is <type 'list'> and length is 5
Type of 'temp[0]' is <type 'numpy.ndarray'> and shape is (50, 200)
Type of 'temp[1]' is <type 'numpy.ndarray'> and shape is (50, 200)
Type of 'temp[2]' is <type 'numpy.ndarray'> and shape is (50, 200)
Type of 'temp[3]' is <type 'numpy.ndarray'> and shape is (50, 200)
Type of 'temp[4]' is <type 'numpy.ndarray'> and shape is (50, 200)
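
Each ybatch is simply the matching xbatch shifted left by one character, which is exactly the target a next-character predictor needs. A quick check of that relationship (illustrative sketch):

print (np.array_equal(ybatches[0][:, :-1], xbatches[0][:, 1:]))   # True within a batch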

Now, we are ready to make our RNN model with seq2seq


In [7]:
# Important RNN parameters 
vocab_size = len(vocab)
rnn_size   = 128
num_layers = 2
grad_clip  = 5.

def unit_cell():
    return tf.contrib.rnn.BasicLSTMCell(rnn_size, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
cell = tf.contrib.rnn.MultiRNNCell([unit_cell() for _ in range(num_layers)])

input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
targets    = tf.placeholder(tf.int32, [batch_size, seq_length])
istate     = cell.zero_state(batch_size, tf.float32)
# Weights
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
        inputs = tf.split(tf.nn.embedding_lookup(embedding, input_data), seq_length, 1)
        inputs = [tf.squeeze(_input, [1]) for _input in inputs]
# Output
# ('loop' feeds the previous prediction back as the next input when sampling;
#  it is not used in the training graph below)
def loop(prev, _):
    prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
    prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
    return tf.nn.embedding_lookup(embedding, prev_symbol)
"""
    loop_function: If not None, this function will be applied to the i-th output
    in order to generate the i+1-st input, and decoder_inputs will be ignored,
    except for the first element ("GO" symbol).
""" 
outputs, last_state = tf.contrib.rnn.static_rnn(cell, inputs, istate, scope='rnnlm')
output = tf.reshape(tf.concat(outputs, 1), [-1, rnn_size])
logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
probs  = tf.nn.softmax(logits)
# Loss
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], # Input
    [tf.reshape(targets, [-1])], # Target
    [tf.ones([batch_size * seq_length])], # Weight 
    vocab_size)
# Optimizer
cost     = tf.reduce_sum(loss) / batch_size / seq_length
final_state = last_state
lr       = tf.Variable(0.0, trainable=False)
tvars    = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
_optm    = tf.train.AdamOptimizer(lr)
optm     = _optm.apply_gradients(zip(grads, tvars))

print ("Network Ready")


Network Ready
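
With unit weights, sequence_loss_by_example reduces to the per-character cross-entropy of the softmax outputs, and 'cost' is its mean over the batch. A small NumPy illustration of that computation (an assumed equivalence for intuition, not code from the notebook):

toy_logits  = np.random.randn(4, vocab_size)                      # 4 characters
toy_targets = np.array([3, 1, 0, 7])
toy_probs   = np.exp(toy_logits) / np.exp(toy_logits).sum(axis=1, keepdims=True)
toy_nll     = -np.log(toy_probs[np.arange(4), toy_targets])       # cross-entropy per character
print (toy_nll.mean())   # analogous to 'cost' above (weights of one)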

In [9]:
# Train the model!
num_epochs    = 50
save_every    = 500
learning_rate = 0.002
decay_rate    = 0.97

sess = tf.Session()
sess.run(tf.initialize_all_variables()) # deprecated; tf.global_variables_initializer() in newer TF
summary_writer = tf.summary.FileWriter(save_dir, graph=sess.graph)
saver = tf.train.Saver(tf.all_variables()) # deprecated; tf.global_variables() in newer TF
init_time = time.time()
for epoch in range(num_epochs):
    # Learning rate scheduling 
    sess.run(tf.assign(lr, learning_rate * (decay_rate ** epoch)))
    state     = sess.run(istate)
    batchidx  = 0
    for iteration in range(num_batches):
        start_time   = time.time()
        randbatchidx = np.random.randint(num_batches) # (unused; batches are visited in order)
        xbatch       = xbatches[batchidx]
        ybatch       = ybatches[batchidx]
        batchidx     = batchidx + 1
        
        # Note that num_batches == len(xbatches)
        # Train! 
        train_loss, state, _ = sess.run([cost, final_state, optm],
            feed_dict={input_data: xbatch, targets: ybatch, istate: state})
        total_iter = epoch*num_batches + iteration
        end_time     = time.time()
        duration     = end_time - start_time
        
        if total_iter % 100 == 0:
            print ("[%d/%d] cost: %.4f / Each batch learning took %.4f sec" 
                   % (total_iter, num_epochs*num_batches, train_loss, duration))
        if total_iter % save_every == 0: 
            ckpt_path = os.path.join(save_dir, 'model.ckpt')
            saver.save(sess, ckpt_path, global_step = total_iter)
            # Save network! 
            print("model saved to '%s'" % (ckpt_path))


WARNING:tensorflow:From <ipython-input-9-c08af8068626>:8: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
WARNING:tensorflow:From <ipython-input-9-c08af8068626>:10: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.
[0/8500] cost: 4.6006 / Each batch learning took 2.2222 sec
model saved to 'data/linux_kernel/model.ckpt'
[100/8500] cost: 3.1259 / Each batch learning took 0.3366 sec
[200/8500] cost: 2.5992 / Each batch learning took 0.3258 sec
[300/8500] cost: 2.4603 / Each batch learning took 0.3260 sec
[400/8500] cost: 2.2591 / Each batch learning took 0.3136 sec
[500/8500] cost: 2.0035 / Each batch learning took 0.3140 sec
model saved to 'data/linux_kernel/model.ckpt'
[600/8500] cost: 1.9589 / Each batch learning took 0.3695 sec
[700/8500] cost: 1.8066 / Each batch learning took 0.3130 sec
[800/8500] cost: 1.7801 / Each batch learning took 0.3119 sec
[900/8500] cost: 1.7433 / Each batch learning took 0.4185 sec
[1000/8500] cost: 1.6289 / Each batch learning took 0.3153 sec
model saved to 'data/linux_kernel/model.ckpt'
[1100/8500] cost: 1.6194 / Each batch learning took 0.3388 sec
[1200/8500] cost: 1.4603 / Each batch learning took 0.3129 sec
[1300/8500] cost: 1.5877 / Each batch learning took 0.3141 sec
[1400/8500] cost: 1.5235 / Each batch learning took 0.3087 sec
[1500/8500] cost: 1.5317 / Each batch learning took 0.3440 sec
model saved to 'data/linux_kernel/model.ckpt'
[1600/8500] cost: 1.5362 / Each batch learning took 0.4632 sec
[1700/8500] cost: 1.4946 / Each batch learning took 0.3351 sec
[1800/8500] cost: 1.4392 / Each batch learning took 0.3374 sec
[1900/8500] cost: 1.4224 / Each batch learning took 0.3323 sec
[2000/8500] cost: 1.4797 / Each batch learning took 0.3115 sec
model saved to 'data/linux_kernel/model.ckpt'
[2100/8500] cost: 1.4381 / Each batch learning took 0.3863 sec
[2200/8500] cost: 1.3570 / Each batch learning took 0.3080 sec
[2300/8500] cost: 1.3689 / Each batch learning took 0.3120 sec
[2400/8500] cost: 1.3241 / Each batch learning took 0.3174 sec
[2500/8500] cost: 1.3431 / Each batch learning took 0.3326 sec
model saved to 'data/linux_kernel/model.ckpt'
[2600/8500] cost: 1.3311 / Each batch learning took 0.4586 sec
[2700/8500] cost: 1.2888 / Each batch learning took 0.3147 sec
[2800/8500] cost: 1.3359 / Each batch learning took 0.3262 sec
[2900/8500] cost: 1.1899 / Each batch learning took 0.3310 sec
[3000/8500] cost: 1.3265 / Each batch learning took 0.3324 sec
model saved to 'data/linux_kernel/model.ckpt'
[3100/8500] cost: 1.2806 / Each batch learning took 0.5395 sec
[3200/8500] cost: 1.3113 / Each batch learning took 0.3448 sec
[3300/8500] cost: 1.3262 / Each batch learning took 0.3422 sec
[3400/8500] cost: 1.3011 / Each batch learning took 0.3195 sec
[3500/8500] cost: 1.2781 / Each batch learning took 0.3138 sec
model saved to 'data/linux_kernel/model.ckpt'
[3600/8500] cost: 1.2607 / Each batch learning took 0.3156 sec
[3700/8500] cost: 1.2897 / Each batch learning took 0.4064 sec
[3800/8500] cost: 1.2809 / Each batch learning took 0.3063 sec
[3900/8500] cost: 1.2301 / Each batch learning took 0.3330 sec
[4000/8500] cost: 1.2372 / Each batch learning took 0.3157 sec
model saved to 'data/linux_kernel/model.ckpt'
[4100/8500] cost: 1.2088 / Each batch learning took 0.3536 sec
[4200/8500] cost: 1.2277 / Each batch learning took 0.3146 sec
[4300/8500] cost: 1.2095 / Each batch learning took 0.3148 sec
[4400/8500] cost: 1.1840 / Each batch learning took 0.3425 sec
[4500/8500] cost: 1.2459 / Each batch learning took 0.3368 sec
model saved to 'data/linux_kernel/model.ckpt'
[4600/8500] cost: 1.0941 / Each batch learning took 0.4124 sec
[4700/8500] cost: 1.2265 / Each batch learning took 0.3164 sec
[4800/8500] cost: 1.1862 / Each batch learning took 0.3307 sec
[4900/8500] cost: 1.2198 / Each batch learning took 0.3371 sec
[5000/8500] cost: 1.2345 / Each batch learning took 0.3298 sec
model saved to 'data/linux_kernel/model.ckpt'
[5100/8500] cost: 1.2081 / Each batch learning took 0.3418 sec
[5200/8500] cost: 1.2043 / Each batch learning took 0.3105 sec
[5300/8500] cost: 1.1929 / Each batch learning took 0.3377 sec
[5400/8500] cost: 1.2155 / Each batch learning took 0.3373 sec
[5500/8500] cost: 1.2052 / Each batch learning took 0.3908 sec
model saved to 'data/linux_kernel/model.ckpt'
[5600/8500] cost: 1.1683 / Each batch learning took 0.3207 sec
[5700/8500] cost: 1.1695 / Each batch learning took 0.3358 sec
[5800/8500] cost: 1.1485 / Each batch learning took 0.3392 sec
[5900/8500] cost: 1.1671 / Each batch learning took 0.3451 sec
[6000/8500] cost: 1.1481 / Each batch learning took 0.3391 sec
model saved to 'data/linux_kernel/model.ckpt'
[6100/8500] cost: 1.1262 / Each batch learning took 0.3186 sec
[6200/8500] cost: 1.1943 / Each batch learning took 0.4622 sec
[6300/8500] cost: 1.0425 / Each batch learning took 0.3805 sec
[6400/8500] cost: 1.1697 / Each batch learning took 0.3373 sec
[6500/8500] cost: 1.1365 / Each batch learning took 0.3838 sec
model saved to 'data/linux_kernel/model.ckpt'
[6600/8500] cost: 1.1704 / Each batch learning took 0.3196 sec
[6700/8500] cost: 1.1841 / Each batch learning took 0.3364 sec
[6800/8500] cost: 1.1521 / Each batch learning took 0.3404 sec
[6900/8500] cost: 1.1598 / Each batch learning took 0.3631 sec
[7000/8500] cost: 1.1523 / Each batch learning took 0.3372 sec
model saved to 'data/linux_kernel/model.ckpt'
[7100/8500] cost: 1.1689 / Each batch learning took 0.3289 sec
[7200/8500] cost: 1.1579 / Each batch learning took 0.3935 sec
[7300/8500] cost: 1.1316 / Each batch learning took 0.3154 sec
[7400/8500] cost: 1.1284 / Each batch learning took 0.3672 sec
[7500/8500] cost: 1.1087 / Each batch learning took 0.3855 sec
model saved to 'data/linux_kernel/model.ckpt'
[7600/8500] cost: 1.1276 / Each batch learning took 0.5031 sec
[7700/8500] cost: 1.1090 / Each batch learning took 0.3774 sec
[7800/8500] cost: 1.0901 / Each batch learning took 0.4558 sec
[7900/8500] cost: 1.1609 / Each batch learning took 0.4658 sec
[8000/8500] cost: 1.0116 / Each batch learning took 0.3612 sec
model saved to 'data/linux_kernel/model.ckpt'
[8100/8500] cost: 1.1309 / Each batch learning took 0.3854 sec
[8200/8500] cost: 1.1066 / Each batch learning took 0.3160 sec
[8300/8500] cost: 1.1417 / Each batch learning took 0.3196 sec
[8400/8500] cost: 1.1568 / Each batch learning took 0.3270 sec
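
For reference, the learning-rate schedule used above is a simple per-epoch exponential decay. The first few values can be computed directly (sketch, not output from this run):

for e in range(5):
    print ("epoch %d: lr = %.6f" % (e, learning_rate * (decay_rate ** e)))
# epoch 0: 0.002000, epoch 1: 0.001940, epoch 2: 0.001882, ...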

Run the following command:

tensorboard --logdir=data/linux_kernel

Then open http://localhost:6006/ in your web browser


In [9]:
print ("Done!! It took %.4f second. " %(time.time() - init_time))


Done!! It took 5238.4040 seconds.