Import dependencies


In [1]:
import tensorflow as tf
import data_utils
from sklearn.model_selection import train_test_split
import numpy as np
import time

Read a dataset


In [2]:
X, Y, word2idx, idx2word, vocab = data_utils.read_data_set('data.pkl')
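
data_utils is a local helper module that is not shown in this notebook; judging from the values it returns, read_data_set presumably just unpickles the prepared arrays. A minimal sketch, assuming data.pkl stores a dict with these (hypothetical) key names:

# hypothetical sketch of read_data_set - the real data_utils module is not shown here
import pickle

def read_data_set_sketch(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    # X: encoded article texts, Y: encoded headlines (lists of word indices),
    # word2idx / idx2word: mappings in both directions, vocab: words sorted by frequency
    return data['X'], data['Y'], data['word2idx'], data['idx2word'], data['vocab']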

Inspect data


In [3]:
print 'First article headline - encoded:\n', Y[0]
print [idx2word[idx] for idx in Y[0]]
print '\nFirst article text - encoded:\n', X[0]
print '\nMost freq. words:\n', vocab[:50]


First article headline - encoded:
[2636, 5032, 44, 195, 1249]
['ECB', 'defends', 'England', 'tour', 'schedule']

First article text - encoded:
[19, 44, 6, 117, 481, 702, 21, 1279, 403, 636, 5, 2, 14744, 7, 2, 195, 3, 145, 218, 13107, 7, 2, 5366, 8742, 30, 307, 264, 5, 1805, 1745, 2, 55, 7, 79, 66, 3, 66, 435, 2636, 589, 7, 230, 5344, 410, 2369, 23, 60, 2424, 160, 9939, 48, 14, 906, 2, 41, 7204, 27, 2, 145, 709, 702, 4049, 145, 218, 15, 111, 37, 5800, 9278, 1393, 27, 55, 894, 3, 2, 171, 7, 1308, 145, 218, 4883, 258, 2251, 3, 307, 1779, 6, 24, 2081, 36, 1988, 13, 161, 46, 584, 6, 420, 991, 60, 267, 3, 59, 413, 135, 242, 6, 308, 312, 791, 240, 2, 669, 7, 2, 136, 1393, 3, 4268, 2, 4320, 17, 3095, 5047, 58, 7, 476, 230, 60, 33, 513, 3, 12894, 4, 1297, 1204, 7, 230, 47, 2, 1520, 357, 6289, 137, 32, 4570, 17, 5403, 5, 4, 85, 5394, 131, 5, 1410, 7, 48, 2, 230, 10, 119, 61, 69, 89, 1771, 17, 2, 7472, 7, 176, 230, 18, 2, 544, 6, 145, 2086, 1393, 5, 1178, 2019, 13717, 732, 7, 160, 195, 31, 3, 20, 9278, 84, 4, 76, 4674, 1018, 60, 4063, 5, 4, 483, 3, 4275, 4, 15755, 5, 2, 200, 7, 135, 242, 102, 312, 791, 1321, 2369, 50, 5561, 2, 1249, 12, 342, 27, 2510, 8, 2, 57, 6, 23, 2, 1953, 7, 36, 5403, 12, 11231, 2, 3834, 2681, 422, 2, 106, 60, 15, 3, 767, 160, 1486, 54, 701, 13, 2, 145, 709, 639, 6, 332, 17, 45, 1520, 25, 2217, 34, 4, 5934, 527, 2369, 115, 612, 44, 354, 670, 7268, 4366, 1939, 148, 79, 435, 66, 3, 66, 26, 7910, 6, 23, 2369, 6, 44, 104, 2340, 2722, 182, 241, 15, 955, 3, 25, 38, 815, 1210, 18, 1695, 2, 6105, 1398, 7, 2, 44, 135, 167, 130, 2, 225, 72, 12, 122, 322, 5, 4899, 87, 2, 1249, 3876, 135, 242, 5, 79, 6, 4, 216, 335, 10, 4, 411, 1249, 1149, 122, 654, 87, 17, 14, 23, 198, 2, 41, 216, 7, 73, 275, 829, 125, 11, 195, 6, 8, 105, 102, 204, 335, 6, 9, 1334, 109, 593, 2120, 148, 4, 516, 7, 1315, 156, 377, 10, 76, 704, 598, 4819, 148, 204, 135, 242, 13, 122, 156, 5, 283, 818, 2, 131, 4615, 3534, 33, 88, 162, 1165, 19, 156, 152, 6141, 6, 1478, 22, 1149, 76, 346, 63, 3, 539, 548, 262, 162, 69, 4, 610, 7, 314, 3, 95, 11, 13, 25, 60, 58, 1103, 280, 2, 195, 12, 2515, 162, 56, 58, 30, 79, 135, 242, 765, 464, 4, 734, 5, 1805, 1745, 22, 1149, 122, 1481, 3, 20, 271, 121, 87, 25, 2350, 310, 720, 5, 737]

Most freq. words:
['the', 'to', 'a', 'in', 'and', 'of', 'for', 'I', 'is', 'on', 'was', 'with', 'he', 'have', 'his', 'that', 'at', 'The', 'be', 'has', 'but', 'said', 'will', 'it', 'as', 'from', 'not', 'by', 'after', 'had', 'we', 'are', 'been', 'who', 'their', 'an', 'But', 'out', '-', 'first', 'they', 'game', 'England', 'this', 'against', 'over', 'when', 'would', 'He', 'win']

Data preprocessing


In [4]:
# reserve extra vocabulary slots for the special tokens;
# <eos> already comes from data_utils, <pad> and <go> are added here
vocab_size = len(vocab) + 4
word2idx['<pad>'] = vocab_size - 2
idx2word[vocab_size - 2] = '<pad>'
word2idx['<go>'] = vocab_size - 1
idx2word[vocab_size - 1] = '<go>'

# data padding
def padding(x, y):

    # labels: <go> + headline + <eos>, padded with <pad> to a fixed length of 10
    labels = []
    for i in range(len(y)):
        labels.append([word2idx['<go>']] + y[i] + [word2idx['<eos>']] + (8 - len(y[i])) * [word2idx['<pad>']])

    # inputs: each article is cut into 100-token windows with a stride of 50;
    # short windows are left-padded, and every window keeps the index of its article
    inputs = []
    for i in range(len(x)):
        for j in range(0, len(x[i]) - 50, 50):
            part = x[i][j:j+100]
            part = (100 - len(part)) * [word2idx['<pad>']] + part
            inputs.append((part, i))
    return inputs, labels

# data splitting
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1)

del X
del Y

X_train, Y_train = padding(X_train, Y_train)
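
To make the windowing explicit, here is a small illustration (the toy sequences are made up purely to show the behaviour): a 120-token article yields two overlapping 100-token windows, and a 3-word headline is wrapped in <go>/<eos> and padded to length 10.

# toy illustration of padding(): a 120-token article yields two 100-token windows
toy_x = [range(120)]                     # one 'article' of 120 token ids
toy_y = [[word2idx['the']] * 3]          # one 3-word 'headline'
toy_inputs, toy_labels = padding(toy_x, toy_y)
print len(toy_inputs)                    # 2 windows: tokens 0-99 and 50-119 (left-padded)
print len(toy_labels[0])                 # 10 = <go> + 3 words + <eos> + 5 x <pad>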

Building a model


In [5]:
input_seq_len = 100
output_seq_len = 10

# placeholders for sequences (one per time step)
encoder_inputs = []
for i in range(input_seq_len):
    encoder_inputs.append(tf.placeholder(tf.int32, shape = [None], name = 'encoder{}'.format(i)))

decoder_inputs = []
for i in range(output_seq_len):
    decoder_inputs.append(tf.placeholder(tf.int32, shape = [None], name = 'decoder{}'.format(i)))

# targets are the decoder inputs shifted by one step (the leading <go> is dropped)
targets = [decoder_inputs[i+1] for i in range(len(decoder_inputs)-1)]

# output projection: the decoder works with output_dim-sized vectors and is
# mapped to the full vocabulary only when logits are needed
output_dim = 512
w_t = tf.get_variable("proj_w", [vocab_size, output_dim], dtype=tf.float32)
w = tf.transpose(w_t)
b = tf.get_variable("proj_b", [vocab_size], dtype=tf.float32)
output_projection = (w, b)

outputs, states = tf.nn.seq2seq.embedding_attention_seq2seq(
                                                encoder_inputs, 
                                                decoder_inputs, 
                                                tf.nn.rnn_cell.BasicLSTMCell(output_dim),
                                                num_encoder_symbols = vocab_size,
                                                num_decoder_symbols = vocab_size,
                                                embedding_size = 100,
                                                feed_previous= False,
                                                output_projection = output_projection,
                                                dtype = tf.float32)
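
One detail worth noting: because output_projection is passed in, each element of outputs is the raw LSTM output of size output_dim (512), not a vocab-sized logit vector; the projection to the vocabulary is applied separately (see outputs_proj below). A quick shape check:

# with an output projection, the decoder returns cell outputs, not vocab logits
print outputs[0].get_shape()    # expected: (?, 512)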

Definition of the loss function


In [6]:
def sampled_loss(labels, logits):

    # approximate the full softmax over vocab_size classes with 256 sampled negatives
    return tf.nn.sampled_softmax_loss(
                weights=w_t,
                biases=b,
                labels=tf.reshape(labels, [-1, 1]),
                inputs=logits,
                num_sampled=256,
                num_classes=vocab_size)
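
Sampled softmax is a training-time approximation: instead of normalising over the whole vocabulary, it contrasts the correct word with 256 sampled negatives. For comparison only (this is not used anywhere in the notebook), a full-softmax counterpart would project the cell outputs to vocabulary size and apply a regular cross-entropy, which is essentially what outputs_proj does at inference time further below:

# hypothetical full-softmax counterpart of sampled_loss, for comparison only
def full_softmax_loss(labels, logits):
    full_logits = tf.matmul(logits, w) + b   # project cell outputs to vocab size
    return tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                          logits=full_logits)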

Some helper functions


In [7]:
# helper function for feeding data into placeholders
def feed_dict(x, y, batch_size = 64):

    # sample a random batch of (window, article index) pairs
    idxes = np.random.choice(len(x), size = batch_size)

    feed = {}
    # the feed is time-major: one array of word ids per encoder time step
    for i in range(input_seq_len):
        feed[encoder_inputs[i].name] = np.array([x[j][0][i] for j in idxes])

    # the matching headline is looked up through the window's article index
    for i in range(output_seq_len):
        feed[decoder_inputs[i].name] = np.array([y[x[j][1]][i] for j in idxes])

    return feed
    
# decoding output seq - headline
def decode_output_seq(output_seq):
    
    words = []
    
    # greedy decoding: take the most probable word at each time step
    for t in range(output_seq_len):
        smax = softmax(output_seq[t])
        idx = np.argmax(smax)
        words.append(idx2word[idx])
        
    return words

# decoding label
def decode_label(label):
    
    words = []
    for idx in label:
        words.append(idx2word[idx])
    return words

# numerically stable softmax (shifts by the max before exponentiating)
def softmax(x):
    n = np.max(x)
    e_x = np.exp(x - n)
    return e_x / e_x.sum()
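
As a quick sanity check of the helpers (the toy vector is arbitrary):

# softmax on a small toy vector; the probabilities sum to 1
print softmax(np.array([1.0, 2.0, 3.0]))   # approx. [ 0.09  0.245  0.665 ]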

Definitions of params and ops


In [8]:
# parameters
steps = 10
learning_rate = 0.5
batch_size = 64

# the last target position has no shifted decoder input, so it is filled with <pad>
targets.append(np.full(shape = [batch_size], fill_value = word2idx['<pad>'], dtype = np.int32))

# ops for projecting outputs
outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]

# calculate the loss for a whole seq: sum the per-step sampled losses, then average over the batch
def calculate_loss():
    loss = sampled_loss(targets[0], outputs[0])
    
    for i in range(1, output_seq_len):
        loss += sampled_loss(targets[i], outputs[i])
        
    return tf.reduce_mean(loss)

# loss and optimizer ops
loss = calculate_loss()
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# initialization op
init = tf.global_variables_initializer()

# generating headlines for a batch of articles
def forward_step(sess, feed):
    output_sequences = sess.run(outputs_proj, feed_dict = feed)
    return output_sequences

# training step
def backward_step(sess, feed):
    sess.run(optimizer, feed_dict = feed)

Training the model


In [9]:
with tf.Session() as sess:
    
    sess.run(init)
    
    print '---THIS IS WHAT MODEL GENERATES BEFORE TRAINING---\n'
    feed = feed_dict(X_train, Y_train, 1)
    output_sequences = forward_step(sess, feed)
    
    # decoding generated headline     
    output_seq = np.reshape(output_sequences, [output_seq_len, vocab_size])
    words = decode_output_seq(output_seq)
    print 'Predicted headline:'
    for word in words:
        print word,
    print '\n'
    
    # decoding corresponding label
    labels = sess.run(targets[:-1], feed_dict = feed)
    label = [labels[i][0] for i in range(output_seq_len-1)] + [word2idx['<pad>']]
    words = decode_label(label)
    print 'Actual headline:'
    for word in words:
        print word,
    print '\n\n---------TRAINING---------\n\n'
    
    # training
    t = time.time()
    for step in range(steps):
        feed = feed_dict(X_train, Y_train)
            
        backward_step(sess, feed)
        
        # log the loss at the first and last step
        if step == 0 or step == steps - 1:
            loss_value = sess.run(loss, feed_dict = feed)
            print 'step: {}, loss: {}'.format(step, loss_value)
            
    print 'Training time for {} steps:{}s'.format(steps, time.time() - t)


---THIS IS WHAT MODEL GENERATES BEFORE TRAINING---

Predicted headline:
continuity continuity continuity continuity continuity continuity continuity Bovina Bovina Bovina 

Actual headline:
Trial date is set for Balco case <eos> <pad> <pad> 

---------TRAINING---------


step: 0, loss: 55.201877594
step: 9, loss: 38.7748413086
Training time for 10 steps:15.2158019543s

I will train the model for more steps and test it later.
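
Since training will continue in a later session, the loop above could be extended with a tf.train.Saver so the weights survive between runs; a minimal sketch (the checkpoint file name is arbitrary):

# minimal checkpointing sketch for resuming training later
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    # ... run the training loop from the cell above ...
    saver.save(sess, 'headline_model.ckpt')

# in a later session, restore the weights instead of re-initializing:
# with tf.Session() as sess:
#     saver.restore(sess, 'headline_model.ckpt')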