In [1]:
import tensorflow as tf
import data_utils
from sklearn.model_selection import train_test_split
import numpy as np
import time
In [2]:
X, Y, word2idx, idx2word, vocab = data_utils.read_data_set('data.pkl')
In [3]:
print 'First article headline - encoded:\n', Y[0]
print [idx2word[idx] for idx in Y[0]]
print '\nFirst article text - encoded:\n', X[0]
print '\nMost freq. words:\n', vocab[:50]
In [4]:
vocab_size = len(vocab) + 4
word2idx['<pad>'] = vocab_size - 2
idx2word[vocab_size - 2] = '<pad>'
word2idx['<go>'] = vocab_size - 1
idx2word[vocab_size - 1] = '<go>'
# data padding
def padding(x, y):
    labels = []
    for i in range(len(y)):
        # wrap each headline in <go> ... <eos> and pad it to the output length (10 tokens)
        labels.append([word2idx['<go>']] + y[i] + [word2idx['<eos>']] + (8 - len(y[i])) * [word2idx['<pad>']])
    inputs = []
    for i in range(len(x)):
        # slide a 100-word window over the article text in steps of 50 words
        for j in range(0, len(x[i]) - 50, 50):
            part = x[i][j:j+100]
            # left-pad short windows so every encoder input is exactly 100 tokens
            part = (100 - len(part)) * [word2idx['<pad>']] + part
            inputs.append((part, i))
    return inputs, labels
# data splitting
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1)
del X
del Y
X_train, Y_train = padding(X_train, Y_train)
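As a quick sanity check on the padding scheme (illustrative only, assuming headlines have at most 8 words as the padding above presumes): every padded label is exactly output_seq_len = 10 tokens, and every encoder window is exactly 100 tokens.

# Illustrative check, not part of the original pipeline:
assert all(len(label) == 10 for label in Y_train[:100])
assert all(len(window) == 100 for window, _ in X_train[:100])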
In [5]:
input_seq_len = 100
output_seq_len = 10
# placeholders for sequences
encoder_inputs = []
for i in range(input_seq_len):
    encoder_inputs.append(tf.placeholder(tf.int32, shape = [None], name = 'encoder{}'.format(i)))
decoder_inputs = []
for i in range(output_seq_len):
    decoder_inputs.append(tf.placeholder(tf.int32, shape = [None], name = 'decoder{}'.format(i)))
targets = [decoder_inputs[i+1] for i in range(len(decoder_inputs)-1)]
# output projection - keep decoder outputs at 512 dims and project to the vocabulary outside the cell (enables sampled softmax)
output_dim = 512
w_t = tf.get_variable("proj_w", [vocab_size, output_dim], dtype=tf.float32)
w = tf.transpose(w_t)
b = tf.get_variable("proj_b", [vocab_size], dtype=tf.float32)
output_projection = (w, b)
outputs, states = tf.nn.seq2seq.embedding_attention_seq2seq(
    encoder_inputs,
    decoder_inputs,
    tf.nn.rnn_cell.BasicLSTMCell(output_dim),
    num_encoder_symbols = vocab_size,
    num_decoder_symbols = vocab_size,
    embedding_size = 100,
    feed_previous = False,
    output_projection = output_projection,
    dtype = tf.float32)
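The graph above is built with feed_previous=False, i.e. the decoder always sees the ground-truth previous token (teacher forcing). To generate headlines for unseen articles one would normally build a second, weight-sharing copy of the model with feed_previous=True, so the decoder feeds its own predictions back in. A sketch of that decoding graph (not used in the training below; assumes the same old-style seq2seq API as above):

# Sketch: decoding graph that reuses the trained weights (feed_previous=True)
with tf.variable_scope(tf.get_variable_scope(), reuse = True):
    outputs_infer, states_infer = tf.nn.seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs,
        tf.nn.rnn_cell.BasicLSTMCell(output_dim),
        num_encoder_symbols = vocab_size,
        num_decoder_symbols = vocab_size,
        embedding_size = 100,
        feed_previous = True,
        output_projection = output_projection,
        dtype = tf.float32)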
In [6]:
def sampled_loss(labels, logits):
    return tf.nn.sampled_softmax_loss(
        weights=w_t,
        biases=b,
        labels=tf.reshape(labels, [-1, 1]),
        inputs=logits,
        num_sampled=256,
        num_classes=vocab_size)
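Sampled softmax approximates the full cross-entropy by contrasting the true label against 256 randomly sampled vocabulary words, which is what makes training affordable with a large vocab_size. At evaluation time the exact equivalent would project the 512-dimensional decoder outputs to the full vocabulary first; a sketch, assuming the projection variables w and b defined above:

def full_loss(labels, logits):
    # exact cross-entropy over the whole vocabulary (evaluation only;
    # too slow to use at every training step with a large vocabulary)
    projected = tf.matmul(logits, w) + b
    return tf.nn.sparse_softmax_cross_entropy_with_logits(labels = labels, logits = projected)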
In [7]:
# helper function for feeding data into placeholders
def feed_dict(x, y, batch_size = 64):
    idxes = np.random.choice([i for i in range(len(x))], size = batch_size)
    feed = {}
    for i in range(input_seq_len):
        feed[encoder_inputs[i].name] = np.array([x[j][0][i] for j in idxes])
    for i in range(output_seq_len):
        feed[decoder_inputs[i].name] = np.array([y[x[j][1]][i] for j in idxes])
    return feed
# decoding output seq - headline
def decode_output_seq(output_seq):
    words = []
    for t in range(output_seq_len):
        smax = softmax(output_seq[t])
        idx = np.argmax(smax)
        words.append(idx2word[idx])
    return words
# decoding label
def decode_label(label):
    words = []
    for idx in label:
        words.append(idx2word[idx])
    return words
# simple softmax function
def softmax(x):
    n = np.max(x)
    e_x = np.exp(x - n)
    return e_x / e_x.sum()
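A quick check of the softmax helper on a toy vector (illustrative only): the probabilities sum to 1 and the largest logit gets the highest probability, which is the index decode_output_seq then picks with argmax.

probs = softmax(np.array([1.0, 2.0, 3.0]))
print probs, probs.sum(), np.argmax(probs)   # e.g. [ 0.09  0.24  0.67] 1.0 2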
In [8]:
# parameters
steps = 10
learning_rate = 0.5
batch_size = 64
# adding one more target
targets.append(np.full(shape = [batch_size], fill_value = word2idx['<pad>'], dtype = np.int32))
# ops for projecting outputs
outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]
# calculate the loss for a whole seq
def calculate_loss():
    loss = sampled_loss(targets[0], outputs[0])
    for i in range(1, output_seq_len):
        loss += sampled_loss(targets[i], outputs[i])
    return tf.reduce_mean(loss)
# loss and optimizer ops
loss = calculate_loss()
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
# initialization op
init = tf.global_variables_initializer()
# generating headlines for a batch of articles
def forward_step(sess, feed):
    output_sequences = sess.run(outputs_proj, feed_dict = feed)
    return output_sequences
# training step
def backward_step(sess, feed):
    sess.run(optimizer, feed_dict = feed)
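Plain SGD with a learning rate of 0.5 can be unstable for LSTMs; a common alternative (not used in this notebook) is to clip gradients by their global norm before applying them. A sketch:

# Optional variant (not wired into the training loop below): clip gradients
# by global norm before applying them with the same SGD learning rate.
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimizer_clipped = tf.train.GradientDescentOptimizer(learning_rate).apply_gradients(
    zip(clipped_gradients, params))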
In [9]:
with tf.Session() as sess:
    sess.run(init)
    print '---THIS IS WHAT THE MODEL GENERATES BEFORE TRAINING---\n'
    feed = feed_dict(X_train, Y_train, 1)
    output_sequences = forward_step(sess, feed)
    # decoding generated headline
    output_seq = np.reshape(output_sequences, [output_seq_len, vocab_size])
    words = decode_output_seq(output_seq)
    print 'Predicted headline:'
    for word in words:
        print word,
    print '\n'
    # decoding corresponding label
    labels = sess.run(targets[:-1], feed_dict = feed)
    label = [labels[i][0] for i in range(output_seq_len-1)] + [word2idx['<pad>']]
    words = decode_label(label)
    print 'Actual headline:'
    for word in words:
        print word,
    print '\n\n---------TRAINING---------\n\n'
    # training
    t = time.time()
    for step in range(steps):
        feed = feed_dict(X_train, Y_train)
        backward_step(sess, feed)
        if step % (steps-1) == 0 or step == 0:
            loss_value = sess.run(loss, feed_dict = feed)
            print 'step: {}, loss: {}'.format(step, loss_value)
    print 'Training time for {} steps: {}s'.format(steps, time.time() - t)
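Since the session is closed at the end of the with block, the trained weights are discarded. To reuse the model for inference one would normally add a tf.train.Saver and write a checkpoint before the session exits; a sketch (the checkpoint name is chosen here purely for illustration):

# Sketch: persist the trained variables so the model can be restored later.
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(init)
    for step in range(steps):
        backward_step(sess, feed_dict(X_train, Y_train))
    saver.save(sess, 'headline_model.ckpt')   # illustrative checkpoint path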