In [1]:
import numpy as np
from distutils.version import LooseVersion
import warnings
import tensorflow as tf
import gensim
# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))
# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
In [2]:
# Load word2vec model
w2v = gensim.models.KeyedVectors.load('data/w2v-773752559-1000000-300-5-5-OpenSubtitles2016.bin')
In [3]:
def get_inputs(output_dim=300):
    """
    Create TF placeholders for input, targets, learning_rate, keep_prob and input_sequence_length.
    :return: Tuple (input_, targets, learning_rate, keep_prob, input_sequence_length)
    """
    input_ = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.float32, [None, output_dim])
    learning_rate = tf.placeholder(tf.float32)
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    input_sequence_length = tf.placeholder(tf.int32, [None], name="input_sequence_length")
    return (input_, targets, learning_rate, keep_prob, input_sequence_length)
In [4]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob, inputs, num_classes, input_sequence_length):
    ''' Build a stacked LSTM and run it over the character inputs.
    Arguments
    ---------
    lstm_size: Size of the hidden layers in the LSTM cells
    num_layers: Number of LSTM layers
    batch_size: Batch size
    keep_prob: Scalar tensor (tf.placeholder) for the dropout keep probability
    inputs: Int tensor of character ids, shape [batch_size, max_word_length]
    num_classes: Vocabulary size (number of distinct characters)
    input_sequence_length: Int tensor of per-example sequence lengths
    '''
    # One-hot encode the character ids
    x_one_hot = tf.one_hot(inputs, num_classes)  # num_classes = len(vocab)

    def build_cell(rnn_size):
        return tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.contrib.layers.xavier_initializer())

    # Construct a stacked tf.contrib.rnn.LSTMCell...
    stacked_cell = tf.contrib.rnn.MultiRNNCell([build_cell(lstm_size) for _ in range(num_layers)])
    # ...wrapped in a tf.contrib.rnn.DropoutWrapper
    cell = tf.contrib.rnn.DropoutWrapper(stacked_cell, output_keep_prob=keep_prob)
    # Pass cell and one-hot encoded input to tf.nn.dynamic_rnn()
    rnn_output, rnn_state = tf.nn.dynamic_rnn(cell, x_one_hot, sequence_length=input_sequence_length, dtype=tf.float32)
    # Initial state
    initial_state = tf.identity(stacked_cell.zero_state(batch_size, tf.float32), name="initial_state")
    return rnn_output, rnn_state, initial_state
We only care about the final RNN output for each word, so we grab it with outputs[:, -1].
In [5]:
def build_output(cell, keep_prob, hidden_dim=1024, output_dim=300):
    # cell holds the dynamic_rnn outputs, shape [batch, time, lstm_size]; keep only the last time step
    input_ = cell[:, -1]
    dense = tf.contrib.layers.fully_connected(inputs=input_, num_outputs=hidden_dim, activation_fn=tf.nn.tanh)
    # dense = tf.nn.dropout(dense, keep_prob)
    dense = tf.layers.batch_normalization(dense)
    return tf.contrib.layers.fully_connected(dense, num_outputs=output_dim, activation_fn=None)
    # return tf.contrib.layers.fully_connected(inputs=cell[:, -1], num_outputs=output_dim, activation_fn=tf.nn.relu)
In [6]:
def get_loss(pred, Y):
    # Normalize each vector along axis 1 so the dot product is the per-example cosine similarity
    pred = tf.nn.l2_normalize(pred, 1)
    Y = tf.nn.l2_normalize(Y, 1)
    return tf.reduce_mean(1 - tf.reduce_sum(tf.multiply(pred, Y), axis=(1,), keep_dims=True))
    # return tf.losses.cosine_distance(pred, Y, dim=1)
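To see concretely what this loss measures, here is a small NumPy sketch (a check outside the TF graph, on toy vectors) of the same mean cosine distance: an identical pair contributes 0 and an orthogonal pair contributes 1.
In [ ]:
# Sketch: the mean cosine distance computed with plain NumPy on toy data,
# mirroring get_loss (per-row normalization, then 1 - dot product, then mean).
a = np.array([[1.0, 0.0], [0.0, 1.0]])
b = np.array([[1.0, 0.0], [1.0, 0.0]])
a_n = a / np.linalg.norm(a, axis=1, keepdims=True)
b_n = b / np.linalg.norm(b, axis=1, keepdims=True)
print(np.mean(1 - np.sum(a_n * b_n, axis=1)))  # 0.5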
In [7]:
# Build the character vocabulary from all words in the word2vec model
vocab = sorted(set(" ".join(w2v.wv.index2word)))
# Reserve 0 for padding; character ids start at 1
vocab_to_int = {c: i for i, c in enumerate(vocab, 1)}
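As a quick sanity check, a word can be mapped to its character-id sequence with vocab_to_int; this is only a sketch, since the exact ids depend on which characters occur in this particular word2vec vocabulary.
In [ ]:
# Sketch: encode one vocabulary word as character ids (0 is reserved for padding).
sample_word = w2v.wv.index2word[0]
print(sample_word, [vocab_to_int.get(c, 0) for c in sample_word])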
In [8]:
batch_size=256
lstm_size=1024
num_layers=3
keep_probability=0.8
num_classes=len(vocab)
output_dim=300
# learning_rate=0.001
learning_rate=0.0005
save_dir = './model/seq2vec'
# Create the graph object
graph = tf.Graph()
with graph.as_default():
    (input_, targets, lr, keep_prob, input_sequence_length) = get_inputs()
    with tf.variable_scope('LSTM'):
        rnn_output, rnn_state, initial_state = build_lstm(lstm_size, num_layers, batch_size, keep_prob, input_, num_classes, input_sequence_length)
    with tf.variable_scope('OUTPUT'):
        output = build_output(rnn_output, keep_prob, output_dim=output_dim)
    with tf.variable_scope('LOSS'):
        loss = get_loss(output, targets)
    with tf.variable_scope('OPTIMIZER'):
        # Optimizer: use the learning-rate placeholder so the value fed at train time takes effect
        optimizer = tf.train.AdamOptimizer(lr)
        gradients = optimizer.compute_gradients(loss)
        # Clip gradients
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
with tf.Session(graph=graph) as sess:
    #writer = tf.summary.FileWriter("log", sess.graph)
    sess.run(tf.global_variables_initializer())
    # TensorBoard summary scalar for loss
    tf.summary.scalar("loss", loss)
    merged_summary = tf.summary.merge_all()
    # TensorBoard writer graph
    writer = tf.summary.FileWriter("writer/1")
    writer.add_graph(sess.graph)
    # Save session
    saver = tf.train.Saver()
    saver.save(sess, save_dir, global_step=0)
# # Optimizer for training, using gradient clipping to control exploding gradients
# tvars = tf.trainable_variables()
# grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
# train_op = tf.train.AdamOptimizer(learning_rate)
# optimizer = train_op.apply_gradients(zip(grads, tvars))
In [9]:
# def word2seq(word):
# return np.array([vocab_to_int.get(c,0) for c in word])
# def get_train_subset(model=w2v, seed_words=500, topn=7):
# top_words=model.wv.index2word[120:120+seed_words]
# top_words=np.append(np.array(top_words),np.array([np.array(model.most_similar_cosmul(w, topn=topn))[:,0] for w in top_words]))
# top_words=top_words.flatten()
# top_words=set(top_words)
# return top_words
# input_list=list(get_train_subset())
In [10]:
def get_padded_int_batch(input_batch, vocab_to_int=vocab_to_int):
    # Left-pad every word with 0 (the padding id) to the length of the longest word in the batch
    max_len = max([len(word) for word in input_batch])
    int_batch = [[0] * (max_len - len(l)) + [vocab_to_int[w] for w in l] for l in input_batch]
    return int_batch

def get_batch(input_list=w2v.wv.index2word, batch_size=batch_size, vocab=vocab, vocab_to_int=vocab_to_int, model=w2v):
    """
    Batch generator.
    Input: input_list - list of words
    Yields tuples:
        (pad_input_batch, pad_input_lengths, output_batch)
    """
    for batch_i in range(0, len(input_list)//batch_size):
        start_i = batch_i * batch_size
        # Slice the right amount for the batch
        input_batch = input_list[start_i:start_i + batch_size]
        # Pad
        pad_input_batch = np.array(get_padded_int_batch(input_batch, vocab_to_int))
        # Need the lengths for the _lengths parameters
        pad_input_lengths = []
        for line in pad_input_batch:
            pad_input_lengths.append(len(line))
        # Output batch: the target word2vec vector for each word
        output_batch = np.array([model.wv.word_vec(w) for w in input_batch])
        yield (pad_input_batch, pad_input_lengths, output_batch)

# for (batch_i, (pad_input_batch, pad_input_lengths, output)) in enumerate(get_batch(w2v.wv.index2word[:1000], batch_size=50)):
#     print(batch_i)
#     pass

train_size = 6000  #00
train_input = w2v.wv.index2word[:train_size]
valid_input = w2v.wv.index2word[train_size:train_size + batch_size]
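A quick shape check of the generator (a sketch assuming the definitions above): one batch should give left-padded character ids of shape (batch_size, longest word in the batch), one length per example, and one 300-dimensional target vector per word.
In [ ]:
# Sketch: inspect the shapes produced by a single batch.
pad_input_batch, pad_input_lengths, output_batch = next(get_batch(train_input))
print(pad_input_batch.shape)   # (256, max word length in this batch)
print(len(pad_input_lengths))  # 256
print(output_batch.shape)      # (256, 300)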
In [12]:
#%%time
#tf.reset_default_graph()
num_epochs=100
display_step=1
with tf.Session(graph=graph) as sess:
    # sess.run(tf.global_variables_initializer())
    # loader = tf.train.import_meta_graph(save_dir + '.meta')
    saver.restore(sess, saver.last_checkpoints[-1])
    for epoch_i in range(1, num_epochs):
        for batch_i, (pad_input_batch, pad_input_lengths, out_vec) in enumerate(get_batch(train_input)):
            _, _, l = sess.run([initial_state, train_op, loss], {
                input_: pad_input_batch,
                targets: out_vec,
                lr: learning_rate,
                keep_prob: keep_probability,
                input_sequence_length: pad_input_lengths,
            })
        if (epoch_i % display_step == 0):
            (pad_input_batch, pad_input_lengths, out_vec) = next(get_batch(valid_input))
            _, valid_loss, ms = sess.run([initial_state, loss, merged_summary], {
                input_: pad_input_batch,
                targets: out_vec,
                keep_prob: 1.0,
                input_sequence_length: pad_input_lengths,
            })
            print("Epoch: {:3} | Loss: {:2.4}\t validation loss: {:2.4}".format(epoch_i, l, valid_loss))
            writer.add_summary(ms, epoch_i)
            # Save model
            saver.save(sess, save_dir, global_step=epoch_i)
In [19]:
saver.last_checkpoints[-1]
writer.add_graph(sess.graph)
In [20]:
%%time
tf.reset_default_graph()
learning_rate=0.00001
num_epochs=100
display_step=1
# tensorboard writer graph
writer=tf.summary.FileWriter("writer/3")
with tf.Session(graph=graph) as sess:
    # sess.run(tf.global_variables_initializer())
    # loader = tf.train.import_meta_graph(save_dir + '.meta')
    saver.restore(sess, saver.last_checkpoints[-1])
    for epoch_i in range(num_epochs):
        for batch_i, (pad_input_batch, pad_input_lengths, out_vec) in enumerate(get_batch(train_input)):
            _, _, l = sess.run([initial_state, train_op, loss], {
                input_: pad_input_batch,
                targets: out_vec,
                lr: learning_rate,
                keep_prob: keep_probability,
                input_sequence_length: pad_input_lengths,
            })
        if (epoch_i % display_step == 0):
            (pad_input_batch, pad_input_lengths, out_vec) = next(get_batch(valid_input))
            _, valid_loss, ms = sess.run([initial_state, loss, merged_summary], {
                input_: pad_input_batch,
                targets: out_vec,
                keep_prob: 1.0,
                input_sequence_length: pad_input_lengths,
            })
            print("Epoch: {:3} | Loss: {:2.4}\t validation loss: {:2.4}".format(epoch_i, l, valid_loss))
            writer.add_summary(ms, epoch_i)
            # Save model
            saver.save(sess, save_dir)
In [15]:
%%time
tf.reset_default_graph()
# Load model and use a global session
sess = tf.Session(graph=graph)
#loader = tf.train.import_meta_graph(save_dir + '.meta')
saver.restore(sess, saver.last_checkpoints[-1])
In [16]:
def get_word2vec(word, sess=sess):
    # Encode and pad the word, run the graph, return the predicted 300-d vector
    pad_input_batch = get_padded_int_batch([word])
    #print(len(pad_input_batch[0]))
    _, outputs = sess.run([initial_state, output], {
        input_: pad_input_batch,
        keep_prob: 1.0,
        input_sequence_length: [len(pad_input_batch[0])],
    })
    return outputs[0]
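Because the placeholders accept a whole batch, the same graph can embed several words in one run. get_words2vec below is a hypothetical helper sketched on top of get_padded_int_batch, reusing the padded length for every example just as get_word2vec does.
In [ ]:
# Sketch (hypothetical helper): embed a list of words in a single sess.run call.
def get_words2vec(words, sess=sess):
    batch = get_padded_int_batch(words)
    _, outputs = sess.run([initial_state, output], {
        input_: batch,
        keep_prob: 1.0,
        input_sequence_length: [len(batch[0])] * len(batch),
    })
    return outputs
# e.g. get_words2vec(["kapitan", "ula"])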
In [17]:
#word=train_input[200]
word="kapitan"
wordvec=get_word2vec(word)
print(word)
w2v.wv.similar_by_vector(wordvec)
Out[17]:
In [18]:
get_word2vec("ula")
w2v.wv.index2word[990:1010]
Out[18]: