In [ ]:
import tensorflow as tf
import numpy as np
import os
In [ ]:
corpus_path = os.path.normpath("../Dataset/arvix_abstracts.txt")
with open(corpus_path, "r") as f:
    corpus = f.read().split("\n")
In [ ]:
class WordCodec:
    """Bidirectional mapping between words and integer indices."""
    def __init__(self, word_flow):
        self._index_to_word = []
        self._word_to_index = {}
        for word in word_flow:
            assert isinstance(word, str), "Got type {} instead of str".format(type(word))
            if word not in self._word_to_index:
                self._word_to_index[word] = len(self._index_to_word)
                self._index_to_word.append(word)

    def __getitem__(self, key):
        # int key -> word, str key -> index
        if isinstance(key, int):
            return self._index_to_word[key]
        elif isinstance(key, str):
            return self._word_to_index[key]
        else:
            raise TypeError("key must be either int or str.")

    @property
    def vocab_size(self):
        return len(self._index_to_word)
In [ ]:
def word_flow():
    """Yield every whitespace-separated token in the corpus."""
    for paragraph in corpus:
        for word in paragraph.split(" "):
            yield word

one_hot_codec = WordCodec(word_flow())
print("Total number of words in the vocabulary: {}".format(one_hot_codec.vocab_size))
In [ ]:
class CBOW:
    """Continuous bag-of-words model: average the context embeddings, then project to vocabulary logits."""
    def __init__(self, vocab_size, context_length, embedding_dim):
        with tf.variable_scope("CBOW"):
            # Indices of the 2*context_length words surrounding the target word
            self._context_input = tf.placeholder(shape=[2 * context_length], dtype=tf.int32)
            V = tf.get_variable(shape=[vocab_size, embedding_dim], dtype=tf.float32, name="V")  # Embedding matrix
            self._embedding = tf.nn.embedding_lookup(V, self._context_input)
            # Hidden layer: mean of the context embeddings, shape [1, embedding_dim]
            hidden = tf.reduce_mean(self._embedding, axis=0, keep_dims=True)
            U = tf.get_variable(shape=[embedding_dim, vocab_size], dtype=tf.float32, name="U")  # Output projection
            self._output = tf.matmul(hidden, U)

    @property
    def input(self):
        return self._context_input

    @property
    def output(self):
        return self._output

    @property
    def embedding(self):
        return self._embedding
In [ ]:
context_length = 2
embedding_dim = 1000
cbow_model = CBOW(one_hot_codec.vocab_size, context_length, embedding_dim)
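A quick static-shape check of the graph wiring (optional): the looked-up context embeddings should have shape [2*context_length, embedding_dim] and the logits [1, vocab_size]. The cell below only inspects static shapes; nothing runs in a session.
In [ ]:
# Inspect static tensor shapes of the CBOW graph defined above.
print("context embedding shape:", cbow_model.embedding.get_shape().as_list())  # expected [4, 1000]
print("output logits shape:", cbow_model.output.get_shape().as_list())         # expected [1, vocab_size]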
In [ ]:
metadata_path = os.path.normpath("./graphs/word_codec")
# Make sure the log directory exists before writing the TensorBoard metadata file
if not os.path.isdir(os.path.dirname(metadata_path)):
    os.makedirs(os.path.dirname(metadata_path))
with open(metadata_path, "w") as f:
    f.write("Index\tWord\n")
    for i in range(one_hot_codec.vocab_size):
        f.write("{}\t{}\n".format(i, one_hot_codec[i]))
In [ ]:
def training_sample_generator(corpus, codec, context_length):
    """Yield (context word indices, target word index) for every position with a full context window."""
    for paragraph in corpus:
        paragraph = np.array([codec[word] for word in paragraph.split(" ")])
        for i in range(context_length, np.shape(paragraph)[0] - context_length):
            yield np.concatenate([paragraph[i - context_length:i], paragraph[i + 1:i + context_length + 1]], axis=0), paragraph[i:i + 1]

target_output = tf.placeholder(shape=[1], dtype=tf.int32, name="target_output")
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cbow_model.output, labels=target_output))
lr = tf.Variable(1e-4, trainable=False)
global_step = tf.Variable(0, trainable=False)
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)
with tf.name_scope("summary") as scope:
    summary_op = tf.summary.scalar(name="loss", tensor=loss)

num_epoch = 20
graph_path = "./graphs"
model_checkpoint_path = os.path.join("./graphs", "CBOW")
save_every = 1000
with tf.Session() as sess:
    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(logdir=graph_path, graph=sess.graph)
    # Checkpoints are saved with the prefix model_checkpoint_path, so the checkpoint
    # state file lives in the directory graph_path, not under the prefix itself.
    ckpt = tf.train.get_checkpoint_state(graph_path)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Continue training")
    else:
        sess.run(tf.global_variables_initializer())
    for epoch in range(num_epoch):
        # A generator is exhausted after one pass, so re-create it at the start of every epoch.
        training_set = training_sample_generator(corpus, one_hot_codec, context_length)
        for x, y in training_set:
            feed_dict = {cbow_model.input: x, target_output: y}
            _, summary = sess.run([train_op, summary_op], feed_dict)
            n_iter = global_step.eval(sess)
            writer.add_summary(summary=summary, global_step=n_iter)
            if (n_iter % save_every) == 0:
                saver.save(sess=sess, save_path=model_checkpoint_path, global_step=n_iter)
    saver.save(sess=sess, save_path=model_checkpoint_path, global_step=global_step.eval(sess))
print("Training Complete")
In [ ]: