Word2Vec

Brief

A tutorial for the CBOW (continuous bag-of-words) model in the TensorFlow (API r1.3) framework.

Reference

Lecture notes from Stanford

Import


In [ ]:
import tensorflow as tf
import numpy as np
import os

Load corpus


In [ ]:
corpus_path = os.path.normpath("../Dataset/arvix_abstracts.txt")
with open(corpus_path, "r") as f:
    # One abstract per line
    corpus = f.read().split("\n")

Define class for codec


In [ ]:
class WordCodec:
    def __init__(self, word_flow):
        self._index_to_word = []
        self._word_to_index = {}
        for word in word_flow:
            assert isinstance(word, str), "Got type {} instead of str".format(type(word))
            if word not in self._word_to_index:
                self._word_to_index[word] = len(self._index_to_word)
                self._index_to_word.append(word)

    def __getitem__(self, key):
        if isinstance(key, (int, np.integer)):
            # Integer index -> word
            return self._index_to_word[key]
        elif isinstance(key, str):
            # Word -> integer index
            return self._word_to_index[key]
        else:
            raise TypeError("key must be an integer index or a str.")

    @property
    def vocab_size(self):
        return len(self._index_to_word)

In [ ]:
def word_flow():
    # Yield every whitespace-separated token in the corpus
    for paragraph in corpus:
        for word in paragraph.split(" "):
            yield word
one_hot_codec = WordCodec(word_flow())
print("Total number of words in the vocabulary: {}".format(one_hot_codec.vocab_size))

Define CBOW Model
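
The model averages the embeddings of the 2*context_length surrounding words into a single hidden vector, h = (1/(2*context_length)) * sum_c V[w_c], and scores every vocabulary word with the logits h*U; a softmax over these logits predicts the centre word.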


In [ ]:
class CBOW:
    def __init__(self, vocab_size, context_length, embedding_dim):
        with tf.variable_scope("CBOW"):
            # Indices of the 2*context_length context words surrounding the target word
            self._context_input = tf.placeholder(shape=[2*context_length], dtype=tf.int32, name="context_input")
            # Input-side embedding matrix V: one row per vocabulary word
            V = tf.get_variable(shape=[vocab_size, embedding_dim], dtype=tf.float32, name="V")
            self._embedding = tf.nn.embedding_lookup(V, self._context_input)
            # Hidden layer: average of the context word embeddings, shape [1, embedding_dim]
            hidden = tf.reduce_mean(self._embedding, axis=0, keep_dims=True)
            # Output-side weight matrix U: maps the hidden vector to logits over the vocabulary
            U = tf.get_variable(shape=[embedding_dim, vocab_size], dtype=tf.float32, name="U")
            self._output = tf.matmul(hidden, U)
    
    @property
    def input(self):
        return self._context_input
    
    @property
    def output(self):
        return self._output
    
    @property
    def embedding(self):
        return self._embedding

Create CBOW Model


In [ ]:
context_length = 2
embedding_dim = 1000
cbow_model = CBOW(one_hot_codec.vocab_size, context_length, embedding_dim)

Create Metadata File for Embedding Visualization


In [ ]:
metadata_path = os.path.normpath("./graphs/word_codec")
# Make sure the log directory exists before writing the metadata file
if not os.path.isdir(os.path.dirname(metadata_path)):
    os.makedirs(os.path.dirname(metadata_path))
with open(metadata_path, "w") as f:
    f.write("Index\tWord\n")
    for i in range(one_hot_codec.vocab_size):
        f.write("{}\t{}\n".format(i, one_hot_codec[i]))

Define Loss Function and Start Training


In [ ]:
def training_sample_generator(corpus, codec, context_length):
    # For each target word, yield (indices of the 2*context_length surrounding words, index of the target word)
    for paragraph in corpus:
        paragraph = np.array([codec[word] for word in paragraph.split(" ")])
        for i in range(context_length, np.shape(paragraph)[0]-context_length):
            yield np.concatenate([paragraph[i-context_length:i], paragraph[i+1:i+context_length+1]], axis=0), paragraph[i:i+1]
target_output = tf.placeholder(shape=[1], dtype=tf.int32, name="target_output")
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cbow_model.output, labels=target_output))
lr = tf.Variable(1e-4, trainable=False)
global_step = tf.Variable(0, trainable=False)
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)
with tf.name_scope("summary") as scope:
    summary_op = tf.summary.scalar(name="loss",tensor=loss)
num_epoch = 20
graph_path = "./graphs"
model_checkpoint_path = os.path.join("./graphs", "CBOW")
save_every = 1000
with tf.Session() as sess:
    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(logdir=graph_path, graph=sess.graph)
    ckpt = tf.train.get_checkpoint_state(graph_path)  # the checkpoint state file lives in the log directory
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Continue training")
    else:
        sess.run(tf.global_variables_initializer())
    for epoch in range(num_epoch):
        # Re-create the generator each epoch; a Python generator is exhausted after one pass
        for x, y in training_sample_generator(corpus, one_hot_codec, context_length):
            feed_dict = {cbow_model.input: x, target_output: y}
            _, summary = sess.run([train_op, summary_op], feed_dict)
            n_iter = global_step.eval(sess)
            writer.add_summary(summary=summary, global_step=n_iter)
            if (n_iter % save_every) == 0:
                saver.save(sess=sess, save_path=model_checkpoint_path, global_step=n_iter)
        saver.save(sess=sess, save_path=model_checkpoint_path, global_step=n_iter)
    print("Training Complete")

In [ ]:
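# Sketch: nearest neighbours in the learned embedding space (see the note above).
with tf.Session() as sess:
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(graph_path)
    saver.restore(sess, ckpt.model_checkpoint_path)
    # The embedding matrix was created as get_variable("V") inside variable_scope("CBOW")
    V_value = sess.run(tf.get_default_graph().get_tensor_by_name("CBOW/V:0"))

normalized = V_value / np.linalg.norm(V_value, axis=1, keepdims=True)
query_word = "network"  # assumption: replace with any word that occurs in the corpus
similarity = normalized.dot(normalized[one_hot_codec[query_word]])
for index in np.argsort(-similarity)[:5]:
    print(one_hot_codec[int(index)], similarity[index])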