In [1]:
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
import mxnet as mx
import time
import numpy as np

In [93]:
batch_size = 2048
neg = 5

In [4]:
current = time.time()
it = mx.io.Word2VecIter(file_path="./text8",
                        vocab_path="./word2vec_vocab", # To save the vocabulary
                        batch_size=batch_size,
                        prefetch_buffer=100,
                        negative_samples=neg)
print(time.time() - current)


4.00090694427
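
To see what the iterator feeds the network before building the model, we can pull a single batch and inspect its shapes. The snippet below is a minimal sketch that assumes the iterator exposes the standard mx.io.DataIter interface (provide_data, provide_label, next(), reset()); the shapes in the comments are expectations, not guarantees.

In [ ]:
print(it.provide_data)   # expected: 'data' of shape (batch_size, 1), the center word index
print(it.provide_label)  # expected: 'softmax_label' of shape (batch_size, neg + 1)
batch = it.next()        # a DataBatch whose .data and .label are lists of NDArrays
print(batch.data[0].shape, batch.label[0].shape)
it.reset()               # rewind so training starts from the first batch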

In [102]:
def get_sym_makeloss(vocab_size, dim, batch_size, neg):
    # Each row of labels is [1, 0, ..., 0]: one positive target followed by neg negative samples
    labels = mx.sym.one_hot(mx.sym.zeros(batch_size,), depth=neg + 1)
    center_word = mx.sym.Variable('data')
    target_words = mx.sym.Variable('softmax_label')  # 1 target + neg negative samples
    center_vector = mx.sym.Embedding(data=center_word, input_dim=vocab_size,
                                     output_dim=dim, name='syn0_embedding')
    target_vectors = mx.sym.Embedding(data=target_words, input_dim=vocab_size,
                                      output_dim=dim, name='syn1_embedding')
    # Dot product of the center word vector with each of the neg + 1 target vectors
    pred = mx.sym.batch_dot(target_vectors, center_vector, transpose_b=True)
    sigmoid = mx.sym.sigmoid(mx.sym.flatten(pred))
    # Binary cross-entropy: push sigma up for the positive target and down for the negatives
    loss = -1 * mx.sym.sum(labels * mx.sym.log(sigmoid) + (1 - labels) * mx.sym.log(1 - sigmoid), axis=1)
    loss_layer = mx.sym.MakeLoss(loss)
    return loss_layer
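
As a quick sanity check that the symbol wires up as intended, we can run shape inference on a throwaway instance. This is a sketch under assumptions: the placeholder vocabulary size of 100000 is arbitrary, and the input shapes are what the iterator is expected to provide (check it.provide_data / it.provide_label for the real ones).

In [ ]:
sym_check = get_sym_makeloss(100000, 100, batch_size, neg)  # 100000 is a hypothetical vocab size
arg_shapes, out_shapes, aux_shapes = sym_check.infer_shape(
    data=(batch_size, 1),                 # one center word per sample
    softmax_label=(batch_size, neg + 1))  # one target plus neg negative samples
print(out_shapes)  # expect [(batch_size,)]: one scalar loss per sample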

In [26]:
# Read the list of words from the binary file of null-terminated strings.
# This file is produced by the data iterator during its C++ preprocessing step.
def read_binary(f, bufsize):
    buf = ""
    data = True
    while data:
        data = f.read(bufsize)
        buf += data
        lines = buf.split('\x00')
        buf = lines.pop()
        for line in lines:
            yield line
    yield buf
    
def mean_loss(label, pred):
    # MakeLoss makes the network output the loss itself, so the metric simply averages it
    return np.mean(pred)

In [ ]:
fin = open("./word2vec_vocab", "rb")  # The iterator saves this vocabulary file automatically
words_list = read_binary(fin, 1024 * 1024)
vocab = [word for word in words_list]
word_to_index = {vocab[i]: i for i in range(len(vocab))}
vocab_size = len(vocab)  # needed to size the embedding layers below
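
A quick check (not in the original run) that the vocabulary loaded as expected; text8 typically yields a vocabulary in the tens of thousands of words, though the exact size depends on the iterator's preprocessing.

In [ ]:
print(vocab_size)   # number of distinct words kept by the preprocessing
print(vocab[:10])   # a few sample entries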

In [97]:
vector_dim = 100

In [123]:
sym = get_sym_makeloss(vocab_size, vector_dim, batch_size, neg)
network = mx.mod.Module(sym, context=mx.gpu())
network.bind(data_shapes=it.provide_data, label_shapes=it.provide_label)

In [124]:
# rescale_grad averages the accumulated gradient over the batch
opt = mx.optimizer.Adam(learning_rate=.001, rescale_grad=1.0 / batch_size)

In [125]:
current_time = time.time()
network.fit(it, num_epoch=1, optimizer=opt,
            eval_metric=mx.metric.CustomMetric(mean_loss),
            batch_end_callback=mx.callback.Speedometer(batch_size, 1000),
            initializer=mx.initializer.Uniform(scale=.05))
print(time.time() - current_time)


WARNING:Already bound, ignoring bind()
INFO:Epoch[0] Batch [1000]	Speed: 216630.06 samples/sec	mean_loss=3.182736
INFO:Epoch[0] Batch [2000]	Speed: 218407.69 samples/sec	mean_loss=2.276461
INFO:Epoch[0] Batch [3000]	Speed: 220015.31 samples/sec	mean_loss=2.253708
INFO:Epoch[0] Batch [4000]	Speed: 222719.81 samples/sec	mean_loss=2.172011
INFO:Epoch[0] Batch [5000]	Speed: 221057.09 samples/sec	mean_loss=2.090425
INFO:Epoch[0] Batch [6000]	Speed: 231495.19 samples/sec	mean_loss=2.189624
INFO:Epoch[0] Batch [7000]	Speed: 228004.42 samples/sec	mean_loss=2.076356
INFO:Epoch[0] Batch [8000]	Speed: 227395.95 samples/sec	mean_loss=2.116351
INFO:Epoch[0] Batch [9000]	Speed: 227821.90 samples/sec	mean_loss=2.084706
INFO:Epoch[0] Batch [10000]	Speed: 222348.91 samples/sec	mean_loss=2.013445
INFO:Epoch[0] Batch [11000]	Speed: 218110.94 samples/sec	mean_loss=2.156342
INFO:Epoch[0] Batch [12000]	Speed: 224086.13 samples/sec	mean_loss=2.040256
INFO:Epoch[0] Batch [13000]	Speed: 215975.93 samples/sec	mean_loss=2.081876
INFO:Epoch[0] Batch [14000]	Speed: 223211.34 samples/sec	mean_loss=2.063716
INFO:Epoch[0] Batch [15000]	Speed: 219123.44 samples/sec	mean_loss=1.991266
INFO:Epoch[0] Batch [16000]	Speed: 225015.97 samples/sec	mean_loss=2.140804
INFO:Epoch[0] Batch [17000]	Speed: 217679.88 samples/sec	mean_loss=2.040144
INFO:Epoch[0] Batch [18000]	Speed: 226393.41 samples/sec	mean_loss=2.048723
INFO:Epoch[0] Batch [19000]	Speed: 227344.13 samples/sec	mean_loss=2.030444
INFO:Epoch[0] Batch [20000]	Speed: 221361.97 samples/sec	mean_loss=1.954846
INFO:Epoch[0] Batch [21000]	Speed: 223903.49 samples/sec	mean_loss=2.086621
INFO:Epoch[0] Batch [22000]	Speed: 220753.43 samples/sec	mean_loss=2.017514
INFO:Epoch[0] Batch [23000]	Speed: 224974.23 samples/sec	mean_loss=2.026557
INFO:Epoch[0] Batch [24000]	Speed: 225014.76 samples/sec	mean_loss=2.066373
INFO:Epoch[0] Batch [25000]	Speed: 221482.20 samples/sec	mean_loss=1.987588
INFO:Epoch[0] Batch [26000]	Speed: 233582.96 samples/sec	mean_loss=2.097982
INFO:Epoch[0] Batch [27000]	Speed: 223752.16 samples/sec	mean_loss=2.046453
INFO:Epoch[0] Train-mean_loss=2.007799
INFO:Epoch[0] Time cost=255.937
255.949116945
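
If you want to keep the trained model around for later use, one option (not part of the original run) is to checkpoint the module with the standard Module API, or simply dump the embedding matrix with numpy; the file names below are arbitrary examples.

In [ ]:
network.save_checkpoint("word2vec_sg", 1)  # writes word2vec_sg-symbol.json and word2vec_sg-0001.params
np.save("syn0_embedding_weight.npy",
        network.get_params()[0]["syn0_embedding_weight"].asnumpy())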

In [126]:
from sklearn.preprocessing import normalize

all_vecs = network.get_params()[0]["syn0_embedding_weight"].asnumpy()
all_vecs = normalize(all_vecs, copy=False)  # L2-normalize so dot products are cosine similarities

def find_most_similar(word, vocab, word_to_index):
    if word not in word_to_index:
        print("Sorry, word not found. Please try another one.")
    else:
        i1 = word_to_index[word]
        prod = all_vecs.dot(all_vecs[i1])
        # Skip index 0 of the sorted list (the word itself) and keep the next nine
        i2 = (-prod).argsort()[1:10]
        for i in i2:
            print(vocab[i])

In [128]:
find_most_similar("car", vocab, word_to_index)


cars
driver
motorcycle
racing
truck
taxi
motocross
bike
seater
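
Another common spot check on word2vec embeddings is the analogy test (for example, man : king :: woman : queen). The sketch below reuses all_vecs, vocab, and word_to_index from above; it is an illustration added here rather than part of the original run, and a single epoch on text8 may or may not get the analogy right.

In [ ]:
def analogy(a, b, c, topn=5):
    # Words closest to vec(b) - vec(a) + vec(c), excluding the query words themselves
    query = all_vecs[word_to_index[b]] - all_vecs[word_to_index[a]] + all_vecs[word_to_index[c]]
    query /= np.linalg.norm(query)
    scores = all_vecs.dot(query)
    return [vocab[i] for i in (-scores).argsort() if vocab[i] not in (a, b, c)][:topn]

print(analogy("man", "king", "woman"))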