Compiling cython code


In [ ]:
!python setup.py build_ext --inplace

In [1]:
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

In [46]:
import time
import numpy as np
import mxnet as mx
from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus
from preprocessing import data_iterator_cython
import logging
import sys, random, time, math
from collections import namedtuple
from operator import itemgetter
from sklearn.preprocessing import normalize

In [ ]:
!wget http://mattmahoney.net/dc/text8.zip -O text8.gz && gzip -d text8.gz -f

In [ ]:
corpus = Text8Corpus("text8")
current_time = time.time()
model = Word2Vec(iter=1, sg=1)
model.build_vocab(corpus)
print "Building vocab took %s seconds" % (time.time() - current_time)

In [5]:
batch_data = []
batch_label = []
batch_label_weight = []

In [6]:
current_time = time.time()
job_batch, batch_size = [], 0
for sent_idx, sentence in enumerate(corpus):
    sentence_length = model._raw_word_count([sentence])

    # can we fit this sentence into the existing job batch?
    if batch_size + sentence_length <= model.batch_words:
        # yes => add it to the current job
        job_batch.append(sentence)
        batch_size += sentence_length
    else:
        sents = data_iterator_cython(model, job_batch, model.alpha)
        for sent in sents:
            batch_data.append(sent[0])
            batch_label.append(sent[1:])
        job_batch[:] = []
        batch_size = 0
print time.time() - current_time
print "Data prep took: ", time.time() - current_time


60.5669488907
Data prep took:  60.5673439503

In [7]:
batch_data = mx.nd.array(batch_data)
batch_label = mx.nd.array(batch_label)

In [8]:
target_weight = mx.nd.zeros((batch_data.shape[0], model.negative+1))
target_weight[:,0] = 1

In [9]:
batch_data = mx.nd.expand_dims(batch_data, axis = 1)

In [76]:
batch_size = 512

In [77]:
nd_iter = mx.io.NDArrayIter(data = {"center_word" : batch_data, "target_words": batch_label},
                            label={ "labels":target_weight},
                            batch_size=batch_size, shuffle = True)

In [78]:
neg_dim = model.negative
vocab_size = len(model.wv.vocab)
dim = model.vector_size

In [79]:
def get_sym_makeloss(vocab_size, dim, batch_size):
    labels = mx.sym.Variable('labels') #1 positive and k "0" labels
    center_word = mx.sym.Variable('center_word')
    target_words = mx.sym.Variable('target_words') # 1 target + k negative samples
    center_vector = mx.sym.Embedding(data = center_word, input_dim = vocab_size,
                                  output_dim = dim, name = 'syn0_embedding')
    target_vectors = mx.sym.Embedding(data = target_words, input_dim = vocab_size,
                                   output_dim = dim, name = 'syn1_embedding')
    pred = mx.sym.batch_dot(target_vectors, center_vector, transpose_b=True)
    sigmoid = mx.sym.sigmoid(mx.sym.flatten(pred))
    loss = mx.sym.sum(labels * mx.sym.log(sigmoid) + (1 - labels) * mx.sym.log(1 - sigmoid), axis=1)
    loss *= -1.0
    loss_layer = mx.sym.MakeLoss(loss, normalization="batch")
    return loss_layer

In [ ]:
def mean_loss(label, pred):
    return np.mean(pred)

In [ ]:
nd_iter.reset()
sym = get_sym_makeloss(vocab_size, dim, batch_size)
network = mx.mod.Module(sym, data_names=("center_word", "target_words",), label_names=("labels",),context=mx.gpu())
network.bind(data_shapes=nd_iter.provide_data, label_shapes=nd_iter.provide_label)
current_time = time.time()
network.fit(nd_iter, num_epoch=1,optimizer='adam',
            eval_metric=mx.metric.CustomMetric(mean_loss),
            optimizer_params={'learning_rate': .001},
            batch_end_callback=mx.callback.Speedometer(batch_size, 1000),
            initializer=mx.initializer.Uniform(scale=.01))
print time.time() - current_time


WARNING:Already bound, ignoring bind()
INFO:Epoch[0] Batch [1000]	Speed: 67514.10 samples/sec	mean_loss=3.324762
INFO:Epoch[0] Batch [2000]	Speed: 68750.90 samples/sec	mean_loss=2.749573
INFO:Epoch[0] Batch [3000]	Speed: 68509.34 samples/sec	mean_loss=2.640605
INFO:Epoch[0] Batch [4000]	Speed: 68214.37 samples/sec	mean_loss=2.589391
INFO:Epoch[0] Batch [5000]	Speed: 68182.27 samples/sec	mean_loss=2.556170
INFO:Epoch[0] Batch [6000]	Speed: 68160.76 samples/sec	mean_loss=2.530014
INFO:Epoch[0] Batch [7000]	Speed: 68195.24 samples/sec	mean_loss=2.510201
INFO:Epoch[0] Batch [8000]	Speed: 68133.54 samples/sec	mean_loss=2.492381
INFO:Epoch[0] Batch [9000]	Speed: 68027.80 samples/sec	mean_loss=2.477695
INFO:Epoch[0] Batch [10000]	Speed: 67952.16 samples/sec	mean_loss=2.463432
INFO:Epoch[0] Batch [11000]	Speed: 67983.85 samples/sec	mean_loss=2.451389
INFO:Epoch[0] Batch [12000]	Speed: 68103.80 samples/sec	mean_loss=2.442641
INFO:Epoch[0] Batch [13000]	Speed: 67980.46 samples/sec	mean_loss=2.433064
INFO:Epoch[0] Batch [14000]	Speed: 67730.39 samples/sec	mean_loss=2.424111
INFO:Epoch[0] Batch [15000]	Speed: 67852.51 samples/sec	mean_loss=2.414362
INFO:Epoch[0] Batch [16000]	Speed: 67830.14 samples/sec	mean_loss=2.408149
INFO:Epoch[0] Batch [17000]	Speed: 67816.72 samples/sec	mean_loss=2.402137
INFO:Epoch[0] Batch [18000]	Speed: 67779.51 samples/sec	mean_loss=2.396613
INFO:Epoch[0] Batch [19000]	Speed: 67794.72 samples/sec	mean_loss=2.390748
INFO:Epoch[0] Batch [20000]	Speed: 67896.24 samples/sec	mean_loss=2.385985
INFO:Epoch[0] Batch [21000]	Speed: 67810.95 samples/sec	mean_loss=2.382373
INFO:Epoch[0] Batch [22000]	Speed: 67743.80 samples/sec	mean_loss=2.377004
INFO:Epoch[0] Batch [23000]	Speed: 67762.66 samples/sec	mean_loss=2.372241
INFO:Epoch[0] Batch [24000]	Speed: 67720.10 samples/sec	mean_loss=2.369666
INFO:Epoch[0] Batch [25000]	Speed: 67725.52 samples/sec	mean_loss=2.366825
INFO:Epoch[0] Batch [26000]	Speed: 67697.73 samples/sec	mean_loss=2.361496
INFO:Epoch[0] Batch [27000]	Speed: 67686.33 samples/sec	mean_loss=2.358756
INFO:Epoch[0] Batch [28000]	Speed: 67740.80 samples/sec	mean_loss=2.354761
INFO:Epoch[0] Batch [29000]	Speed: 67735.33 samples/sec	mean_loss=2.355581
INFO:Epoch[0] Batch [30000]	Speed: 67715.76 samples/sec	mean_loss=2.351064
INFO:Epoch[0] Batch [31000]	Speed: 67751.33 samples/sec	mean_loss=2.347874
INFO:Epoch[0] Batch [32000]	Speed: 67656.19 samples/sec	mean_loss=2.343565
INFO:Epoch[0] Batch [33000]	Speed: 67759.25 samples/sec	mean_loss=2.341243
INFO:Epoch[0] Batch [34000]	Speed: 67711.36 samples/sec	mean_loss=2.338863
INFO:Epoch[0] Batch [35000]	Speed: 67727.28 samples/sec	mean_loss=2.337944
INFO:Epoch[0] Batch [36000]	Speed: 67809.66 samples/sec	mean_loss=2.335212
INFO:Epoch[0] Batch [37000]	Speed: 67680.95 samples/sec	mean_loss=2.333533
INFO:Epoch[0] Batch [38000]	Speed: 67876.51 samples/sec	mean_loss=2.332975
INFO:Epoch[0] Batch [39000]	Speed: 67886.54 samples/sec	mean_loss=2.332215
INFO:Epoch[0] Batch [40000]	Speed: 67921.23 samples/sec	mean_loss=2.327820
INFO:Epoch[0] Batch [41000]	Speed: 67713.19 samples/sec	mean_loss=2.326940
INFO:Epoch[0] Batch [42000]	Speed: 67681.45 samples/sec	mean_loss=2.325459
INFO:Epoch[0] Batch [43000]	Speed: 67913.52 samples/sec	mean_loss=2.323512
INFO:Epoch[0] Batch [44000]	Speed: 67631.49 samples/sec	mean_loss=2.321704
INFO:Epoch[0] Batch [45000]	Speed: 67738.53 samples/sec	mean_loss=2.321032
INFO:Epoch[0] Batch [46000]	Speed: 67643.81 samples/sec	mean_loss=2.318252
INFO:Epoch[0] Batch [47000]	Speed: 67939.77 samples/sec	mean_loss=2.316331
INFO:Epoch[0] Batch [48000]	Speed: 67909.84 samples/sec	mean_loss=2.314698
INFO:Epoch[0] Batch [49000]	Speed: 67648.43 samples/sec	mean_loss=2.315520
INFO:Epoch[0] Batch [50000]	Speed: 67928.81 samples/sec	mean_loss=2.311994
INFO:Epoch[0] Batch [51000]	Speed: 67713.57 samples/sec	mean_loss=2.311395
INFO:Epoch[0] Batch [52000]	Speed: 67876.75 samples/sec	mean_loss=2.311811
INFO:Epoch[0] Batch [53000]	Speed: 67721.82 samples/sec	mean_loss=2.309732
INFO:Epoch[0] Batch [54000]	Speed: 67903.75 samples/sec	mean_loss=2.308115
INFO:Epoch[0] Batch [55000]	Speed: 67938.74 samples/sec	mean_loss=2.310613
INFO:Epoch[0] Batch [56000]	Speed: 67961.86 samples/sec	mean_loss=2.307433
INFO:Epoch[0] Batch [57000]	Speed: 67958.09 samples/sec	mean_loss=2.304702
INFO:Epoch[0] Batch [58000]	Speed: 67836.98 samples/sec	mean_loss=2.303065
INFO:Epoch[0] Batch [59000]	Speed: 67706.65 samples/sec	mean_loss=2.305031
INFO:Epoch[0] Batch [60000]	Speed: 67841.63 samples/sec	mean_loss=2.303309
INFO:Epoch[0] Batch [61000]	Speed: 67702.88 samples/sec	mean_loss=2.301929
INFO:Epoch[0] Batch [62000]	Speed: 67615.42 samples/sec	mean_loss=2.300102
INFO:Epoch[0] Batch [63000]	Speed: 67681.34 samples/sec	mean_loss=2.300638
INFO:Epoch[0] Batch [64000]	Speed: 67955.00 samples/sec	mean_loss=2.298462
INFO:Epoch[0] Batch [65000]	Speed: 67890.57 samples/sec	mean_loss=2.296858

In [69]:
all_vecs = network.get_params()[0]["syn0_embedding_weight"].asnumpy()
all_vecs = normalize(all_vecs, copy=False)

In [70]:
model.wv.syn0 = all_vecs
model.wv.syn0norm = all_vecs

In [75]:
model.most_similar("car")


Out[75]:
[(u'driver', 0.7786462306976318),
 (u'motorcycle', 0.7644623517990112),
 (u'airplane', 0.7162174582481384),
 (u'taxi', 0.7073050141334534),
 (u'supercar', 0.6959617137908936),
 (u'jumbo', 0.6949251294136047),
 (u'cars', 0.6885921359062195),
 (u'racing', 0.6778541207313538),
 (u'truck', 0.6697883605957031),
 (u'automobiles', 0.6633998155593872)]

In [ ]: