In [88]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this part of the notebook.
%matplotlib inline

In [89]:
import numpy as np
from numpy.linalg import norm

In [90]:
import h5py

In [91]:
# Open the HDF5 file read-only.
# NOTE(review): the handle is never closed in the cells shown here; consider
# closing it once the data has been read.
h5file = h5py.File("../reads/lt.h5","r")

In [92]:
# Read the entire 'weights' dataset from the open file into memory.
wordvecs = h5file['weights'][:]

In [93]:
# Show the dimensions of the loaded weight matrix.
wordvecs.shape


Out[93]:
(10003, 650)

In [102]:
# Vocabulary file; it is parsed below as one "<word> <id>" pair per line.
wordfile = "../convert/ptb.targ.dict"

In [103]:
# Build both directions of the vocabulary lookup from the dictionary file.
# Every id read from the file is shifted down by one before it is stored
# (presumably the file ids are 1-based -- TODO confirm).
id2word, word2id = {}, {}
with open(wordfile, "r") as vocab_file:
    for entry in vocab_file:
        token, raw_id = entry.split()
        index = int(raw_id) - 1
        id2word[index] = token
        word2id[token] = index

In [ ]:


In [104]:
# Number of vocabulary entries loaded.
len(id2word)


Out[104]:
10002

In [105]:
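# Disabled alternative: this would rebind `word2id` to map each word to its
# vector rather than to its integer id.
#word2id = {id2word[i]:wordvecs[i] for i in id2word.keys()}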

In [106]:
def nearest(vec, dot=False, vectors=None):
    """Rank embedding rows by similarity to ``vec``.

    Parameters
    ----------
    vec : array-like, 1-D
        Query vector; its length must match the width of the rows ranked.
    dot : bool, optional
        If False (default), score by cosine similarity; if True, score by
        the raw dot product.
    vectors : array-like, 2-D, optional
        Rows to rank. When omitted, the notebook-level ``wordvecs`` is used,
        restricted to its first ``len(id2word)`` rows (the matrix can hold
        more rows than the vocabulary has entries).

    Returns
    -------
    list of (score, row_index) tuples
        Sorted from highest to lowest score; equal scores are ordered by
        descending row index.
    """
    if vectors is None:
        vectors = wordvecs[:len(id2word)]
    vectors = np.asarray(vectors)
    vec = np.asarray(vec)
    if vectors.size == 0:
        return []

    # One matrix-vector product replaces the per-row Python loop.
    scores = np.dot(vectors, vec)
    if not dot:
        denom = norm(vectors, axis=1) * norm(vec)
        # A zero-length row or query has no direction: score it 0.0 instead
        # of dividing by zero, which would produce NaN and leave the sort
        # order undefined.
        cosine = np.zeros(scores.shape, dtype=np.result_type(scores, denom))
        np.divide(scores, denom, out=cosine, where=denom > 0)
        scores = cosine

    # lexsort uses the last key as the primary one (score, then index) and
    # sorts ascending; reversing yields descending score with ties broken by
    # descending index.
    order = np.lexsort((np.arange(scores.shape[0]), scores))[::-1]
    return [(scores[i], int(i)) for i in order]

In [109]:
# Rank every vocabulary word by cosine similarity to the vector for "billion".
closests = nearest(wordvecs[word2id["billion"]])

In [110]:
def print_nbest(closests, n=20, vocab=None):
    """Print the leading entries of a ranked neighbour list.

    Parameters
    ----------
    closests : sequence of (score, word_id) tuples
        Ranked best-first, e.g. as returned by ``nearest``.
    n : int, optional
        Number of leading entries to print (default 20).
    vocab : mapping, optional
        Maps a word id to its word. Defaults to the notebook-level
        ``id2word``.

    Each entry is written as ``word<TAB>score`` on its own line.
    """
    if vocab is None:
        vocab = id2word
    for score, word_id in closests[:n]:
        print("%s\t%s" % (vocab[word_id], score))
# Print the best-scoring neighbours from the ranking computed above.
print_nbest(closests)


billion	1.0
trillion	0.579835
million	0.458194
subordinated	0.446064
stronger	0.41422
narrower	0.408159
six-month	0.408086
financial-services	0.404751
diminished	0.402304
n.c.	0.397402
high-technology	0.396394
10-year	0.394112
seven-year	0.393716
fourth	0.393411
freight	0.388148
magic	0.387952
broadcast	0.38463
generous	0.37885
unpopular	0.377596
three-month	0.376659

In [ ]: