notebook.community

Edit and run



In [88]:

    
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call appears in the visible cells - confirm this
# magic is still needed.
%matplotlib inline



In [89]:

    
import numpy as np
from numpy.linalg import norm



In [90]:

    
import h5py



In [91]:

    
# Open the HDF5 file read-only; the next cell reads the 'weights' dataset
# from this handle.
# NOTE(review): the handle is never closed in the visible cells - consider
# `with h5py.File(...) as h5file:` around the read, or an explicit
# h5file.close() once the data has been copied out.
h5file = h5py.File("../reads/lt.h5","r")



In [92]:

    
# Read the whole 'weights' dataset from the open file ([:] selects every
# element). The rows of this array are indexed by word id further down.
wordvecs = h5file['weights'][:]



In [93]:

    
# Sanity check: dimensions of the loaded weight matrix.
wordvecs.shape









    Out[93]:





(10003, 650)



In [102]:

    
# Path of the vocabulary file parsed in the next cell; each line is expected
# to hold two whitespace-separated fields: a word and its integer id.
wordfile = "../convert/ptb.targ.dict"



In [103]:

    
# Build both directions of the vocabulary mapping from `wordfile`.
# Each non-blank line holds "<word> <id>". The id stored here is the file's
# id minus one (presumably the file is 1-based and the rows of `wordvecs`
# are 0-based - TODO confirm).
id2word = {}
word2id = {}
with open(wordfile, "r") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the two-field unpacking below.
            continue
        word, raw_id = line.split()
        idx = int(raw_id) - 1  # convert once, reuse for both mappings
        id2word[idx] = word
        word2id[word] = idx



In [ ]:



In [104]:

    
# Number of distinct ids read from the vocabulary file; compare with the
# number of rows in `wordvecs`, which does not have to match.
len(id2word)









    Out[104]:





10002



In [105]:

    
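# Disabled on purpose: this would rebind word2id to map word -> embedding
# vector, replacing the word -> index mapping that later cells rely on.
#word2id = {id2word[i]:wordvecs[i] for i in id2word.keys()}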



In [106]:

    
def nearest(vec, dot=False):
    """Rank every vocabulary word by similarity to ``vec``.

    Reads the notebook-level globals ``wordvecs`` (one embedding row per word
    id) and ``id2word``. Only the first ``len(id2word)`` rows are scored, so
    rows without a vocabulary entry are ignored.

    Parameters
    ----------
    vec : array-like, shape (dim,)
        Query vector; ``dim`` must equal the row length of ``wordvecs``.
    dot : bool, default False
        If False, score by cosine similarity; if True, score by the raw dot
        product.

    Returns
    -------
    list of (score, word_id) tuples
        Sorted from highest to lowest score; equal scores are ordered by
        descending word id.

    Notes
    -----
    Cosine similarity is undefined when either vector has zero norm. Such
    pairs are given a score of 0 instead of NaN, so the sort order stays
    well defined and no divide-by-zero warning is raised.
    """
    vocab = np.asarray(wordvecs[:len(id2word)])
    query = np.asarray(vec)

    # One matrix-vector product replaces the per-row Python loop.
    scores = vocab.dot(query)

    if not dot:
        denom = norm(vocab, axis=1) * norm(query)
        valid = denom > 0
        cosine = np.zeros(scores.shape, dtype=np.result_type(scores, denom))
        cosine[valid] = scores[valid] / denom[valid]
        scores = cosine

    score_ids = [(s, i) for i, s in enumerate(scores)]
    # Tuples are unique (distinct ids), so a single descending sort gives the
    # same order as sort-then-reverse.
    score_ids.sort(reverse=True)
    return score_ids



In [109]:

    
# Rank the vocabulary against the embedding of "billion" (cosine similarity,
# since `dot` keeps its default of False).
closests = nearest(wordvecs[word2id["billion"]])



In [110]:

    
def print_nbest(closests, n=20):
    """Print the first ``n`` entries of a ranked neighbour list.

    Parameters
    ----------
    closests : sequence of (score, word_id) tuples
        Ranked results, e.g. the return value of ``nearest``; the entries are
        printed in the order given.
    n : int, default 20
        Maximum number of entries to print.

    Each entry is written as ``<word>\\t<score>``, with the word looked up in
    the notebook-level ``id2word`` mapping.
    """
    for score, word_id in closests[:n]:
        print("%s\t%s"%(id2word[word_id],score))
# Show the best-scoring neighbours computed for "billion" above.
print_nbest(closests)









    



billion	1.0
trillion	0.579835
million	0.458194
subordinated	0.446064
stronger	0.41422
narrower	0.408159
six-month	0.408086
financial-services	0.404751
diminished	0.402304
n.c.	0.397402
high-technology	0.396394
10-year	0.394112
seven-year	0.393716
fourth	0.393411
freight	0.388148
magic	0.387952
broadcast	0.38463
generous	0.37885
unpopular	0.377596
three-month	0.376659



In [ ]: