In [88]:
%matplotlib inline
In [89]:
import numpy as np
from numpy.linalg import norm
In [90]:
import h5py
In [91]:
# Open the HDF5 file in read-only mode ("r").
# NOTE(review): the handle is not closed in any of the cells visible here.
h5file = h5py.File("../reads/lt.h5","r")
In [92]:
# Pull the whole 'weights' dataset out of the file with a full slice.
# Rows are later indexed by word id (see nearest()).
wordvecs = h5file['weights'][:]
In [93]:
# Display the dimensions of the loaded array as this cell's output.
wordvecs.shape
Out[93]:
In [102]:
# Path of the dictionary file that is parsed into id2word / word2id.
wordfile = "../convert/ptb.targ.dict"
In [103]:
# Build both directions of the vocabulary mapping from the dictionary file.
# Every non-blank line must hold exactly two whitespace-separated fields,
# "<word> <id>". One is subtracted from each id so the result can be used as
# a 0-based index (presumably the ids in the file are 1-based -- confirm).
id2word = {}
word2id = {}
with open(wordfile, "r") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no entry; unpacking it would raise ValueError.
            continue
        # Lines with any other field count still raise, so malformed input
        # is not silently accepted.
        word, raw_id = fields
        idx = int(raw_id) - 1
        id2word[idx] = word
        word2id[word] = idx
In [ ]:
In [104]:
# Display how many ids were loaded into id2word.
len(id2word)
Out[104]:
In [105]:
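# Disabled experiment, kept for reference: enabling it would rebind word2id to
# map word -> vector, which breaks the word2id[...] index lookups used below.
#word2id = {id2word[i]:wordvecs[i] for i in id2word.keys()}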
In [106]:
def nearest(vec, dot=False):
    """Rank every vocabulary word by its similarity to ``vec``.

    Reads the module-level ``wordvecs`` (one row per word id) and
    ``id2word`` (whose length determines how many rows are scored).

    Parameters
    ----------
    vec : 1-D array-like
        Query vector; must have the same length as a row of ``wordvecs``.
    dot : bool, optional
        If False (the default) rows are scored by cosine similarity;
        if True they are scored by the raw dot product.

    Returns
    -------
    list of (score, id) tuples
        One entry per scored row, ordered by descending score; equal
        scores are ordered by descending id.

    Notes
    -----
    When either the query or a row has zero norm the cosine is undefined
    (0/0). Such pairs are scored 0.0 instead of NaN, because NaN scores
    make the ranking order unreliable.
    """
    vocab_size = len(id2word)
    vectors = np.asarray(wordvecs)[:vocab_size]

    # One matrix-vector product replaces the per-row Python loop.
    scores = np.dot(vectors, vec)

    if not dot:
        # Row norms are only needed for cosine scoring.
        denom = norm(vectors, axis=1) * norm(vec)
        with np.errstate(divide="ignore", invalid="ignore"):
            scores = np.where(denom != 0, scores / denom, 0.0)

    # lexsort uses the LAST key as the primary one: ascending by score, then
    # by id. Reversing yields descending score with ties broken by
    # descending id, which is the order sort() + reverse() on (score, id)
    # tuples produces.
    ids = np.arange(len(scores))
    order = np.lexsort((ids, scores))[::-1]
    return [(scores[i], int(i)) for i in order]
In [109]:
# Rank the whole vocabulary against the vector of "billion"
# (cosine scoring, since dot is left at its default of False).
closests = nearest(wordvecs[word2id["billion"]])
In [110]:
def print_nbest(closests, n=20):
    """Print the leading entries of a ranked list, one per line.

    Parameters
    ----------
    closests : sequence of (score, id) pairs
        Ranked results, best first (e.g. the list returned by nearest()).
    n : int, optional
        Number of leading entries to print. Defaults to 20, the value that
        was previously hard-coded.

    Each printed line is the word (looked up in the module-level
    ``id2word``) and its score, separated by a tab.
    """
    for score, i in closests[:n]:
        print("%s\t%s"%(id2word[i],score))
# Show the best-scoring neighbours computed for "billion".
print_nbest(closests)
In [ ]: