To play with this notebook, you'll need NumPy, Annoy, Gensim, and the GoogleNews word2vec model.
Inspired by: https://github.com/chrisjmccormick/inspect_word2vec
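If the Python packages are missing, one way to install them from a notebook is shown below (assuming pip is available; the PyPI package names are numpy, annoy, and gensim). The GoogleNews vectors themselves have to be downloaded separately.
In [ ]:
# optional: install the Python dependencies
!pip install numpy annoy gensim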
In [1]:
# import and init
from annoy import AnnoyIndex
import gensim
import os.path
import numpy as np
prefix_filename = 'word2vec'
ann_filename = prefix_filename + '.ann'
i2k_filename = prefix_filename + '_i2k.npy'
k2i_filename = prefix_filename + '_k2i.npy'
In [2]:
# Load Google's pre-trained Word2Vec model.
print "load GoogleNews Model"
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
print "loading done"
hello = model['hello']
vector_size = len(hello)
print 'model size=', len(model.vocab)
print 'vector size=', vector_size
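Before building any index, gensim's own (exact) most_similar is a quick way to check that the vectors loaded correctly; this cell is a sketch and not part of the original notebook.
In [ ]:
# sanity check: exact nearest neighbours straight from gensim
for word, score in model.most_similar('king', topn=3):
    print word, score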
In [3]:
# build the Annoy index and the lookup tables, then save them to disk,
# or load a previously saved index directly
vocab = model.vocab.keys()
#indexNN = AnnoyIndex(vector_size, metric='angular')
indexNN = AnnoyIndex(vector_size)
index2key = [None]*len(model.vocab)
key2index = {}
if not os.path.isfile(ann_filename):
    print 'creating indexes'
    i = 0
    for key in vocab:
        try:
            indexNN.add_item(i, model[key])
            key2index[key] = i
            index2key[i] = key
            i = i + 1
            if (i % 10000 == 0):
                print i, key
        except TypeError:
            # skip the occasional key that cannot be indexed
            print 'Error with key', key
    print 'building 10 trees'
    indexNN.build(10)  # 10 trees
    print 'save files'
    indexNN.save(ann_filename)
    np.save(i2k_filename, index2key)
    np.save(k2i_filename, key2index)
    print 'done'
else:
    print "loading files"
    indexNN.load(ann_filename)
    index2key = np.load(i2k_filename)
    key2index = np.load(k2i_filename).item()  # np.save wraps the dict in a 0-d array
    print "loading done:", indexNN.get_n_items(), "items"
In [10]:
what_vec = model['king'] - model['male'] + model['female']
what_indexes = indexNN.get_nns_by_vector(what_vec, 1)
print index2key[what_indexes[0]]
In [12]:
what_vec = model['king'] - model['boy'] + model['girl']
what_indexes = indexNN.get_nns_by_vector(what_vec, 1)
print index2key[what_indexes[0]]
In [15]:
what_vec = model['king'] - model['man'] + model['woman']
what_indexes = indexNN.get_nns_by_vector(what_vec, 1)
print index2key[what_indexes[0]]
In [14]:
what_vec = model['Berlin'] - model['Germany'] + model['France']
what_indexes = indexNN.get_nns_by_vector(what_vec, 1)
print index2key[what_indexes[0]]
In [12]:
what_vec = model['Trump'] + model['Germany'] - model['USA']
what_indexes = indexNN.get_nns_by_vector(what_vec, 1)
for i in what_indexes:
    print index2key[i]
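The same three-word pattern keeps coming back, so a small helper makes the remaining experiments shorter. This is only a convenience sketch, not part of the original notebook; the function name, argument order, and topn parameter are arbitrary choices.
In [ ]:
# helper: approximate nearest neighbours of the vector a - b + c
def analogy(a, b, c, topn=1):
    vec = model[a] - model[b] + model[c]
    return [index2key[i] for i in indexNN.get_nns_by_vector(vec, topn)]

print analogy('Berlin', 'Germany', 'France')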
In [53]:
man2women = - model['boy'] + model['girl']
word_list = ["king","prince", "male", "boy","dad", "father", "president", "dentist",
"scientist", "efficient", "teacher", "doctor", "minister", "lover"]
for word in word_list:
what_vec = model[word] + man2women
what_indexes = indexNN.get_nns_by_vector(what_vec, 1)
print word, "for him,", index2key[what_indexes[0]], "for her."
In [54]:
capital = model['Berlin'] - model['Germany']
word_list = ["Germany", "France", "Italy", "USA", "Russia", "boys", "cars", "flowers", "soldiers",
"scientists", ]
for word in word_list:
what_vec = model[word] + capital
what_indexes = indexNN.get_nns_by_vector(what_vec, 1)
print index2key[what_indexes[0]], "is the capital of", word
If you play with this notebook and find good word2vec equations, please tweet them to me!
@dh7net