In [2]:
from __future__ import print_function
from annoy import AnnoyIndex
import numpy as np
import torch
from tqdm.notebook import tqdm
from local_settings import settings, datautils
In [3]:
def load_word_vectors(filename=settings.GLOVE_FILENAME):
    """Load GloVe vectors from a whitespace-separated text file.

    Each line holds a token followed by its vector components.
    Returns (word_to_index, word_vectors, word_vector_size).
    """
    word_to_index = {}
    word_vectors = []
    with open(filename) as fp:
        for line in tqdm(fp.readlines()):
            parts = line.split(" ")
            word = parts[0]
            word_to_index[word] = len(word_to_index)
            vec = np.array([float(x) for x in parts[1:]])
            word_vectors.append(vec)
    word_vector_size = len(word_vectors[0])
    return word_to_index, word_vectors, word_vector_size
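A quick sanity check of the loader, as a hedged sketch: the exact dimensionality depends on which GloVe file `settings.GLOVE_FILENAME` points at (e.g. 100 for glove.6B.100d.txt), so the printed values below are illustrative, not actual output.

# Hypothetical usage of load_word_vectors
word_to_index, word_vectors, dim = load_word_vectors()
print(dim)                                     # e.g. 100 for 100-d GloVe vectors
print(word_vectors[word_to_index['the']][:5])  # first five components of 'the'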
In [4]:
class PreTrainedEmbeddings(object):
    def __init__(self):
        self.word_to_index, self.word_vectors, self.word_vector_size = load_word_vectors()
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        # Approximate nearest-neighbor index over the embedding space
        self.index = AnnoyIndex(self.word_vector_size, metric='euclidean')
        print('Building Index')
        for _, i in tqdm(self.word_to_index.items()):
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(50)  # 50 trees: more trees -> better recall, slower build
        print('Finished!')

    def get_embedding(self, word):
        return self.word_vectors[self.word_to_index[word]]

    def closest(self, word, n=1):
        """Return the n nearest neighbors of a word in the vocabulary."""
        vector = self.get_embedding(word)
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def closest_v(self, vector, n=1):
        """Return the n nearest neighbors of an arbitrary vector."""
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def sim(self, w1, w2):
        # Unnormalized dot product; larger values mean more similar vectors
        return np.dot(self.get_embedding(w1), self.get_embedding(w2))
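Note that `sim` is a raw dot product, so it is sensitive to vector magnitudes. A minimal cosine-similarity variant (a sketch, not part of the original class; `cosine_sim` and its `emb` parameter are hypothetical names) would normalize first:

def cosine_sim(emb, w1, w2):
    # emb is a PreTrainedEmbeddings instance; result lies in [-1, 1]
    v1, v2 = emb.get_embedding(w1), emb.get_embedding(w2)
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))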
In [5]:
glove = PreTrainedEmbeddings()
In [6]:
glove.closest('apple', n=5)
Out[6]:
In [7]:
glove.closest('plane', n=5)
Out[7]:
In [8]:
glove.sim('beer', 'wine'), glove.sim('beer', 'gasoline')
Out[8]:
Lexical relationships uncovered by word embeddings
In [10]:
def SAT_analogy(w1, w2, w3):
    '''
    Solves problems of the type:
    w1 : w2 :: w3 : __
    using the vector-offset method: w4 ~ w3 + (w2 - w1).
    '''
    closest_words = []
    try:
        w1v = glove.get_embedding(w1)
        w2v = glove.get_embedding(w2)
        w3v = glove.get_embedding(w3)
        # Apply the w1 -> w2 offset to w3
        w4v = w3v + (w2v - w1v)
        closest_words = glove.closest_v(w4v, n=5)
        # The nearest neighbors often include the query words themselves; drop them
        closest_words = [w for w in closest_words if w not in [w1, w2, w3]]
    except KeyError:
        # One of the query words is out of vocabulary
        pass
    if len(closest_words) == 0:
        print(':-(')
    else:
        print('{} : {} :: {} : {}'.format(w1, w2, w3, closest_words[0]))
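The function relies on the classic vector-offset trick: the displacement from w1 to w2 is added to w3, and the answer is read off the nearest neighbors of the result. The same arithmetic done inline, as a sketch; with GloVe the top remaining neighbor is usually 'queen', though the exact ranking depends on the embedding file:

# Vector-offset arithmetic for man : king :: woman : ?
w4v = glove.get_embedding('woman') + (glove.get_embedding('king') - glove.get_embedding('man'))
[w for w in glove.closest_v(w4v, n=5) if w not in ('man', 'king', 'woman')]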
Pronouns
In [11]:
SAT_analogy('man', 'he', 'woman')
Verb-Noun relationships
In [12]:
SAT_analogy('fly', 'plane', 'sail')
Noun-Noun relationships
In [20]:
SAT_analogy('cat', 'kitten', 'dog')
Hypernymy
In [23]:
SAT_analogy('blue', 'color', 'dog')
Meronymy
In [57]:
SAT_analogy('leg', 'legs', 'hand')
Troponymy
In [32]:
SAT_analogy('talk', 'communicate', 'read')
Metonymy
In [41]:
SAT_analogy('blue', 'democrat', 'red')
Misc
In [13]:
SAT_analogy('man', 'doctor', 'woman')
In [14]:
SAT_analogy('man', 'leader', 'woman')