In [1]:
import numpy as np
from scipy.spatial.distance import cosine
In [2]:
from dstoolbox.data import load_w2v_format
In [3]:
word2idx, word_embeddings = load_w2v_format('/mnt/data01/work/deep_query/data/word2vec/w2v_no_cooccs_CH_min_count20.csv')
word2idx is a mapping from a word in the vocabulary to its row index in the embedding matrix
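For reference, a loader for this kind of file can be sketched as below. This is an illustration only: it assumes the standard word2vec text format with one "word v1 v2 ..." entry per line and an optional header line, and the actual dstoolbox.data.load_w2v_format may well differ (the .csv extension suggests the delimiter could, too).

def load_w2v_format_sketch(path):
    # hypothetical re-implementation for illustration, not dstoolbox's code
    word2idx, rows = {}, []
    with open(path) as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) == 2 and not word2idx:
                continue  # skip a "vocab_size dim" header, if present
            word2idx[parts[0]] = len(rows)
            rows.append([float(v) for v in parts[1:]])
    return word2idx, np.array(rows, dtype=np.float32)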
In [4]:
word2idx['damen'], word2idx['herren'], word2idx['adidas'], word2idx['hilfiger']
Out[4]:
word_embeddings is the embedding matrix with shape (vocabulary size, embedding size); it corresponds to syn0 in gensim.
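Note that gensim's syn0 rows are typically not unit-normalized, which is why cosine similarity (rather than a raw dot product) is used below. A quick check:

norms = np.linalg.norm(word_embeddings, axis=1)
norms.min(), norms.mean(), norms.max()  # row norms generally vary across the vocabulary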
In [5]:
word_embeddings.shape, word_embeddings.dtype
Out[5]:
In [6]:
word_embeddings[word2idx['damen']]
Out[6]:
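scipy's cosine returns the cosine distance, i.e. 1 minus the cosine similarity; that identity is what lets most_similar below turn sorted distances back into similarities. A quick sanity check using the two words looked up earlier:

u = word_embeddings[word2idx['damen']].astype(np.float64)
v = word_embeddings[word2idx['herren']].astype(np.float64)
# cosine distance: 1 - (u . v) / (||u|| * ||v||)
manual = 1 - u @ v / (np.linalg.norm(u) * np.linalg.norm(v))
assert np.isclose(manual, cosine(u, v))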
In [7]:
def most_similar(word, topn=5):
    # invert the vocabulary mapping so indices can be translated back to words
    idx2word = {val: key for key, val in word2idx.items()}
    idx = word2idx[word]
    vec = word_embeddings[idx]
    # cosine distance of the query vector to every embedding
    dists = [cosine(vec, embedding) for embedding in word_embeddings]
    # position 0 after sorting is the word itself (distance 0), so skip it;
    # similarity = 1 - cosine distance
    top_simils = 1 - np.sort(dists)[1:topn + 1]
    top_args = np.argsort(dists)[1:topn + 1]
    top_words = [idx2word[i] for i in top_args]
    return list(zip(top_words, top_simils))
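The loop above makes one Python-level cosine call per vocabulary entry. For larger vocabularies, normalizing the matrix once and using a single matrix-vector product is much faster; a vectorized sketch (not part of the original notebook, assuming the same word2idx and word_embeddings):

def most_similar_fast(word, topn=5):
    idx2word = {val: key for key, val in word2idx.items()}
    # normalize every row once; then one matrix-vector product yields
    # the cosine similarity of the query word to the whole vocabulary
    normed = word_embeddings / np.linalg.norm(word_embeddings, axis=1, keepdims=True)
    sims = normed @ normed[word2idx[word]]
    best = np.argsort(sims)[::-1][1:topn + 1]  # drop position 0: the word itself
    return [(idx2word[i], float(sims[i])) for i in best]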
In [8]:
most_similar('damen')
Out[8]:
In [12]:
most_similar('adidas')
Out[12]:
In [10]:
most_similar('tommy')
Out[10]: