Using dstoolbox data utilities

Imports


In [1]:
import numpy as np
from scipy.spatial.distance import cosine

In [2]:
from dstoolbox.data import load_w2v_format

Load

load_w2v_format

Load data stored in the non-binary (plain-text) word2vec format without requiring gensim
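
For reference, the plain-text word2vec format starts with a header line containing the vocabulary size and the embedding dimension, followed by one line per word holding the word and its space-separated vector components. A minimal loader along those lines could look like the sketch below (load_w2v_format_sketch is hypothetical code, not the actual dstoolbox implementation; it assumes the standard space-separated layout and uses the numpy import from above):

def load_w2v_format_sketch(path):
    # Hypothetical sketch of a plain-text word2vec loader
    with open(path) as f:
        n_words, dim = map(int, f.readline().split())
        word2idx = {}
        vectors = np.empty((n_words, dim), dtype=np.float64)
        for i, line in enumerate(f):
            word, *values = line.rstrip().split(' ')
            word2idx[word] = i
            vectors[i] = np.asarray(values, dtype=np.float64)
    return word2idx, vectors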


In [3]:
word2idx, word_embeddings = load_w2v_format('/mnt/data01/work/deep_query/data/word2vec/w2v_no_cooccs_CH_min_count20.csv')

word2idx is a mapping from each word in the vocabulary to its row index in the embedding matrix


In [4]:
word2idx['damen'], word2idx['herren'], word2idx['adidas'], word2idx['hilfiger']


Out[4]:
(0, 6, 3, 359)

word_embeddings is the embedding matrix with shape (vocabulary size, embedding size); it corresponds to syn0 in gensim


In [5]:
word_embeddings.shape, word_embeddings.dtype


Out[5]:
((13560, 200), dtype('float64'))

To get the embedding of a specific word, combine word2idx and word_embeddings


In [6]:
word_embeddings[word2idx['damen']]


Out[6]:
array([ 0.126052, -0.133172,  0.231597,  0.068036, -0.186931,  0.221891,
        0.272988,  0.300522,  0.164216,  0.157207, -0.22044 ,  0.196053,
        0.023566, -0.162425, -0.224334, -0.04167 ,  0.060861,  0.26892 ,
        0.215793,  0.439066,  0.019564, -0.125361, -0.037923,  0.273964,
        0.01485 ,  0.433455,  0.092299, -0.041746,  0.199729, -0.196166,
       -0.185454, -0.016025,  0.134996, -0.058808, -0.050719,  0.081409,
        0.036141, -0.251181,  0.08519 , -0.217346, -0.046914, -0.178017,
        0.227496,  0.265843, -0.266565, -0.063123, -0.177667,  0.123981,
       -0.309695,  0.008315,  0.391209, -0.243507, -0.260893,  0.367366,
        0.306098, -0.075811,  0.010251,  0.439092,  0.072738, -0.170283,
       -0.168254, -0.092516, -0.052548,  0.047091,  0.103203, -0.029597,
       -0.150255, -0.294971,  0.285867,  0.00501 , -0.197888,  0.083567,
       -0.166958, -0.202782,  0.352706, -0.012503,  0.042107,  0.299672,
        0.238865,  0.207802,  0.081623, -0.19322 , -0.363524, -0.057961,
        0.170283, -0.114691, -0.401281,  0.0774  ,  0.117782,  0.235933,
        0.111119, -0.211915,  0.410658, -0.193761,  0.10827 , -0.115976,
        0.105021, -0.150864, -0.042099, -0.100297, -0.286243, -0.084713,
        0.142059, -0.203064, -0.049878,  0.177673,  0.374136, -0.01401 ,
        0.288655, -0.084777,  0.278725, -0.246398, -0.020053,  0.269745,
       -0.168721, -0.292739,  0.144909, -0.086093, -0.17659 ,  0.135142,
       -0.303793,  0.173628, -0.037406,  0.319385,  0.05712 , -0.155994,
       -0.029666, -0.092107, -0.040732, -0.020144, -0.253143, -0.223423,
       -0.07731 , -0.279991,  0.050479,  0.095285, -0.162759, -0.297515,
       -0.078398, -0.159061, -0.127364, -0.109226, -0.008495, -0.086207,
       -0.2173  , -0.207722,  0.31633 ,  0.053002,  0.21598 ,  0.222069,
        0.157973, -0.026178, -0.096137,  0.037588, -0.03823 ,  0.310542,
       -0.006801,  0.194841,  0.061619,  0.048722,  0.257044,  0.11697 ,
        0.284349,  0.335223, -0.009635, -0.118353, -0.206398, -0.083029,
       -0.018467, -0.115973, -0.152569,  0.116636,  0.152055,  0.283252,
        0.342408,  0.119983, -0.209392, -0.107931, -0.056978,  0.343351,
        0.123252, -0.202132, -0.233932,  0.383042, -0.004454,  0.139824,
       -0.006091, -0.188954, -0.067386,  0.149429, -0.215873, -0.187914,
        0.087506,  0.046866,  0.004376, -0.112804, -0.365671, -0.462638,
        0.170785, -0.021571])
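
Indexing word2idx with a word that is not in the vocabulary raises a KeyError. Where a fallback is preferable, a small helper can make that explicit (get_embedding is a hypothetical convenience function, not part of dstoolbox):

def get_embedding(word, default=None):
    # Return the embedding row for word, or default for out-of-vocabulary words
    idx = word2idx.get(word)
    return word_embeddings[idx] if idx is not None else default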

Use this data to find the words most similar to a given word


In [7]:
def most_similar(word, topn=5):
    # Reverse mapping: matrix row index -> word
    idx2word = {val: key for key, val in word2idx.items()}

    # Embedding vector of the query word
    vec = word_embeddings[word2idx[word]]

    # Cosine distance (1 - cosine similarity) to every word in the vocabulary
    dists = [cosine(vec, embedding) for embedding in word_embeddings]

    # The smallest distance belongs to the query word itself, so skip rank 0
    top_simils = 1 - np.sort(dists)[1:topn + 1]
    top_args = np.argsort(dists)[1:topn + 1]
    top_words = [idx2word[i] for i in top_args]

    return list(zip(top_words, top_simils))

In [8]:
most_similar('damen')


Out[8]:
[('frauen', 0.59448118821235674),
 ('herren', 0.58320111876510017),
 ('männer', 0.53813647316330504),
 ('heren', 0.50012515907729549),
 ('dünne', 0.49973127394486072)]

In [9]:
most_similar('adidas')


Out[9]:
[('performance', 0.75540151212044893),
 ('originals', 0.72882013475813734),
 ('superstar', 0.72711877150689253),
 ('80s', 0.70762354239093939),
 ('climacool', 0.67462843711748022)]

In [10]:
most_similar('tommy')


Out[10]:
[('hilfiger', 0.8613838934633643),
 ('thommy', 0.81397617179709092),
 ('schumuk', 0.59097077329733827),
 ('hifliger', 0.5783918831505992),
 ('ralph', 0.54271942741674994)]
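
Looping over scipy's cosine calls into Python once per vocabulary entry, which gets slow for large vocabularies. As a sketch of the same ranking (most_similar_fast is hypothetical, not part of dstoolbox), the cosine similarities can instead be computed in a single matrix product over row-normalized embeddings:

def most_similar_fast(word, topn=5):
    # Normalize rows to unit length so that dot products equal cosine similarities
    unit = word_embeddings / np.linalg.norm(word_embeddings, axis=1, keepdims=True)
    sims = unit @ unit[word2idx[word]]
    # Sort by descending similarity; rank 0 is the query word itself, so skip it
    top = np.argsort(-sims)[1:topn + 1]
    idx2word = {val: key for key, val in word2idx.items()}
    return [(idx2word[i], float(sims[i])) for i in top]

This returns the same (word, similarity) pairs as most_similar above, just without the per-word Python loop.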
