In [2]:
import word2vec
# https://github.com/danielfrg/word2vec

Training
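
The three training steps below all read the raw text8 corpus, so it has to be downloaded and unzipped first. If you would rather do that from Python than on the command line, here is a minimal sketch; the mirror URL (http://mattmahoney.net/dc/text8.zip) and the current directory as the target are assumptions:

import os
import urllib.request
import zipfile

# Fetch and extract the text8 corpus if it is not already present.
# The zip contains a single file named 'text8'.
if not os.path.exists('text8'):
    urllib.request.urlretrieve('http://mattmahoney.net/dc/text8.zip',
                               'text8.zip')
    with zipfile.ZipFile('text8.zip') as zf:
        zf.extractall('.')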


In [3]:
# Note: the text8 corpus must be downloaded and unzipped before this step (see the sketch above).
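# word2phrase rewrites the corpus, joining words that often co-occur into
# single tokens (e.g. 'los angeles' -> 'los_angeles')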
%time word2vec.word2phrase('text8', 'text8-phrases', verbose=False)


CPU times: user 21.8 ms, sys: 13.5 ms, total: 35.2 ms
Wall time: 46.1 s

In [4]:
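# learn 100-dimensional vectors from the phrase-joined corpus and save them
# in word2vec's binary format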
%time word2vec.word2vec('text8-phrases', 'text8.bin', size=100, verbose=False)


CPU times: user 408 ms, sys: 159 ms, total: 567 ms
Wall time: 1min 23s

In [5]:
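# train vectors on the raw corpus, then k-means them into 100 classes;
# the word-to-cluster assignments are written as plain text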
%time word2vec.word2clusters('text8', 'text8-clusters.txt', 100, verbose=False)


CPU times: user 410 ms, sys: 161 ms, total: 571 ms
Wall time: 1min 32s

Prediction


In [8]:
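# load the binary vectors; each row of model.vectors is scaled to unit
# length, so dot products below behave as cosine similarities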
%time model = word2vec.load('text8.bin')


CPU times: user 1.97 s, sys: 78.9 ms, total: 2.05 s
Wall time: 2.06 s

In [9]:
model.vectors.shape


Out[9]:
(98331, 100)

In [10]:
model['dog'].shape


Out[10]:
(100,)
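
Because the rows are unit-normalized, the similarity between any two words is just a dot product. A quick hand-rolled check (this assumes the unit-length property noted above):

import numpy as np

# cosine similarity between two words, computed directly
float(np.dot(model['dog'], model['cat']))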

In [11]:
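# indexes into model.vocab of the most similar words, plus their cosine
# similarities (top 10 by default)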
indexes, metrics = model.cosine('dog')

In [12]:
model.vocab[indexes]


Out[12]:
array(['cat', 'cow', 'goat', 'rat', 'pig', 'dogs', 'bird', 'hamster',
       'girl', 'wolf'], 
      dtype='<U78')

In [13]:
metrics


Out[13]:
array([ 0.86270157,  0.84419873,  0.7717624 ,  0.76496641,  0.76001853,
        0.75595624,  0.75547447,  0.75198746,  0.75165411,  0.74832082])
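
The two arrays line up element for element. The package's generate_response helper (documented in its README) combines them into a numpy record array of (word, similarity) pairs; a plain zip works too:

model.generate_response(indexes, metrics).tolist()

# equivalent without the helper
list(zip(model.vocab[indexes], metrics))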

In [14]:
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)

In [15]:
model.vocab[indexes]


Out[15]:
array(['queen', 'prince', 'empress', 'regent', 'wife', 'emperor',
       'monarch', 'aragon', 'son', 'throne'], 
      dtype='<U78')
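
The analogy is plain vector arithmetic. A minimal re-derivation with numpy, again assuming unit-normalized rows; note that, unlike model.analogy, this does not exclude the query words from the ranking:

import numpy as np

# king - man + woman, then rank the whole vocabulary by cosine similarity
target = model['king'] - model['man'] + model['woman']
target /= np.linalg.norm(target)
top = np.argsort(np.dot(model.vectors, target))[::-1][:10]
model.vocab[top]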

In [19]:
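# read back the word-to-cluster assignments written by word2clusters above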
clusters = word2vec.load_clusters('text8-clusters.txt')

In [20]:
clusters.get_words_on_cluster(90)


Out[20]:
array([b'pens', b'dumping', b'appropriately', ..., b'udder',
       b'toxicological', b'skirting'], dtype=object)
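
Clusters can also be looked up per word, or attached to a model so that responses carry a cluster id; both usages follow the package README:

# cluster id for a single word
clusters['dog']

# attach the clusters to the model; generate_response then adds a
# cluster field to each (word, similarity) pair
model.clusters = clusters
indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'], n=10)
model.generate_response(indexes, metrics).tolist()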
