In [2]:
import word2vec
# https://github.com/danielfrg/word2vec

Training
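
The three training steps below all read the raw text8 corpus, so it has to be downloaded and unzipped first. If you would rather do that from Python than on the command line, here is a minimal sketch; the mirror URL (http://mattmahoney.net/dc/text8.zip) and the current directory as the target are assumptions:

import os
import urllib.request
import zipfile

# Fetch and extract the text8 corpus if it is not already present.
# The zip contains a single file named 'text8'.
if not os.path.exists('text8'):
    urllib.request.urlretrieve('http://mattmahoney.net/dc/text8.zip',
                               'text8.zip')
    with zipfile.ZipFile('text8.zip') as zf:
        zf.extractall('.')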


In [3]:
# Note: the text8 corpus must be downloaded and unzipped before this step (see the sketch above).
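# word2phrase rewrites the corpus, joining words that often co-occur into
# single tokens (e.g. 'los angeles' -> 'los_angeles')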
%time word2vec.word2phrase('text8', 'text8-phrases', verbose=False)


CPU times: user 21.8 ms, sys: 13.5 ms, total: 35.2 ms
Wall time: 46.1 s

In [4]:
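# learn 100-dimensional vectors from the phrase-joined corpus and save them
# in word2vec's binary format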
%time word2vec.word2vec('text8-phrases', 'text8.bin', size=100, verbose=False)


CPU times: user 408 ms, sys: 159 ms, total: 567 ms
Wall time: 1min 23s

In [5]:
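# train vectors on the raw corpus, then k-means them into 100 classes;
# the word-to-cluster assignments are written as plain text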
%time word2vec.word2clusters('text8', 'text8-clusters.txt', 100, verbose=False)


CPU times: user 410 ms, sys: 161 ms, total: 571 ms
Wall time: 1min 32s

Prediction


In [8]:
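# load the binary vectors; each row of model.vectors is scaled to unit
# length, so dot products below behave as cosine similarities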
%time model = word2vec.load('text8.bin')


CPU times: user 1.97 s, sys: 78.9 ms, total: 2.05 s
Wall time: 2.06 s

In [9]:
model.vectors.shape


Out[9]:
(98331, 100)

In [10]:
model['dog'].shape


Out[10]:
(100,)
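
Because the rows are unit-normalized, the similarity between any two words is just a dot product. A quick hand-rolled check (this assumes the unit-length property noted above):

import numpy as np

# cosine similarity between two words, computed directly
float(np.dot(model['dog'], model['cat']))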

In [11]:
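# indexes into model.vocab of the most similar words, plus their cosine
# similarities (top 10 by default)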
indexes, metrics = model.cosine('dog')

In [12]:
model.vocab[indexes]


Out[12]:
array(['cat', 'cow', 'goat', 'rat', 'pig', 'dogs', 'bird', 'hamster',
       'girl', 'wolf'], 
      dtype='<U78')

In [13]:
metrics


Out[13]:
array([ 0.86270157,  0.84419873,  0.7717624 ,  0.76496641,  0.76001853,
        0.75595624,  0.75547447,  0.75198746,  0.75165411,  0.74832082])
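
The two arrays line up element for element. The package's generate_response helper (documented in its README) combines them into a numpy record array of (word, similarity) pairs; a plain zip works too:

model.generate_response(indexes, metrics).tolist()

# equivalent without the helper
list(zip(model.vocab[indexes], metrics))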

In [14]:
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)

In [15]:
model.vocab[indexes]


Out[15]:
array(['queen', 'prince', 'empress', 'regent', 'wife', 'emperor',
       'monarch', 'aragon', 'son', 'throne'], 
      dtype='<U78')
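
The analogy is plain vector arithmetic. A minimal re-derivation with numpy, again assuming unit-normalized rows; note that, unlike model.analogy, this does not exclude the query words from the ranking:

import numpy as np

# king - man + woman, then rank the whole vocabulary by cosine similarity
target = model['king'] - model['man'] + model['woman']
target /= np.linalg.norm(target)
top = np.argsort(np.dot(model.vectors, target))[::-1][:10]
model.vocab[top]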

In [19]:
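# read back the word-to-cluster assignments written by word2clusters above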
clusters = word2vec.load_clusters('text8-clusters.txt')

In [20]:
clusters.get_words_on_cluster(90)


Out[20]:
array([b'pens', b'dumping', b'appropriately', ..., b'udder',
       b'toxicological', b'skirting'], dtype=object)
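
Clusters can also be looked up per word, or attached to a model so that responses carry a cluster id; both usages follow the package README:

# cluster id for a single word
clusters['dog']

# attach the clusters to the model; generate_response then adds a
# cluster field to each (word, similarity) pair
model.clusters = clusters
indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'], n=10)
model.generate_response(indexes, metrics).tolist()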
