In [1]:
%pylab inline
import matplotlib.pyplot as plt
from word2vec import Corpus, CBOW


Populating the interactive namespace from numpy and matplotlib

In [2]:
corpus = Corpus('data/', vocab_size=8000, corpus_file='corpus')
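
The corpus object can be sanity-checked before training. A minimal sketch, using only the word_to_idx and word_freq attributes that appear later in this notebook:

from operator import itemgetter

print('vocabulary size: %d' % len(corpus.word_to_idx))
# Five most frequent words with their counts.
for word, freq in sorted(corpus.word_freq.items(), key=itemgetter(1), reverse=True)[:5]:
    print('%s: %d' % (word, freq))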

In [4]:
vector_size = 150
trainer = CBOW(vector_size, corpus)

In [5]:
trainer.load('cbow.npy')

In [6]:
import numpy as np
from operator import itemgetter
# Adapted from https://github.com/mhjabreel/word2vec_theano/blob/master/w2v.ipynb
word_vectors = trainer.W_in.eval()
top_words = sorted(corpus.word_freq.items(), key=itemgetter(1), reverse=True)[:100]

visualize_words = [word for word, freq in top_words]

visualize_idx = [corpus.word_to_idx[word] for word in visualize_words]
visualize_vecs = word_vectors[visualize_idx, :]
# PCA via SVD: centre the vectors, compute their covariance,
# then project onto the top two principal components.
centered = visualize_vecs - np.mean(visualize_vecs, axis=0)
covariance = centered.T.dot(centered) / len(visualize_idx)
U, S, V = np.linalg.svd(covariance)
coord = centered.dot(U[:, 0:2])

fig = plt.figure()
fig.set_size_inches((16, 10))
for i, word in enumerate(visualize_words):
    plt.text(coord[i, 0], coord[i, 1], word, bbox=dict(facecolor='green', alpha=0.1))
    
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
plt.show()

[Figure: 2-D PCA projection of the embeddings of the 100 most frequent words]

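Since the 2-D projection keeps only two principal components, it is also worth probing the full vectors directly. A minimal cosine-similarity neighbour lookup, sketched from the word_vectors matrix and corpus.word_to_idx mapping used above (the query word is assumed to be in the vocabulary):

def nearest_neighbours(word, k=5):
    # Invert the word -> index mapping used above.
    idx_to_word = dict((i, w) for w, i in corpus.word_to_idx.items())
    query = word_vectors[corpus.word_to_idx[word]]
    # Cosine similarity of the query against every embedding row.
    norms = np.linalg.norm(word_vectors, axis=1) * np.linalg.norm(query)
    sims = word_vectors.dot(query) / np.maximum(norms, 1e-8)
    ranked = np.argsort(-sims)[1:k + 1]  # drop the query word itself
    return [(idx_to_word[i], sims[i]) for i in ranked]

nearest_neighbours('king')
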
In [7]:
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle
# Despite the .npy extension, this file holds a pickled list of losses.
with open('cbow_losses.npy', 'rb') as f:
    loss_changes = pickle.load(f)[::100]  # keep every 100th loss value

t = list(range(len(loss_changes)))

plt.plot(t, loss_changes, 'r')
plt.xlabel('checkpoint (every 100 steps)')
plt.ylabel('loss')
plt.show()

[Figure: training loss, sampled every 100 steps]

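The sampled losses are still noisy, and a moving average makes the trend easier to read. A minimal sketch using np.convolve (the window size of 50 is an arbitrary choice):

window = 50
kernel = np.ones(window) / window
smoothed = np.convolve(loss_changes, kernel, mode='valid')

plt.plot(smoothed, 'b')
plt.xlabel('checkpoint (every 100 steps)')
plt.ylabel('smoothed loss')
plt.show()
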
In [8]:
def export_embeddings(corpus, word_vectors, file_name):
    """Write one 'word v1 v2 ... vn' line per vocabulary entry."""
    with open(file_name, 'w') as f:
        for word, idx in corpus.word_to_idx.items():
            f.write('%s %s\n' % (word, ' '.join(map(str, word_vectors[idx]))))

In [9]:
def export_dict(corpus, file_name):
    """Write one 'word frequency' line per vocabulary entry."""
    with open(file_name, 'w') as f:
        for word, freq in corpus.word_freq.items():
            f.write('%s %d\n' % (word, freq))

In [10]:
export_embeddings(corpus, word_vectors, 'cbow_embeddings.txt')
export_dict(corpus, 'cbow_dict.txt')
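
As a round-trip check before running the external evaluation scripts, the exported file can be read back and compared against the in-memory matrix. A sketch assuming the one-word-per-line format written by export_embeddings above:

def load_embeddings(file_name):
    vectors = {}
    with open(file_name) as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array(parts[1:], dtype=float)
    return vectors

loaded = load_embeddings('cbow_embeddings.txt')
some_word = next(iter(corpus.word_to_idx))
assert np.allclose(loaded[some_word], word_vectors[corpus.word_to_idx[some_word]])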

In [11]:
%%bash
python evaluation/GloVe/eval/python/evaluate.py --vocab_file cbow_dict.txt --vectors_file cbow_embeddings.txt
python evaluation/eval-word-vectors/all_wordsim.py cbow_embeddings.txt evaluation/eval-word-vectors/data/word-sim/


capital-common-countries.txt:
ACCURACY TOP1: 0.83% (2/240)
capital-world.txt:
ACCURACY TOP1: 0.66% (1/152)
currency.txt:
ACCURACY TOP1: 7.14% (2/28)
city-in-state.txt:
ACCURACY TOP1: 0.23% (1/443)
family.txt:
ACCURACY TOP1: 0.00% (0/182)
gram1-adjective-to-adverb.txt:
ACCURACY TOP1: 0.00% (0/420)
gram2-opposite.txt:
ACCURACY TOP1: 4.17% (3/72)
gram3-comparative.txt:
ACCURACY TOP1: 0.18% (1/552)
gram4-superlative.txt:
ACCURACY TOP1: 0.00% (0/132)
gram5-present-participle.txt:
ACCURACY TOP1: 0.00% (0/462)
gram6-nationality-adjective.txt:
ACCURACY TOP1: 0.68% (5/737)
gram7-past-tense.txt:
ACCURACY TOP1: 0.00% (0/702)
gram8-plural.txt:
ACCURACY TOP1: 0.59% (3/506)
gram9-plural-verbs.txt:
ACCURACY TOP1: 0.00% (0/272)
Questions seen/total: 25.07% (4900/19544)
Semantic accuracy: 0.57%  (6/1045)
Syntactic accuracy: 0.31%  (12/3855)
Total accuracy: 0.37%  (18/4900)
=================================================================================
Serial              Dataset       Num Pairs       Not found             Rho
=================================================================================
     1    EN-SIMLEX-999.txt             999             387          0.0661
     2        EN-YP-130.txt             130              85         -0.0063
     3      EN-VERB-143.txt             144               7         -0.0114
     4         EN-MC-30.txt              30              20          0.0667
     5     EN-MTurk-771.txt             771             305          0.1847
     6   EN-RW-STANFORD.txt            2034            1910          0.2226
     7    EN-WS-353-SIM.txt             203              65          0.3420
     8     EN-MTurk-287.txt             287             116          0.3973
     9    EN-WS-353-ALL.txt             353             109          0.3356
    10     EN-MEN-TR-3k.txt            3000            1728          0.3091
    11         EN-RG-65.txt              65              53          0.4545
    12    EN-WS-353-REL.txt             252              70          0.2358
Sum of scores:          2.5968
Vectors read from: cbow_embeddings.txt 
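
The analogy benchmark above answers questions of the form "a is to b as c is to ?" by nearest-neighbour search over vector offsets. A minimal sketch of that query on these embeddings (a simplified version, not the evaluation script's exact implementation; all three words are assumed to be in the vocabulary):

def analogy(a, b, c, k=1):
    w2i = corpus.word_to_idx
    i2w = dict((i, w) for w, i in w2i.items())
    # Normalise rows so dot products are cosine similarities.
    lengths = np.linalg.norm(word_vectors, axis=1, keepdims=True)
    normed = word_vectors / np.maximum(lengths, 1e-8)
    query = normed[w2i[b]] - normed[w2i[a]] + normed[w2i[c]]
    sims = normed.dot(query)
    sims[[w2i[a], w2i[b], w2i[c]]] = -np.inf  # never return an input word
    return [i2w[i] for i in np.argsort(-sims)[:k]]

analogy('man', 'king', 'woman')  # ideally returns ['queen']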
