Downloadable version of the GloVe embedding (with a fallback source), as well as downloadable versions of the sparsified GloVe embeddings from our own hosting.
Includes instructions for installing the Levy test-suite, so that any given embedding can be tested.
Also includes functions/tools to play with the loaded embedding (of whatever type).
In [ ]:
RCL_BASE=('http://redcatlabs.com/downloads/'+
'deep-learning-workshop/notebooks/data/'+
'research/ICONIP-2016/')
"""
# http://redcatlabs.com/downloads/deep-learning-workshop/LICENSE
# Files in : ${RCL_BASE} :
# :: These are either as downloaded from GloVe site, or generated by Levy code
# 507206240 Oct 25 2015 2-pretrained-vectors_glove.6B.300d.hkl
# 160569440 May 14 14:57 1-glove-1-billion-and-wiki_window11-lc-36_vectors.2-17.hkl
# :: These are originals - citation desired...
# 53984642 May 15 14:13 sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparse_matrix.hkl
# 148011260 May 15 14:13 sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparsity_recreate.hkl
# 57907373 May 15 02:53 sparse.6B.300d_S-21_2n-shuf-noise-after-norm_4k_.2.01_1-50_5000_GPU-sparse_matrix.hkl
# 147946219 May 15 02:52 sparse.6B.300d_S-21_2n-shuf-noise-after-norm_4k_.2.01_1-50_5000_GPU-sparsity_recreate.hkl
# Spare?
# 122248980 May 2 13:09 misc/sparse.6B.300d_T-21_3500.1024@0.05-GPU-sparse_matrix.hkl
# 447610336 May 2 13:04 misc/sparse.6B.300d_T-21_3500.1024@0.05-GPU-sparsity_recreate.hkl
# 53312127 May 11 14:10 misc/sparse.6B.300d_S-21_2n-shuf_1024@6.75_2000_GPU-sparse_matrix.hkl
# 148027055 May 11 14:10 misc/sparse.6B.300d_S-21_2n-shuf_1024@6.75_2000_GPU-sparsity_recreate.hkl
# 57054795 May 11 12:09 misc/sparse.6B.300d_S-21_2n-shuf_4096@1.50_2000_GPU-sparse_matrix.hkl
# 147997824 May 11 12:09 misc/sparse.6B.300d_S-21_2n-shuf_4096@1.50_2000_GPU-sparsity_recreate.hkl
"""
import os, requests

def get_embedding_file( hkl ):
    if not os.path.isfile(os.path.join('data', hkl)):
        # ... requests.get( RCL_BASE + hkl)
        pass
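The stub above only checks whether the file is already present locally. A minimal sketch of how the download itself might be filled in (assuming the data/ directory exists, and that RCL_BASE + hkl resolves to each of the filenames listed in the docstring above) could look like this:
In [ ]:
def get_embedding_file(hkl):
    # Sketch only : fetch 'hkl' from RCL_BASE into ./data/ unless it is already present
    local_path = os.path.join('data', hkl)
    if not os.path.isfile(local_path):
        response = requests.get(RCL_BASE + hkl, stream=True)
        response.raise_for_status()
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024*1024):
                f.write(chunk)
    return local_path

# Hypothetical usage, with filenames taken from the listing above :
#get_embedding_file('2-pretrained-vectors_glove.6B.300d.hkl')
#get_embedding_file('sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparse_matrix.hkl')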
See : https://levyomer.files.wordpress.com/2015/03/improving-distributional-similarity-tacl-2015.pdf

To download the test suite, please run the script download-tests.bash.
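If it is more convenient to stay inside the notebook, the same script can be invoked via subprocess (a sketch only : the assumption that download-tests.bash lives in the ../omerlevy checkout referenced by the evaluation commands below may need adjusting):
In [ ]:
import subprocess

# Sketch only : the location of download-tests.bash is an assumption - adjust to wherever the script actually lives
subprocess.call(["bash", "download-tests.bash"], cwd="../omerlevy")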
In [ ]:
from __future__ import print_function
import os, subprocess

def test_embedding_file(vectors_txt, vocab_max=131072):
    # Do we need to process VECTORS_FILE -> { VECTORS_WORDS, VECTORS_NPY }?
    # Answer = YES : the .words is required, and is used to create .npy and .vocab
    vectors_txt_words = '%s.words' % (vectors_txt,)
    if not os.path.isfile(vectors_txt_words) or os.stat(vectors_txt).st_mtime > os.stat(vectors_txt_words).st_mtime:
        print("Reading %s" % (vectors_txt,))
        # This is just a copy of the 'text file' with the vocab_size and embedding_size prepended :
        #   echo "131072 300" > ${VECTORS_WORDS}
        #   head -131072 ${VECTORS_FILE} >> ${VECTORS_WORDS}
        with open(vectors_txt) as fin:
            first_line = fin.readline()
            embedding_dim = len(first_line.strip().split()) - 1
            vocab_size = len(fin.readlines()) + 1   # Ouch! - reads the whole file just to find its length
        if vocab_size > vocab_max:
            vocab_size = vocab_max

        print("Building %s" % (vectors_txt_words,))
        with open(vectors_txt) as fin:
            with open(vectors_txt_words, 'wt') as fout:
                # Write the first line, which, ironically, will be discarded by the omerlevy code
                fout.write("%d %d\n" % (vocab_size, embedding_dim))
                # And copy over at most vocab_size lines of the original file
                for i, line in enumerate(fin.readlines()):
                    if i >= vocab_size:
                        break
                    fout.write(line)
        print("Built %s as %d %d-d vectors" % (vectors_txt_words, vocab_size, embedding_dim))

    vectors_txt_npy   = '%s.npy' % (vectors_txt_words,)
    vectors_txt_vocab = '%s.vocab' % (vectors_txt_words,)
    if not os.path.isfile(vectors_txt_npy) or os.stat(vectors_txt_words).st_mtime > os.stat(vectors_txt_npy).st_mtime:
        print("Building %s and %s" % (vectors_txt_npy, vectors_txt_vocab,))
        # Sadly, we can't just invoke this as a python function - need to go via the shell...
        subprocess.call(["python", "../omerlevy/hyperwords/text2numpy.py", vectors_txt_words])
        print("Built %s and %s" % (vectors_txt_npy, vectors_txt_vocab,))

    def run_word_test(test_str, test_cmd):
        print(" %s" % ((test_str + ' '*30)[:30],), end='')
        #subprocess.call(test_cmd)
        try:
            res = subprocess.check_output(test_cmd, stderr=subprocess.STDOUT,)
            print(" : %s" % (res.strip(),))
        except subprocess.CalledProcessError as e:
            print(" : ERROR : %s" % (test_str,))
            print(e)

    def run_word_similarity(test_str, test_set):
        test_cmd = [
            "python", "../omerlevy/hyperwords/ws_eval.py", "VECTORS", vectors_txt,
            "../omerlevy/testsets/ws/%s" % (test_set,)
        ]
        run_word_test(test_str, test_cmd)

    def run_word_analogy(test_str, test_set):
        #python ../omerlevy/hyperwords/analogy_eval.py VECTORS ${VECTORS_FILE} ../omerlevy/testsets/analogy/google.txt
        test_cmd = [
            "python", "../omerlevy/hyperwords/analogy_eval.py", "VECTORS", vectors_txt,
            "../omerlevy/testsets/analogy/%s" % (test_set,)
        ]
        run_word_test(test_str, test_cmd)

    if True:
        print("Word Similarity Tests (~5 seconds each)")
        run_word_similarity("WS353 Similarity ",  "ws353_similarity.txt")
        run_word_similarity("WS353 Relatedness ", "ws353_relatedness.txt")
        run_word_similarity("Bruni MEN ",         "bruni_men.txt")
        run_word_similarity("Radinsky M.Turk ",   "radinsky_mturk.txt")
        run_word_similarity("Luong Rare Words ",  "luong_rare.txt")

    if True:
        print("Word Analogy Tests (~60 seconds each)")
        run_word_analogy("Google Analogy ", "google.txt")
        run_word_analogy("MSR Analogy ",    "msr.txt")
In [ ]:
In [ ]:
import numpy as np
import hickle
In [ ]:
#embedding_file = '../data/2-pretrained-vectors_glove.6B.300d.hkl'
#embedding_file = '../data/1-glove-1-billion-and-wiki_window11-lc-36_vectors.2-17.hkl'
#embedding_file = '../data/lloyds_normed_8.hkl'
# 1024-d embeddings : sparse and recreated
embedding_file = '../data/sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparse_matrix.hkl'
#embedding_file = '../data/sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparsity_recreate.hkl'
# 4096-d embeddings : sparse and recreated
#embedding_file = '../data/sparse.6B.300d_S-21_2n-shuf-noise-after-norm_4k_.2.01_1-50_5000_GPU-sparse_matrix.hkl'
#embedding_file = '../data/sparse.6B.300d_S-21_2n-shuf-noise-after-norm_4k_.2.01_1-50_5000_GPU-sparsity_recreate.hkl'
In [ ]:
d = hickle.load(embedding_file)
vocab, embedding = d['vocab'], d['embedding']
vocab_orig = d.get('vocab_orig', vocab)

dictionary      = dict( (word, i) for i, word in enumerate(vocab) if i < len(embedding) )
dictionary_orig = dict( (word, i) for i, word in enumerate(vocab_orig) if i < len(embedding) )

print("Embedding loaded :", embedding.shape)   # (vocab_size, embedding_dimension) = (rows, columns)

embedding_normed = embedding / np.linalg.norm(embedding, axis=1)[:, np.newaxis]

def save_embedding_for_tests(vocab, embedding, save_filename_txt='../data/tmp.embedding.txt'):
    with open(save_filename_txt, 'wb') as f:
        for l in range(0, embedding.shape[0]):
            f.write("%s %s\n" % (vocab[l], ' '.join([ ('0' if x==0. else ("%.6f" % (x,))) for x in embedding[l, :].tolist() ]), ))
    print("Saved to %s" % (save_filename_txt,))

def save_embedding_to_hickle(vocab, embedding_save, save_filename_hkl, vocab_orig=None):
    print("About to save to %s" % (save_filename_hkl,))
    d = dict(
        vocab=vocab,
        vocab_orig=vocab if vocab_orig is None else vocab_orig,   # fall back to vocab if no original ordering is given
        embedding=embedding_save,
    )
    hickle.dump(d, save_filename_hkl, mode='w', compression='gzip')
    print("Saved to %s" % (save_filename_hkl,))
In [ ]:
vocab[0]
entries = [ x for x in embedding[0].tolist() if x!=0.0 ]
len(entries)
#45

for w in 'the iraq baghdad uk london criminal apple some hypothesis maximal innocuous'.split(' '):
    i = dictionary[w]
    entries = [ x for x in embedding[i].tolist() if x!=0.0 ]
    print("%20s @%6d len=%d" % (w, dictionary_orig[w], len(entries),))
#          the @     0 len=18
#         some @    60 len=18
#       london @   266 len=91
#           uk @   448 len=82
#         iraq @   606 len=113
#     criminal @  1449 len=104
#        apple @  2046 len=112
#      baghdad @  2320 len=116
#   hypothesis @  6957 len=136
#      maximal @ 27962 len=107
#    innocuous @ 30111 len=86

if False:
    # Look at the per-position best words
    for i in range(0, embedding.shape[1], 10):
        best_words_j = np.argsort( -embedding[:, i ] )[0:10]
        for j in best_words_j:
            print("%4i -> %s" % (i, vocab[j],))
        print('')

if False:
    #i=2000
    values = [ x for x in (-np.sort( -embedding[i] )).tolist() if x>0. ]
    print("values: [" + ', '.join([ ('%.4f' % (x,)) for x in values ]) + ']')
    #values: [1.1442, 0.9337, 0.9333, 0.9257, 0.7520, 0.5529, 0.4818, 0.4740, 0.4568, 0.4554, 0.4434, 0.4419, 0.4334, 0.4187, 0.4175, 0.4068, 0.4005, 0.3989, 0.3698, 0.3421, 0.3206, 0.3151, 0.3150, 0.3120, 0.3119, 0.3067, 0.3010, 0.2948, 0.2853, 0.2828, 0.2816, 0.2815, 0.2799, 0.2793, 0.2764, 0.2714, 0.2636, 0.2570, 0.2507, 0.2487, 0.2336, 0.2336, 0.2335, 0.2328, 0.2325, 0.2323, 0.2255, 0.2227, 0.2227, 0.2226, 0.2208, 0.2178, 0.2159, 0.2134, 0.2067, 0.2049, 0.1947, 0.1935, 0.1932, 0.1926, 0.1921, 0.1914, 0.1897, 0.1894, 0.1832, 0.1782, 0.1766, 0.1730, 0.1714, 0.1683, 0.1662, 0.1638, 0.1629, 0.1602, 0.1568, 0.1561, 0.1452, 0.1419, 0.1399, 0.1372, 0.1370, 0.1352, 0.1350, 0.1342, 0.1334, 0.1334, 0.1302, 0.1289, 0.1268, 0.1243, 0.1230, 0.1211, 0.1192, 0.1113, 0.1051]
    print("changes: [" + ', '.join([ ('%.1f' % (values[i+1]/values[i]*100.,)) for i in range(0, len(values)-1) ]) + ']')
    #changes: [81.6, 100.0, 99.2, 81.2, 73.5, 87.1, 98.4, 96.4, 99.7, 97.3, 99.7, 98.1, 96.6, 99.7, 97.4, 98.4, 99.6, 92.7, 92.5, 93.7, 98.3, 100.0, 99.0, 100.0, 98.3, 98.1, 97.9, 96.8, 99.1, 99.6, 100.0, 99.4, 99.8, 99.0, 98.2, 97.1, 97.5, 97.6, 99.2, 93.9, 100.0, 100.0, 99.7, 99.9, 99.9, 97.1, 98.8, 100.0, 100.0, 99.2, 98.6, 99.1, 98.9, 96.9, 99.1, 95.0, 99.4, 99.9, 99.7, 99.8, 99.6, 99.1, 99.8, 96.7, 97.3, 99.1, 98.0, 99.1, 98.2, 98.8, 98.6, 99.4, 98.3, 97.9, 99.5, 93.1, 97.7, 98.6, 98.1, 99.8, 98.7, 99.9, 99.4, 99.4, 100.0, 97.6, 99.0, 98.4, 98.0, 99.0, 98.4, 98.5, 93.4, 94.4]

w = 'motorcycle'
w_i = dictionary[w]
#top_i = np.argmax(embedding[w_i])
good_i = np.argsort( -embedding[w_i] )
for i in range(0, 10):
    best_words_j = np.argsort( -embedding[:, good_i[i] ] )[0:12]
    #for j in best_words_j:
    #    print("%s" % (vocab[j],))
    #print('')
    print("%s" % (', '.join( [ vocab[j] for j in best_words_j ] ), ))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
def vector_for(w):
    w_i = dictionary[w]
    return embedding[w_i]

def l2_normed(e):
    return e / np.sqrt( np.dot(e, e) )

def cosine(a, b):
    return np.dot(l2_normed(a), l2_normed(b))

def top_senses_for(e):
    good_i = np.argsort( -e )
    for i in range(0, 10):
        best_words_j = np.argsort( -embedding[:, good_i[i] ] )[0:12]
        print("%s" % (', '.join( [ vocab[j] for j in best_words_j ] ), ))

def closest_to(e, n=10):
    closest = np.argsort( - np.dot(embedding_normed, l2_normed(e) ) )
    return "%s" % (', '.join( [ vocab[j] for j in closest[0:n] ] ), )

def count_positive(e):
    return len( [ x for x in e.tolist() if x>0.0 ] )

def nonzero_positions(e):
    return [ i for (i, x) in enumerate(e.tolist()) if x!=0.0 ]

def nonneg(e):
    return np.maximum(0, e)

def closest_dist(s):
    ab, xy = s.split('=')
    (a, b), (x, y) = ab.split(':'), xy.split(':')
    print("%s is to %s as %s is to ?%s? " % (a, b, x, y,))
    (a, b, x, y) = map(vector_for, [a, b, x, y])   # Convert the words to vectors
    print('            x+b-a = %s' % (closest_to( x + b - a ),))
    print('          [x+b-a] = %s' % (closest_to( nonneg(x + b - a) ),))
    print('          x+[b-a] = %s' % (closest_to( x + nonneg(b - a) ),))
    print('          [x-a]+b = %s' % (closest_to( nonneg(x - a) + b ),))
    print('    [2x-a]+[2b-a] = %s' % (closest_to( nonneg(2*x - a) + nonneg(2*b - a) ),))
    print('  x+[b-a]+b+[x-a] = %s' % (closest_to( x + nonneg(b - a) + b + nonneg(x - a) ),))
In [ ]:
top_senses_for(vector_for('motorbike'))
man = vector_for('man')
woman = vector_for('woman')
king = vector_for('king')
queen = vector_for('queen')
#top_senses_for(man)
#top_senses_for(woman)
#top_senses_for(king)
#top_senses_for(queen)
#top_senses_for(man * woman) # Intersection
#top_senses_for(man + woman) # Union
#top_senses_for(man - woman) # ??
closest_to(man)
#man, woman, girl, person, men, teenager, she, friend, he, father, her, boy, someone, mother, him, his, victim, son, who, guy
closest_to(woman)
#woman, man, girl, mother, teenager, daughter, wife, women, her, person, she, girlfriend, friend, men, husband, widow, couple, boy, someone, victim
closest_to(king)
#king, queen, henry, mswati, mongkut, eirik, charles, vajiravudh, thoden, wenceslaus, zvonimir, athelstan, vladislaus, thelred, gojong, prince, jayavarman, kalkaua, sweyn, pomare
closest_to(queen)
#queen, princess, elizabeth, king, margrethe, empress, lady, sister, prince, sirikit, mary, cixi, monarch, daughter, duchess, olten, mother, infanta, rania, widow
closest_dist('pound:england=franc:france')
england,pound,america,dollar = map(vector_for, 'england pound america dollar'.split())
curr = england,pound,america,dollar = map(vector_for, 'england pound america dollar'.split())
map(count_positive, curr)
#[84, 126, 94, 134]
map(count_positive, [ england*pound, america*dollar, england*america, pound*dollar])
#[12, 14, 17, 56]
total = england+pound+america+dollar
map(count_positive, [ england*101-100*total, pound*101-100*total, america*101-100*total, dollar*101-100*total])
In [ ]:
save_embedding_for_tests(vocab, embedding, save_filename_txt='../data/tmp.embedding.txt')
In [ ]:
test_embedding_file('../data/tmp.embedding.txt', vocab_max=131072)
In [ ]:
#save_embedding_for_tests(vocab, embedding, save_filename_txt='../data/lloyds_normed_8.txt')
#test_embedding_file('../data/lloyds_normed_8.txt', vocab_max=131072)
In [ ]:
In [ ]:
len(nonzero_positions(embedding[10202]))
In [ ]:
# Let's make a *random* transformation matrix to 'recreate' an approximate 300-d dense embedding
A = np.random.normal(loc=0.0, scale=1.0, size=(embedding.shape[1], 300))
embedding_reconstructed = np.dot(embedding, A) # Project back down to 300d using random matrix
embedding_reconstructed.shape
In [ ]:
embedding_reconstructed = embedding/5.0 # Check that scaling is irrelevant (ACTUALLY, THIS CHECK IS BOGUS)
In [ ]:
embedding_reconstructed = np.where(embedding>0,1,0) # Binarized
In [ ]:
embedding_reconstructed = np.where(embedding>0,0,1) # Binarized (inverted) is TERRIBLE
In [ ]:
def construct_d(a_i, b_i, c_i, emb):
    a = emb[a_i]
    b = emb[b_i]
    c = emb[c_i]
    d = c + b - a   # This is the standard additive analogy construction
    return np.dot(emb, d.T)   # Return the score of this d vs all the embedding vectors

# Let's build an analogy tester right here...
def test_analogies(test_set="msr", emb=embedding, construction_fn=construct_d):
    trials, trials_w, total, total_possible = [], [], 0, 0
    with open("../omerlevy/testsets/analogy/%s.txt" % test_set) as f:   # 'google' or 'msr'
        for trial in f.readlines():
            t_w = [ w for w in trial.strip().split() ]
            t = [ dictionary.get(w, None) for w in t_w ]
            total += 1
            if None not in t:
                trials_w.append(t_w)
                trials.append(t)
                total_possible += 1
    print(trials_w[0], trials[0])

    count, correct = 0, 0
    for i, trial in enumerate(trials):
        if i % 10 > 0:   # 10x thinning factor for speed (use a modulus of 1 for full accuracy)
            continue
        d_score = construction_fn( trial[0], trial[1], trial[2], emb)
        # Set the scores for the original three vectors to useless values
        d_score[ trial[0] ] = d_score[ trial[1] ] = d_score[ trial[2] ] = -1
        # Now find the argmax score:
        d_i = np.argmax( d_score )
        #print( d_score.shape, d_i, i )
        if False:   # Set to True for per-trial output
            w = trials_w[i]
            print("'%s:%s=%s:%s'? :: %s" % ( w[0], w[1], w[2], vocab[ d_i ],
                                             ('WIN' if trial[3]==d_i else "FAIL"),
                                           ))
        count += 1
        if trial[3]==d_i:
            correct += 1
        if i % 250==0:
            print("At %d : %.2f%%" % (count, 100.*correct/count))
    print("Local(%s) final : %.2f%% using %d, which is %.2f%% of total" % (test_set,
          100.*correct/count, count, 100.*correct/count*total_possible/total,))
In [ ]:
test_analogies(emb=embedding_normed) # , test_set="google")
#NO test_analogies(emb=embedding) # , test_set="google")
In [ ]:
def remax(a):
    return a / np.amax(a)

def a_with_b(a, b):
    return remax(a + b)

def a_without_b(a, b):
    c = a.copy()
    c[ b>0 ] = 0.
    return remax(c)

def closest_dist(s):
    ab, xy = s.split('=')
    (a, b), (x, y) = ab.split(':'), xy.split(':')
    print("%s is to %s as %s is to ?%s? " % (a, b, x, y,))
    (a, b, x, y) = map(vector_for, [a, b, x, y])   # Convert the words to vectors
    print('            x+b-a = %s' % (closest_to( x + b - a ),))
    print('          [x+b-a] = %s' % (closest_to( nonneg(x + b - a) ),))
    print('          x+[b-a] = %s' % (closest_to( x + nonneg(b - a) ),))
    print('          [x-a]+b = %s' % (closest_to( nonneg(x - a) + b ),))
    print('    [2x-a]+[2b-a] = %s' % (closest_to( nonneg(2*x - a) + nonneg(2*b - a) ),))
    print('  x+[b-a]+b+[x-a] = %s' % (closest_to( x + nonneg(b - a) + b + nonneg(x - a) ),))
    setish1 = a_with_b( a_without_b( x, a_without_b(a, b) ), a_without_b(b, a))
    print('          setish1 = %s' % (closest_to( setish1 ),))
    setish2 = a_with_b( a_without_b( b, a_without_b(a, x) ), a_without_b(x, a))
    print('          setish2 = %s' % (closest_to( setish2 ),))

#closest_dist('pound:england=franc:france')
closest_dist('london:england=paris:france')
closest_dist('smallest:smaller=smoothest:smoother')   # Better with set ops
#closest_dist('great:greater=classy:classier')    # No idea
#closest_dist('richest:richer=meanest:meaner')    # No idea
#closest_dist('seem:seems=develop:develops')
closest_dist('few:fewer=friendly:friendlier')

#### All MSR sets with 'fast' in them are bogus
##closest_dist('weak:weakest=fastest:fast') !!!

e = vector_for('england')
l = vector_for('london')
f = vector_for('france')
p = vector_for('paris')
#closest_to( p+e-l )
p_poss1 = a_with_b( a_without_b( p, a_without_b(l, e) ), a_without_b(e, l))
p_poss2 = a_with_b( a_without_b( e, a_without_b(l, p) ), a_without_b(p, l))
print( closest_to( p_poss1 ) )
print( closest_to( p_poss2 ) )
In [ ]:
def construct_d_setish(a_i, b_i, c_i, emb):
    a = emb[a_i]
    b = emb[b_i]
    c = emb[c_i]
    #d = c + (b-a)   # This is the standard additive analogy construction
    #d = a_with_b( a_without_b( c, a_without_b(a, b) ), a_without_b(b, a))   # This is set-ish v1
    #d = a_with_b( a_without_b( b, a_without_b(a, c) ), a_without_b(c, a))   # This is set-ish v2
    d = a_with_b( a_with_b( c, a_without_b(b, a) ), a_with_b( b, a_without_b(c, a)))   # This is set-ish v3
    d += c + (b-a)
    return np.dot(emb, d.T)   # Return the score of this d vs all the embedding vectors

test_analogies(emb=embedding_normed, construction_fn=construct_d_setish, test_set="google")
In [13]:
# =0,=1, A/B,A+B,B/A, A/C,A+C,C/A
a = np.array( [0.,1., 1.,1.,0., 1.,1.,0.])
b = np.array( [0.,1., 0.,1.,1., 0.,0.,0.])
c = np.array( [0.,1., 0.,0.,0., 0.,1.,1.])
d = np.array( [0.,1., 0.,0.,1., 0.,0.,1.]) # Should this de-emphasise 'a', for instance?
d0 = c + (b-a)
d1 = a_with_b( a_without_b( c, a_without_b(a, b) ), a_without_b(b, a)) # This is set-ish v1
d2 = a_with_b( a_without_b( b, a_without_b(a, c) ), a_without_b(c, a)) # This is set-ish v2
d3 = a_with_b(a_with_b( c, a_without_b(b, a) ), a_with_b( b, a_without_b(c, a))) # This is set-ish v3
d0,d3, d1,d2
Out[13]:
In [ ]:
embedding_reconstructed_normed = embedding_reconstructed / np.linalg.norm(embedding_reconstructed, axis=1)[:, np.newaxis]
save_embedding_for_tests(vocab, embedding_reconstructed_normed, save_filename_txt='../data/tmp.embedding.txt')
test_embedding_file('../data/tmp.embedding.txt', vocab_max=131072)
In [ ]:
vocab[ 0:10 ]
In [ ]:
embedding[0, 0:100]
In [ ]:
embedding_reconstructed[0, 0:100]
In [ ]: