Compressing Word Embeddings

This notebook provides a downloadable version of the GloVe embedding (with a fallback source), as well as downloadable versions of sparsified GloVe embeddings from our own hosting.

It also includes instructions for installing the Omer Levy test suite, so that any given embedding can be evaluated.

Finally, it provides functions/tools for exploring the loaded embedding (of whatever type).

Download Pre-Built Embeddings

The following needs to be Pythonized :


In [ ]:
RCL_BASE=('http://redcatlabs.com/downloads/'+
          'deep-learning-workshop/notebooks/data/'+
          'research/ICONIP-2016/')

"""
# http://redcatlabs.com/downloads/deep-learning-workshop/LICENSE

# Files in : ${RCL_BASE} :

# :: These are either as downloaded from GloVe site, or generated by Levy code
# 507206240 Oct 25  2015 2-pretrained-vectors_glove.6B.300d.hkl
# 160569440 May 14 14:57 1-glove-1-billion-and-wiki_window11-lc-36_vectors.2-17.hkl

# :: These are originals - citation desired...
#  53984642 May 15 14:13 sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparse_matrix.hkl
# 148011260 May 15 14:13 sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparsity_recreate.hkl

#  57907373 May 15 02:53 sparse.6B.300d_S-21_2n-shuf-noise-after-norm_4k_.2.01_1-50_5000_GPU-sparse_matrix.hkl
# 147946219 May 15 02:52 sparse.6B.300d_S-21_2n-shuf-noise-after-norm_4k_.2.01_1-50_5000_GPU-sparsity_recreate.hkl


# Spare?
# 122248980 May  2 13:09 misc/sparse.6B.300d_T-21_3500.1024@0.05-GPU-sparse_matrix.hkl
# 447610336 May  2 13:04 misc/sparse.6B.300d_T-21_3500.1024@0.05-GPU-sparsity_recreate.hkl
#  53312127 May 11 14:10 misc/sparse.6B.300d_S-21_2n-shuf_1024@6.75_2000_GPU-sparse_matrix.hkl
# 148027055 May 11 14:10 misc/sparse.6B.300d_S-21_2n-shuf_1024@6.75_2000_GPU-sparsity_recreate.hkl
#  57054795 May 11 12:09 misc/sparse.6B.300d_S-21_2n-shuf_4096@1.50_2000_GPU-sparse_matrix.hkl
# 147997824 May 11 12:09 misc/sparse.6B.300d_S-21_2n-shuf_4096@1.50_2000_GPU-sparsity_recreate.hkl

"""

import os, requests

def get_embedding_file( hkl ):
    # Download the named .hkl file from RCL_BASE into ./data/ unless it is already cached locally
    local_path = os.path.join('data', hkl)
    if not os.path.isfile(local_path):
        response = requests.get(RCL_BASE + hkl, stream=True)
        response.raise_for_status()
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024*1024):
                f.write(chunk)
    return local_path
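
As a usage sketch (illustrative only : the filename is one of those listed above, and some of these downloads are several hundred MB):


In [ ]:
# Illustrative usage : fetch one of the pre-built sparse embeddings into ./data/
get_embedding_file('sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparse_matrix.hkl')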

Download the Omer Levy Test Suite

See : https://levyomer.files.wordpress.com/2015/03/improving-distributional-similarity-tacl-2015.pdf

To download the test suite, please run the script download-tests.bash.
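
If you prefer to launch it from within the notebook, a minimal sketch (this assumes download-tests.bash sits in the current working directory alongside this notebook):


In [ ]:
import subprocess

# Assumption : download-tests.bash is in the current working directory
subprocess.call(['bash', 'download-tests.bash'])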

Function to Test an Embedding .txt File


In [ ]:
from __future__ import print_function   # must come before any other imports
import os, subprocess

def test_embedding_file(vectors_txt, vocab_max=131072 ):
  # Do we need to process VECTORS_FILE->{ VECTORS_WORDS, VECTORS_NPY }?
  # Answer = YES : the .words is required, and is used to create .npy and .vocab
    
  vectors_txt_words = '%s.words' % (vectors_txt,)
  if not os.path.isfile(vectors_txt_words) or os.stat(vectors_txt).st_mtime>os.stat(vectors_txt_words).st_mtime:
    print("Reading %s" % (vectors_txt,))
    # This is just a copy of 'text file' with the vocab_size and embedding_size pre-pended
    #echo "131072 300" > ${VECTORS_WORDS}
    #head -131072 ${VECTORS_FILE} >> ${VECTORS_WORDS}
    with open(vectors_txt) as fin:
      first_line = fin.readline()
      embedding_dim = len(first_line.strip().split()) -1 
      vocab_size = len(fin.readlines()) +1  # Ouch! - read in whole file to find length

      if vocab_size>vocab_max:
        vocab_size=vocab_max
            
    print("Building %s" % (vectors_txt_words,))
    with open(vectors_txt) as fin:
      with open(vectors_txt_words, 'wt') as fout:
        # Write the first line, which, ironically, will be discarded by the omerlevy code
        fout.write("%d %d\n" % (vocab_size, embedding_dim))
                
        # And copy over at most vocab_size lines of the original file
        for i, line in enumerate(fin):
          if i>=vocab_size:
            break
          fout.write(line)
    print("Built %s as %d %d-d vectors" % (vectors_txt_words, vocab_size, embedding_dim))
    
  vectors_txt_npy   = '%s.npy' % (vectors_txt_words,)
  vectors_txt_vocab = '%s.vocab' % (vectors_txt_words,)
  if not os.path.isfile(vectors_txt_npy) or os.stat(vectors_txt_words).st_mtime>os.stat(vectors_txt_npy).st_mtime:
    print("Building %s and %s" % (vectors_txt_npy, vectors_txt_vocab, ))
    # Sadly, we can't just invoke this as a python function - need to go via shell...
    subprocess.call([ "python", "../omerlevy/hyperwords/text2numpy.py", vectors_txt_words ])
    print("Built %s and %s" % (vectors_txt_npy, vectors_txt_vocab, ))

   
  def run_word_test(test_str, test_cmd):
    print("  %s" % ((test_str+' '*30)[:30],), end='')
    #subprocess.call(test_cmd)
    try:
      res = subprocess.check_output( test_cmd, stderr=subprocess.STDOUT,)
      print(" : %s" % (res.strip(),))
    except subprocess.CalledProcessError as e:
      print(" : ERROR : %s" % (test_str,))
      print(e)

  def run_word_similarity(test_str, test_set):
    test_cmd = [ 
        "python", "../omerlevy/hyperwords/ws_eval.py", "VECTORS", vectors_txt, 
        "../omerlevy/testsets/ws/%s" % (test_set,) 
    ]
    run_word_test(test_str, test_cmd)

  def run_word_analogy(test_str, test_set):
    #python ../omerlevy/hyperwords/analogy_eval.py VECTORS ${VECTORS_FILE} ../omerlevy/testsets/analogy/google.txt
    test_cmd = [ 
        "python", "../omerlevy/hyperwords/analogy_eval.py", "VECTORS", vectors_txt, 
        "../omerlevy/testsets/analogy/%s" % (test_set,) 
    ]
    run_word_test(test_str, test_cmd)

  if True:
    print("Word Similarity Tests (~5 seconds each)")
    run_word_similarity("WS353 Similarity  ", "ws353_similarity.txt")
    run_word_similarity("WS353 Relatedness ", "ws353_relatedness.txt")
    run_word_similarity("Bruni MEN         ", "bruni_men.txt")
    run_word_similarity("Radinsky M.Turk   ", "radinsky_mturk.txt")
    run_word_similarity("Luoung Rare Words ", "luong_rare.txt")

  if True:
    print("Word Analogy Tests (~60 seconds each)")
    run_word_analogy("Google Analogy    ", "google.txt")
    run_word_analogy("MSR Analogy       ", "msr.txt")

In [ ]:
import numpy as np
import hickle

Choose an Embedding to Load


In [ ]:
#embedding_file = '../data/2-pretrained-vectors_glove.6B.300d.hkl'
#embedding_file = '../data/1-glove-1-billion-and-wiki_window11-lc-36_vectors.2-17.hkl'

#embedding_file = '../data/lloyds_normed_8.hkl'

# 1024-d embeddings : sparse and recreated
embedding_file = '../data/sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparse_matrix.hkl'
#embedding_file = '../data/sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparsity_recreate.hkl'

# 4096-d embeddings : sparse and recreated
#embedding_file = '../data/sparse.6B.300d_S-21_2n-shuf-noise-after-norm_4k_.2.01_1-50_5000_GPU-sparse_matrix.hkl'
#embedding_file = '../data/sparse.6B.300d_S-21_2n-shuf-noise-after-norm_4k_.2.01_1-50_5000_GPU-sparsity_recreate.hkl'

In [ ]:
d = hickle.load(embedding_file)
vocab, embedding = d['vocab'], d['embedding']
vocab_orig = d.get('vocab_orig', vocab)

dictionary = dict( (word, i) for i,word in enumerate(vocab) if i<len(embedding) )
dictionary_orig = dict( (word, i) for i,word in enumerate(vocab_orig) if i<len(embedding) )

print("Embedding loaded :", embedding.shape)   # (vocab_size, embedding_dimension)=(rows, columns)
embedding_normed = embedding / np.linalg.norm(embedding, axis=1)[:, np.newaxis]

def save_embedding_for_tests(vocab, embedding, save_filename_txt='../data/tmp.embedding.txt'):
  with open(save_filename_txt, 'w') as f:
    for l in range(0, embedding.shape[0]):
      f.write("%s %s\n" % (vocab[l], ' '.join([ ('0' if x==0. else ("%.6f" % (x,))) for x in embedding[l, :].tolist() ]), ))
  print("Saved to %s" % (save_filename_txt, ))

def save_embedding_to_hickle(vocab, embedding_save, save_filename_hkl, vocab_orig=None):
  print("About to save to %s" % (save_filename_hkl,))
  d=dict( 
    vocab=vocab, 
    vocab_orig=vocab if vocab_orig is None else vocab_orig,
    embedding=embedding_save,
  )
  hickle.dump(d, save_filename_hkl, mode='w', compression='gzip')
  print("Saved to %s" % (save_filename_hkl,))

In [ ]:
print( vocab[0] )
entries = [ x for x in embedding[0].tolist() if x!=0.0 ]
print( len(entries) )
#45 

for w in 'the iraq baghdad uk london criminal apple some hypothesis maximal innocuous'.split(' '):
  i=dictionary[w]
  entries = [ x for x in embedding[i].tolist() if x!=0.0 ]
  print("%20s @%6d len=%d" % (w,dictionary_orig[w],len(entries),))

  #               the @     0 len=18
  #              some @    60 len=18
  #            london @   266 len=91
  #                uk @   448 len=82
  #              iraq @   606 len=113
  #          criminal @  1449 len=104
  #             apple @  2046 len=112
  #           baghdad @  2320 len=116
  #        hypothesis @  6957 len=136
  #           maximal @ 27962 len=107
  #         innocuous @ 30111 len=86


if False:
  # Look at per-position best words
  for i in range(0, embedding.shape[1], 10):
    best_words_j = np.argsort( -embedding[:, i ] )[0:10]
    for j in best_words_j:
      print("%4i -> %s" % (i, vocab[j],))
    print('')

if False:
  #i = 2000    # uncomment to pick a specific word index; otherwise i is left over from the loop above
  values = [x for x in (-np.sort( -embedding[i] )).tolist() if x>0. ]
  print("values: ["+', '.join([ ('%.4f' % (x,)) for x in values ])+']')
  #values: [1.1442, 0.9337, 0.9333, 0.9257, 0.7520, 0.5529, 0.4818, 0.4740, 0.4568, 0.4554, 0.4434, 0.4419, 0.4334, 0.4187, 0.4175, 0.4068, 0.4005, 0.3989, 0.3698, 0.3421, 0.3206, 0.3151, 0.3150, 0.3120, 0.3119, 0.3067, 0.3010, 0.2948, 0.2853, 0.2828, 0.2816, 0.2815, 0.2799, 0.2793, 0.2764, 0.2714, 0.2636, 0.2570, 0.2507, 0.2487, 0.2336, 0.2336, 0.2335, 0.2328, 0.2325, 0.2323, 0.2255, 0.2227, 0.2227, 0.2226, 0.2208, 0.2178, 0.2159, 0.2134, 0.2067, 0.2049, 0.1947, 0.1935, 0.1932, 0.1926, 0.1921, 0.1914, 0.1897, 0.1894, 0.1832, 0.1782, 0.1766, 0.1730, 0.1714, 0.1683, 0.1662, 0.1638, 0.1629, 0.1602, 0.1568, 0.1561, 0.1452, 0.1419, 0.1399, 0.1372, 0.1370, 0.1352, 0.1350, 0.1342, 0.1334, 0.1334, 0.1302, 0.1289, 0.1268, 0.1243, 0.1230, 0.1211, 0.1192, 0.1113, 0.1051]

  print("changes: ["+', '.join([ ('%.1f' % (values[i+1]/values[i]*100.,)) for i in range(0,len(values)-1) ])+']')
  #changes: [81.6, 100.0, 99.2, 81.2, 73.5, 87.1, 98.4, 96.4, 99.7, 97.3, 99.7, 98.1, 96.6, 99.7, 97.4, 98.4, 99.6, 92.7, 92.5, 93.7, 98.3, 100.0, 99.0, 100.0, 98.3, 98.1, 97.9, 96.8, 99.1, 99.6, 100.0, 99.4, 99.8, 99.0, 98.2, 97.1, 97.5, 97.6, 99.2, 93.9, 100.0, 100.0, 99.7, 99.9, 99.9, 97.1, 98.8, 100.0, 100.0, 99.2, 98.6, 99.1, 98.9, 96.9, 99.1, 95.0, 99.4, 99.9, 99.7, 99.8, 99.6, 99.1, 99.8, 96.7, 97.3, 99.1, 98.0, 99.1, 98.2, 98.8, 98.6, 99.4, 98.3, 97.9, 99.5, 93.1, 97.7, 98.6, 98.1, 99.8, 98.7, 99.9, 99.4, 99.4, 100.0, 97.6, 99.0, 98.4, 98.0, 99.0, 98.4, 98.5, 93.4, 94.4]


w='motorcycle'
w_i=dictionary[w]

#top_i =np.argmax(embedding[w_i])
good_i =np.argsort( -embedding[w_i] )

for i in range(0,10):
  best_words_j = np.argsort( -embedding[:, good_i[i] ] )[0:12]
  
  #for j in best_words_j:
  #  print("%s" % (vocab[j],))
  #print('')
  
  print("%s" % (', '.join( [ vocab[j] for j in best_words_j] ), ) )

Embedding Exploration Utility Functions


In [ ]:
def vector_for(w):
  w_i=dictionary[w]
  return embedding[w_i]

def l2_normed(e):
  return e / np.sqrt( np.dot(e,e) )

def cosine(a,b):
  return np.dot(l2_normed(a), l2_normed(b))

def top_senses_for(e):
  good_i = np.argsort( -e )
  for i in range(0,10):
    best_words_j = np.argsort( -embedding[:, good_i[i] ] )[0:12]
    print("%s" % (', '.join( [ vocab[j] for j in best_words_j] ), ) )

def closest_to(e, n=10):
  closest = np.argsort( - np.dot(embedding_normed, l2_normed(e) ) )
  return "%s" % (', '.join( [ vocab[j] for j in closest[0:n] ] ), ) 

def count_positive(e):
  return len( [ x for x in e.tolist() if x>0.0 ] )

def nonzero_positions(e):
  return [ i for (i,x) in enumerate(e.tolist()) if x!=0.0 ]

def nonneg(e):
  return np.maximum(0, e)

def closest_dist(s):
  ab,xy = s.split('=')
  (a,b),(x,y) = ab.split(':'), xy.split(':')
  print( "%s is to %s as %s is to ?%s? " % (a,b,x,y,))
  (a,b,x,y) = map(vector_for, [a,b,x,y])  # Convert to vectors
  print('  x+b-a           = %s' % (closest_to( x + b - a ),))
  print('  [x+b-a]         = %s' % (closest_to( nonneg(x + b - a) ),))
  print('  x+[b-a]         = %s' % (closest_to( x + nonneg(b-a) ),))
  print('  [x-a]+b         = %s' % (closest_to( nonneg(x-a) + b ),))
  print('  [2x-a]+[2b-a]   = %s' % (closest_to( nonneg(2*x-a) + nonneg(2*b-a) ),))
  print('  x+[b-a]+b+[x-a] = %s' % (closest_to( x+nonneg(b-a) + b+nonneg(x-a) ),))

In [ ]:
top_senses_for(vector_for('motorbike'))

man   = vector_for('man')
woman = vector_for('woman')
king  = vector_for('king')
queen = vector_for('queen')

#top_senses_for(man)
#top_senses_for(woman)
#top_senses_for(king)
#top_senses_for(queen)

#top_senses_for(man * woman) # Intersection
#top_senses_for(man + woman) # Union
#top_senses_for(man - woman) # ??


print( closest_to(man) )
#man, woman, girl, person, men, teenager, she, friend, he, father, her, boy, someone, mother, him, his, victim, son, who, guy
print( closest_to(woman) )
#woman, man, girl, mother, teenager, daughter, wife, women, her, person, she, girlfriend, friend, men, husband, widow, couple, boy, someone, victim

print( closest_to(king) )
#king, queen, henry, mswati, mongkut, eirik, charles, vajiravudh, thoden, wenceslaus, zvonimir, athelstan, vladislaus, thelred, gojong, prince, jayavarman, kalkaua, sweyn, pomare
print( closest_to(queen) )
#queen, princess, elizabeth, king, margrethe, empress, lady, sister, prince, sirikit, mary, cixi, monarch, daughter, duchess, olten, mother, infanta, rania, widow

closest_dist('pound:england=franc:france')


curr = england,pound,america,dollar = map(vector_for, 'england pound america dollar'.split())
print( map(count_positive, curr) )
#[84, 126, 94, 134]

print( map(count_positive, [ england*pound, america*dollar, england*america, pound*dollar]) )
#[12, 14, 17, 56]

total = england+pound+america+dollar
print( map(count_positive, [ england*101-100*total, pound*101-100*total, america*101-100*total, dollar*101-100*total]) )

In [ ]:
save_embedding_for_tests(vocab, embedding, save_filename_txt='../data/tmp.embedding.txt')

In [ ]:
test_embedding_file('../data/tmp.embedding.txt', vocab_max=131072)

In [ ]:
#save_embedding_for_tests(vocab, embedding, save_filename_txt='../data/lloyds_normed_8.txt')
#test_embedding_file('../data/lloyds_normed_8.txt', vocab_max=131072)

In [ ]:


In [ ]:
len(nonzero_positions(embedding[10202]))

In [ ]:
# Let's make a *random* transformation matrix to 'recreate' an approximate 300-d dense embedding
A = np.random.normal(loc=0.0, scale=1.0, size=(embedding.shape[1], 300))

embedding_reconstructed = np.dot(embedding, A)        # Project back down to 300d using random matrix
embedding_reconstructed.shape

In [ ]:
embedding_reconstructed = embedding/5.0               # Check that scaling is irrelevant (ACTUALLY, THIS CHECK IS BOGUS)

In [ ]:
embedding_reconstructed = np.where(embedding>0,1,0)   # Binarized

In [ ]:
embedding_reconstructed = np.where(embedding>0,0,1)   # Binarized (inverted) is TERRIBLE

In [ ]:
def construct_d(a_i, b_i, c_i, emb):
  a = emb[a_i]
  b = emb[b_i]
  c = emb[c_i]
  d = c+b-a              # The standard additive analogy construction : d = c + (b - a)
  return np.dot(emb, d.T)   # Return the score of this vs all the embedding vectors

# Let's build an analogy tester right here...
def test_analogies(test_set="msr", emb=embedding, construction_fn=construct_d):
  trials, trials_w,total,total_possible=[],[],0,0
  with open("../omerlevy/testsets/analogy/%s.txt" % test_set) as f:  # 'google' or 'msr'
    for trial in f.readlines():
      t_w = [ w for w in trial.strip().split() ]
      t = [ dictionary.get(w, None) for w in t_w ]
      total+=1
      if None not in t:
        trials_w.append(t_w)
        trials.append(t)
        total_possible+=1
    
  print(trials_w[0], trials[0])

  count, correct = 0, 0
  for i,trial in enumerate(trials):
    if i % 10 >0:  # evaluate only every 10th trial for speed (change 10 to 1 for full accuracy)
      continue
    d_score = construction_fn( trial[0], trial[1], trial[2], emb)
    
    # Set the scores for the original vectors to useless values
    d_score[ trial[0] ] = d_score[ trial[1] ] = d_score[ trial[2] ] = -1
    
    # Now find the argmax score:
    d_i = np.argmax( d_score )
    #print( d_score.shape, d_i, i )
    
    if False:   # set to True to print each individual trial
      w = trials_w[i]
      print("'%s:%s=%s:%s'?  :: %s" % ( w[0], w[1], w[2], vocab[ d_i ],
                ('WIN' if trial[3]==d_i else "FAIL"),
            ))
    
    count += 1
    if trial[3]==d_i:
      correct += 1
    
    if i % 250==0:
      print("At %d : %.2f%%" % (count, 100.*correct/count))
  print("Local(%s) final : %.2f%% using %d, which is %.2f%% of total" % (test_set, 
    100.*correct/count, count, 100.*correct/count*total_possible/total,))

In [ ]:
test_analogies(emb=embedding_normed) # , test_set="google")
#test_analogies(emb=embedding)  # NO : don't use the un-normalised embedding here

In [ ]:
def remax(a):
  return a / np.amax(a)

def a_with_b(a, b):
  return remax(a+b)

def a_without_b(a, b):
  c = a.copy()
  c[ b>0 ]=0.
  return remax(c)
    
def closest_dist(s):
  ab,xy = s.split('=')
  (a,b),(x,y) = ab.split(':'), xy.split(':')
  print( "%s is to %s as %s is to ?%s? " % (a,b,x,y,))
  (a,b,x,y) = map(vector_for, [a,b,x,y])  # Convert to vectors
  print('  x+b-a           = %s' % (closest_to( x + b - a ),))
  print('  [x+b-a]         = %s' % (closest_to( nonneg(x + b - a) ),))
  print('  x+[b-a]         = %s' % (closest_to( x + nonneg(b-a) ),))
  print('  [x-a]+b         = %s' % (closest_to( nonneg(x-a) + b ),))
  print('  [2x-a]+[2b-a]   = %s' % (closest_to( nonneg(2*x-a) + nonneg(2*b-a) ),))
  print('  x+[b-a]+b+[x-a] = %s' % (closest_to( x+nonneg(b-a) + b+nonneg(x-a) ),))
  setish1 = a_with_b( a_without_b( x,  a_without_b(a, b) ), a_without_b(b, a))
  print('  setish1         = %s' % (closest_to( setish1 ),))
  setish2 = a_with_b( a_without_b( b,  a_without_b(a, x) ), a_without_b(x, a))
  print('  setish2         = %s' % (closest_to( setish2 ),))

#closest_dist('pound:england=franc:france')
closest_dist('london:england=paris:france')
closest_dist('smallest:smaller=smoothest:smoother')  # Better with set ops
#closest_dist('great:greater=classy:classier')  # No idea
#closest_dist('richest:richer=meanest:meaner')  # No idea
#closest_dist('seem:seems=develop:develops')
closest_dist('few:fewer=friendly:friendlier')

#### All MSR sets with 'fast' in them are bogus
##closest_dist('weak:weakest=fastest:fast') !!!

e=vector_for('england')
l=vector_for('london')
f=vector_for('france')
p=vector_for('paris')
#closest_to( p+e-l )

p_poss1 = a_with_b( a_without_b( p,  a_without_b(l, e) ), a_without_b(e, l))
p_poss2 = a_with_b( a_without_b( e,  a_without_b(l, p) ), a_without_b(p, l))
print( closest_to( p_poss1 ) )
print( closest_to( p_poss2 ) )

In [ ]:
def construct_d_setish(a_i, b_i, c_i, emb):
  a = emb[a_i]
  b = emb[b_i]
  c = emb[c_i]
  #d = c + (b-a)              # The standard additive analogy construction
  #d = a_with_b( a_without_b( c,  a_without_b(a, b) ), a_without_b(b, a))  # This is set-ish  v1
  #d = a_with_b( a_without_b( b,  a_without_b(a, c) ), a_without_b(c, a))  # This is set-ish  v2
  d = a_with_b(a_with_b( c,  a_without_b(b, a) ), a_with_b( b, a_without_b(c, a)))  # This is set-ish  v3
  d += c + (b-a)
  return np.dot(emb, d.T)   # Return the score of this vs all the embedding vectors

test_analogies(emb=embedding_normed, construction_fn=construct_d_setish, test_set="google")

In [13]:
# =0,=1, A/B,A+B,B/A, A/C,A+C,C/A 
a = np.array( [0.,1., 1.,1.,0., 1.,1.,0.])
b = np.array( [0.,1., 0.,1.,1., 0.,0.,0.])
c = np.array( [0.,1., 0.,0.,0., 0.,1.,1.])
d = np.array( [0.,1., 0.,0.,1., 0.,0.,1.]) # Should this de-emphasise 'a', for instance?
d0 = c + (b-a)
d1 = a_with_b( a_without_b( c,  a_without_b(a, b) ), a_without_b(b, a))  # This is set-ish  v1
d2 = a_with_b( a_without_b( b,  a_without_b(a, c) ), a_without_b(c, a))  # This is set-ish  v2
d3 = a_with_b(a_with_b( c,  a_without_b(b, a) ), a_with_b( b, a_without_b(c, a)))  # This is set-ish  v3
d0,d3, d1,d2


Out[13]:
(array([ 0.,  1., -1.,  0.,  1., -1.,  0.,  1.]),
 array([ 0. ,  1. ,  0. ,  0.5,  1. ,  0. ,  0.5,  1. ]),
 array([ 0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.]),
 array([ 0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.]))

In [ ]:
embedding_reconstructed_normed = embedding_reconstructed / np.linalg.norm(embedding_reconstructed, axis=1)[:, np.newaxis]
save_embedding_for_tests(vocab, embedding_reconstructed_normed, save_filename_txt='../data/tmp.embedding.txt')
test_embedding_file('../data/tmp.embedding.txt', vocab_max=131072)

In [ ]:
vocab[ 0:10 ]

In [ ]:
embedding[0, 0:100]

In [ ]:
embedding_reconstructed[0, 0:100]
