Compressing Word Embeddings

First, a source GloVe-style embedding is downloaded (with a fallback download source).

The notebook then has two main sections :

  • Lloyd's method embedding generation

  • Sparsified embedding generation

each of which ends by saving the created embeddings to .hkl files.

Download Source Embedding(s)

The following cell sets the download location and defines a helper that fetches an embedding file if it is not already available locally :


In [ ]:
RCL_BASE=('http://redcatlabs.com/downloads/'+
          'deep-learning-workshop/notebooks/data/'+
          'research/ICONIP-2016/')

"""
# http://redcatlabs.com/downloads/deep-learning-workshop/LICENSE

# Files in : ${RCL_BASE} :

# :: These are either as downloaded from the GloVe site, or generated by the Levy code
#    The downloadable pretrained GloVe is much larger, since it is 400k words, 
#    whereas the 'home-grown' GloVe (based on the 1-billion word corpus and a Wikipedia
#    snapshot) has a vocabulary of 2^17 words, i.e. a ~131k vocab size
# 507,206,240 Oct 25  2015 2-pretrained-vectors_glove.6B.300d.hkl
# 160,569,440 May 14 14:57 1-glove-1-billion-and-wiki_window11-lc-36_vectors.2-17.hkl
"""

import os, requests

def get_embedding_file( hkl ):
  if os.path.isfile( hkl ):
    print("%s already available locally" % (hkl,))
  else:
    print("Downloading : %s" % (hkl,))
    # Fetch from RCL_BASE, stripping the local '../data/' prefix off the filename
    response = requests.get(RCL_BASE + hkl.replace('../data/',''), stream=True)
    if not response.ok:
      # Something went wrong - don't write out a partial / error-page file
      print("Failed to download %s" % (hkl,))
      return
    with open(hkl, 'wb') as handle:
      for block in response.iter_content(64*1024):
        handle.write(block)
    print("Downloading : %s :: DONE" % (hkl,))

Load the embedding file


In [ ]:
default_embedding_file = '../data/1-glove-1-billion-and-wiki_window11-lc-36_vectors.2-17.hkl'
#default_embedding_file = '../data/2-pretrained-vectors_glove.6B.300d.hkl'

get_embedding_file( default_embedding_file )

In [ ]:
import time

import numpy as np

import theano
import lasagne

# http://blog.mdda.net/oss/2016/04/07/nvidia-on-fedora-23
#theano.config.nvcc.flags = '-D_GLIBCXX_USE_CXX11_ABI=0'

import sklearn.preprocessing

import hickle

d = hickle.load(default_embedding_file)
vocab, embedding = d['vocab'], d['embedding']

vocab_np = np.array(vocab, dtype=str)
vocab_orig=vocab_np.copy()

#dictionary = dict( (word.lower(), i) for i,word in enumerate(vocab) )
dictionary = dict( (word, i) for i,word in enumerate(vocab) if i<len(embedding) )

print("Embedding loaded :", embedding.shape)   # (vocab_size, embedding_dimension)=(rows, columns)

def NO_NEED_save_to_txt(embedding_save, save_filename_txt):
  with open(save_filename_txt, 'w') as f:
    for l in range(0, embedding_save.shape[0]):
      f.write("%s %s\n" % (
          vocab[l], 
          ' '.join([ ('0' if x==0. else ("%.6f" % (x,))) for x in embedding_save[l, :].tolist() ]), )
      )

def save_embedding_to_hickle(vocab, embedding_save, save_filename_hkl, vocab_orig=None):
  print("About to save to %s" % (save_filename_hkl,))
  d=dict( 
    vocab=vocab, 
    vocab_orig=vocab if vocab_orig is None else vocab_orig,
    embedding=embedding_save,
  )
  hickle.dump(d, save_filename_hkl, mode='w', compression='gzip')
  print("Saved to %s" % (save_filename_hkl,))

Lloyd's Method : 32->3 bits


In [ ]:
quantisation_levels = 8

In [ ]:
def np_int_list(n, mult=100., size=3):  # size includes the +/-
  return "[ " + (', '.join([ ('% +*d') % (size,x,) for x in (n * mult).astype(int).tolist()])) + " ]"

## Quantise each entry into 'pct' (as an integer) level (optimised per vector location)
#    Suppose that v is a vector of levels
#    and c is a list of numbers that needs to be quantised, 
#    each c becomes c' where c' is the closest value in v
#      :: update v so that (c - c')^2 is as low as possible

c_length = embedding.shape[0]

embedding_quantised = np.zeros_like(embedding)

t0 = time.time()
for d in range(embedding.shape[1]):   # Quantise each dimension separately
  levels = quantisation_levels

  i_step = int(c_length/levels)
  i_start = int(i_step/2)

  v_indices = np.arange(start=i_start, stop=c_length, step=i_step, dtype='int')

  #if d != 9: continue  # Weird distribution
  #if d != 1: continue  # Very standard example

  # Initialise v by sorting c, and placing them evenly through the list
  e_column = embedding[:,d].astype('float32')

  c_sorted = np.sort( e_column )
  v_init = c_sorted[ v_indices ]

  # the v_init are the initial centers 
  v=v_init

  t1 = time.time()
  epochs=0
  for epoch in range(0, 1000):
    #print(" Dimension:%3d, Epoch:%3d, %s" % (d, epoch, np_int_list(v),))

    #   works out the values in their middles
    mids_np = (v[:-1] + v[1:])/2.

    mids = mids_np.tolist()
    mids.insert( 0, c_sorted[0] )
    mids.append( c_sorted[-1] +1 )

    centroids=[]
    for i in range( 0, len(mids)-1 ):
      pattern = np.where( (mids[i] <= c_sorted) & (c_sorted < mids[i+1]) )
      centroids.append( c_sorted[ pattern ].mean() )

    centroids_np = np.array(centroids)

    if np.allclose(v, centroids_np):
      if epochs>200: # This only prints out for 'long convergence cases'
        print("  NB : long running convergence : embedding[%3d] - took %d epochs" % (d, epochs,))
      break

    v = centroids_np

    epochs += 1

  if d % 10 ==0:
    print("Ran embedding[%3d] - average time for convergence : %6.2fms" % (d, (time.time() - t1)/epochs*1000.,))


  #print("Check col updated: before ", np_int_list(embedding[0:20,d]))

  # Ok, so now we have the centers in v, and the mids in 'mids'
  for i in range( 0, len(mids)-1 ):
    pattern = np.where( (mids[i] <= e_column) & (e_column < mids[i+1]) )
    embedding_quantised[pattern, d] = v[i]

  #print("Check col updated: after  ", np_int_list(embedding_quantised[0:20,d]))

if False:
  offset=101010  # Check rare-ish words
  for d in range(5, embedding_quantised.shape[1], 25):
    print("Col %3d updated: " % (d,), np_int_list(embedding_quantised[(offset+0):(offset+20),d]))

embedding_normed = sklearn.preprocessing.normalize(embedding_quantised, norm='l2', axis=1, copy=True) 
print("Quantisation finished : results in embedding_quantised and (same, but normalised) in embedding_normed")

To save the created embedding, execute the following :


In [ ]:
# Save the embedding_normed as a hickle file (easy to reload into the 'explore' workbook)
save_embedding_to_hickle(vocab, embedding_normed, '../data/lloyds_normed_%d.hkl' % (quantisation_levels, ) )
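
Reloading the saved file (e.g. in the 'explore' workbook) follows the same hickle pattern used for the source embedding above - a quick sanity check :


In [ ]:
reloaded = hickle.load('../data/lloyds_normed_%d.hkl' % (quantisation_levels,))
print("Reloaded :", reloaded['embedding'].shape, "with", len(reloaded['vocab']), "vocab entries")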

Non-Negative Sparse Embeddings

The cells below reproduce, inside the notebook, what the standalone sparsify_lasagne.py script does. For reference, the original command-line runs were :

python sparsify_lasagne.py \
       --mode=train       \
       --version=21       \
       --save='./sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_%04d.hkl'  \
       --sparsity=0.0675  \
       --random=1         \
       --iters=4000 | tee sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75.log
      #sparse_dim = 1024, pre-num_units=embedding_dim*8,
# -> 4.0 l2 in 4.0k epochs (sigma=39)  # sparsity_std_:,   0.4742,

python sparsify_lasagne.py \
      --mode=predict \
      --version=21 \
      --load='./sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000.hkl' \
      --sparsity=0.0675 \
      --random=1 \
      --output=sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparsity_recreate.hkl \
      --direct=sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparse_matrix.hkl

In [ ]:
sparse_dim,sparsity_goal = 1024, 0.0675
#sparse_dim,sparsity_goal = 4096, 0.0150

shuffle_vocab = True
batchsize = 16384  # (GTX760 requires <20000)

pre_normalize = False

In [ ]:
default_save_file_fmt  = './data/sparse.6B.300d_jupyter_%04d.hkl'

version = 21   # matches the --version flag in the command-line runs above

"""
parser = argparse.ArgumentParser(description='')
parser.add_argument('-m','--mode', help='(train|predict)', type=str, default=None)

parser.add_argument('-i','--iters', help='Number of iterations', type=int, default=10000)

parser.add_argument('-o','--output', help='hickle to *create* embedding for testing', type=str, default=None)
parser.add_argument('-d','--direct', help='hickle to *create* *binary* embedding for testing', type=str, default=None)

parser.add_argument('-p','--param', help='Set param value initially', type=float, default=None)
args = parser.parse_args()

print("Mode : %s" % (args.mode,)) 
"""

# Stand-in for the command-line arguments of sparsify_lasagne.py (the argparse block is kept above for reference).
# Defaults mirror the command-line runs shown earlier - adjust as needed.
class args:
  iters  = 4000                    # --iters=4000 in the training run above
  load   = None                    # set to a saved .hkl to resume training or to predict
  save   = default_save_file_fmt
  output = None                    # hickle to *create* embedding for testing
  direct = None                    # hickle to *create* sparse matrix for testing
  param  = None

if shuffle_vocab:
  np.random.seed(1) # No need to get fancy - just want to mix up the word frequencies into different batches
  perm = np.random.permutation(len(embedding))
  embedding = embedding[perm]
  vocab = vocab_np[perm].tolist()

dictionary = dict( (word, i) for i,word in enumerate(vocab) )

print("Embedding loaded :", embedding.shape)   # (vocab_size, embedding_dimension)=(rows, columns)
print("Device=%s, OpenMP=%s" % (theano.config.device, ("True" if theano.config.openmp else "False"), ))

def np_int_list(n, mult=100., size=3):  # size includes the +/-
  return "[ " + (', '.join([ ('% +*d') % (size,x,) for x in (n * mult).astype(int).tolist()])) + " ]"

embedding_dim = embedding.shape[1]

In [ ]:
mode='train'
#mode='predict'

In [ ]:
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

class SparseWinnerTakeAllLayer(lasagne.layers.Layer):
    def __init__(self, incoming, sparsity=0.05, **kwargs):
        super(SparseWinnerTakeAllLayer, self).__init__(incoming, **kwargs)
        self.sparsity = sparsity

    def get_output_for(self, input, **kwargs):
        """
        Parameters
        ----------
        input : tensor
            output from the previous layer
        """
        # Sort within batch (Very likely on the CPU)
        # theano.tensor.sort(self, axis, kind, order)
        sort_input = input.sort( axis=0, kind='quicksort' )

        # Find kth value
        hurdles_raw = sort_input[ int( batchsize * (1.0 - self.sparsity) ), : ]
        hurdles = theano.tensor.maximum(hurdles_raw, 0.0)  # rectification...

        # switch based on >kth value (or create mask), all other entries are zero
        masked = theano.tensor.switch( theano.tensor.ge(input, hurdles), input, 0.0)
        return masked
        
class SparseWinnerTakeAllLayerApprox(lasagne.layers.Layer):
    def __init__(self, incoming, approx_sparsity=0.12, **kwargs):  
        super(SparseWinnerTakeAllLayerApprox, self).__init__(incoming, **kwargs)
        self.sparsity = approx_sparsity

    def get_output_for(self, input, **kwargs):
        """
        Parameters
        ----------
        input : tensor
            output from the previous layer
        """
        # input_shape is [ #in_batch, #vector_entries ] ~ [ 20k, 1024 ]
    
        current_sparsity = self.sparsity
        #print(current_sparsity)  # A theano variable
        
        if False:
          # This is an 'advanced' tail-aware hurdle-level prediction.  
          #   In the end, it works less well than the binary-search version below
            
          # Find the max value in each column - this is the k=1 (top-most) entry
          hurdles_max  = input.max( axis=0 )
          
          input = lasagne.layers.get_output(embedding_batch_middle)
          
          # Find the max value in each column - this is the k=1 (top-most) entry
          hurdles_max  = input.max( axis=0 )
          
          # Find the min value in each column - this is the k=all (bottom-most) entry
          #hurdles_min  = input.min( axis=0 )

          # Let's guess (poorly) that the sparsity hurdle is (0... sparsity ...100%) within these bounds
          #hurdles_guess = hurdles_max * (1.0 - current_sparsity) + hurdles_min * current_sparsity
          
          #hurdles_guess = (hurdles_min + hurdles_max)/2.0
          
          # New approach : We know that the mean() is zero and the std() is 1
          #   simulations suggest that the more stable indicators are at fractions of the max()
          
          hurdles_hi = hurdles_max * 0.5
          hurdles_lo = hurdles_max * 0.3
          
          # Now, let's find the actual sparsity that this creates
          sparsity_flag_hi = theano.tensor.switch( theano.tensor.ge(input, hurdles_hi), 1.0, 0.0)
          sparsity_real_hi = sparsity_flag_hi.mean(axis=0)    # Should be ~ sparsity (likely to be lower, though)

          sparsity_flag_lo = theano.tensor.switch( theano.tensor.ge(input, hurdles_lo), 1.0, 0.0)
          sparsity_real_lo = sparsity_flag_lo.mean(axis=0)    # Should be ~ sparsity (likely to be higher, though)
          
          # But this is wrong!  Let's do another estimate (will be much closer, hopefully) using this knowledge
          #   For each column, the new hurdle guess
          
          #hurdles_better = hurdles_max - (hurdles_max - hurdles_guess) * ( 
          #                              current_sparsity / (sparsity_guess_real + 0.00001) ) )

          if False: # This assumes that the distribution tails are linear (which is not true)
            hurdles_interp = hurdles_hi + (hurdles_lo-hurdles_hi) * (
                        (current_sparsity - sparsity_real_hi) / ((sparsity_real_lo - sparsity_real_hi)+0.00001) )
            
          else:  # Assume that the areas under the tails are ~ exp(-x*x)  
            # See (2) in : https://math.uc.edu/~brycw/preprint/z-tail/z-tail.pdf
            # *** See (Remark 15) in : http://m-hikari.com/ams/ams-2014/ams-85-88-2014/epureAMS85-88-2014.pdf
            
            def tail_transform(z):
              return theano.tensor.sqrt( -theano.tensor.log( z ) )
            
            tail_target = tail_transform(current_sparsity)
            tail_hi = tail_transform(sparsity_real_hi)
            tail_lo = tail_transform(sparsity_real_lo)

            hurdles_interp = hurdles_hi + (hurdles_lo-hurdles_hi) * (
                                           (tail_target - tail_hi) / ((tail_lo - tail_hi)+0.00001) )
          
          #hurdles = theano.tensor.maximum(hurdles_better, 0.0)  # rectification... at mininim... 
          #                                                        (also solves everything-blowing-up problem)
          hurdles = hurdles_interp.clip(hurdles_max*0.2, hurdles_max*0.9)


        if True:  # Simple, but effective : Binary search
          hurdles_hi, hurdles_lo = [], []
          
          hurdles_guess = []
          sparsity_flag = []
          sparsity_real = []
          
          sparsity_hi, sparsity_lo = [], []

          # Find the max value in each column - this is the k=1 (top-most) entry
          hurdles_max  = input.max( axis=0 )
          
          hurdles_hi.append(hurdles_max)
          sparsity_hi.append(hurdles_max * (1./batchsize) ) 
          

          hurdles_lo_temp = input.mean( axis=0 )  # Different estimate idea...

          hurdles_lo.append(hurdles_lo_temp)
          sparsity_lo_temp = theano.tensor.switch( theano.tensor.ge(input, hurdles_lo_temp), 1.0, 0.0)
          sparsity_lo.append( sparsity_lo_temp.mean(axis=0) )
          
          for i in range(10):  
            if True:   # WINS THE DAY!
              hurdles_guess.append(
                (
                  (hurdles_lo[-1] + hurdles_hi[-1]) * 0.5
                )
              )

            if False:  # A 'better approximation' that is actually worse
              hurdles_guess.append(
                (
                  hurdles_hi[-1] + (hurdles_lo[-1] - hurdles_hi[-1]) * 
                    (current_sparsity - sparsity_hi[-1]) / ((sparsity_lo[-1] - sparsity_hi[-1])+0.000001)
                ).clip(hurdles_lo[-1], hurdles_hi[-1])
              )

            if False:  # Another 'better approximation' that is actually worse
              # switch on closeness to getting it correct
              hurdles_guess.append(
                theano.tensor.switch( theano.tensor.lt( sparsity_lo[-1], current_sparsity * 2.0 ),
                  (
                    hurdles_hi[-1] + (hurdles_lo[-1] - hurdles_hi[-1]) * 
                      (current_sparsity - sparsity_hi[-1]) / ((sparsity_lo[-1] - sparsity_hi[-1])+0.000001)
                  ).clip(hurdles_lo[-1], hurdles_hi[-1]),
                  (
                    (hurdles_lo[-1] + hurdles_hi[-1]) * 0.5
                  )
                )
                
              )
              
            
            sparsity_flag.append( theano.tensor.switch( theano.tensor.ge(input, hurdles_guess[-1] ), 1.0, 0.0) )
            sparsity_real.append( sparsity_flag[-1].mean(axis=0) )
            
            # So, based on whether the target sparsity is above or below the realised sparsity, move the lo or hi bound to the current guess

            hurdles_lo.append( 
              theano.tensor.switch( theano.tensor.gt(current_sparsity, sparsity_real[-1]), 
                                   hurdles_lo[-1], hurdles_guess[-1]) 
            )
            hurdles_hi.append( 
              theano.tensor.switch( theano.tensor.le(current_sparsity, sparsity_real[-1]), 
                                   hurdles_hi[-1], hurdles_guess[-1]) 
            )

          hurdles = hurdles_guess[-1]
          #hurdles = hurdles_lo[-1]  # Better to bound this at the highest relevant sparsity...
          
        masked = theano.tensor.switch( theano.tensor.ge(input, hurdles), input, 0.0)
        return masked
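
For intuition, here is a minimal NumPy sketch (not part of the original code) of what the exact, sort-based winner-take-all layer above computes : for each column, the value at the (1 - sparsity) quantile of the batch becomes a hurdle (rectified at zero), and everything below that hurdle is zeroed out.


In [ ]:
import numpy as np

def winner_take_all_np(batch, sparsity=0.0675):
  # batch : (batch_size, sparse_dim) activations, as in SparseWinnerTakeAllLayer above
  batch_size = batch.shape[0]
  sorted_cols = np.sort(batch, axis=0)                             # ascending sort, per column
  hurdles = np.maximum(sorted_cols[int(batch_size * (1.0 - sparsity)), :], 0.0)  # k-th value, rectified
  return np.where(batch >= hurdles, batch, 0.0)

demo_batch = np.random.randn(2000, 16).astype('float32')           # toy activations
masked_demo = winner_take_all_np(demo_batch)
print("Realised sparsity : %.4f" % ((masked_demo > 0).mean(),))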

In [ ]:
embedding_N = (embedding)  # No Normalization by default

if pre_normalize:
  embedding_std  = np.std(embedding, axis=1)
  embedding_N = embedding / embedding_std[:, np.newaxis]    # Try Normalizing  std(row) == 1, making sure shapes are right


embedding_shared = theano.shared(embedding_N.astype('float32'))       # 400000, 300
embedding_shared.name = "embedding_shared"

batch_start_index = theano.tensor.scalar('batch_start_index', dtype='int32')

embedding_batch = embedding_shared[ batch_start_index:(batch_start_index+batchsize) ]

network = lasagne.layers.InputLayer( 
    ( batchsize, embedding_dim ), 
    input_var=embedding_batch,
  )

pre_hidden_dim=embedding_dim*8  ## For sparse_dim=1024 and below
if sparse_dim>1024*1.5:
  pre_hidden_dim=sparse_dim*2   ## Larger sparse_dim

network = lasagne.layers.DenseLayer(
    network,
    num_units=pre_hidden_dim,     
    nonlinearity=lasagne.nonlinearities.rectify,
    W=lasagne.init.GlorotUniform(),
    b=lasagne.init.Constant(0.)
  )

network = lasagne.layers.DenseLayer(
    network,
    num_units=sparse_dim,
    nonlinearity=lasagne.nonlinearities.identity,
    W=lasagne.init.GlorotUniform(),
    b=lasagne.init.Constant(0.)
  )

sparse_embedding_batch_linear=network

#def hard01(x):
#  # http://deeplearning.net/software/theano/library/tensor/basic.html#theano.tensor.switch
#  #return theano.tensor.switch( theano.tensor.gt(x, 0.), 0.95, 0.05)
#  return theano.tensor.switch( theano.tensor.gt(x, 0.), 1.0, 0.0)

In [ ]:
if mode == 'train':
  # This adds some 'fuzziness' to smooth out the training process
    
  sigma = theano.tensor.scalar(name='sigma', dtype='float32')

  embedding_batch_middle = lasagne.layers.batch_norm(
      lasagne.layers.NonlinearityLayer( network,  nonlinearity=lasagne.nonlinearities.rectify )
  )

  embedding_batch_middle = lasagne.layers.GaussianNoiseLayer(
            embedding_batch_middle, 
            sigma=0.2 * theano.tensor.exp((-0.01) * sigma ) # Noise should die down over time...
  )  


  sparsity_blend = theano.tensor.exp((-10.) * sigma )  # Goes from 1 to epsilon
  current_sparsity = 0.50*(sparsity_blend) + sparsity_goal*(1. - sparsity_blend)

  sparse_embedding_batch_squashed = SparseWinnerTakeAllLayerApprox(
                                      embedding_batch_middle, 
                                      approx_sparsity=current_sparsity
                                    )
    
    
elif mode == 'predict':
  embedding_batch_middle = lasagne.layers.batch_norm(
      lasagne.layers.NonlinearityLayer( network,  nonlinearity=lasagne.nonlinearities.rectify )
    )
        
  #sparse_embedding_batch_squashed = SparseWinnerTakeAllLayer(
  #                                    embedding_batch_middle, 
  #                                    sparsity=sparsity_goal,
  #                                  )

  sparse_embedding_batch_squashed = SparseWinnerTakeAllLayerApprox(
                                      embedding_batch_middle, 
                                      approx_sparsity=sparsity_goal,   # Jam the actual (final) value in...
                                    )
    
sparse_embedding_batch_probs = sparse_embedding_batch_squashed

network = sparse_embedding_batch_squashed

network = lasagne.layers.DenseLayer(
    network,
    num_units=embedding_dim,
    nonlinearity=lasagne.nonlinearities.linear,
    W=lasagne.init.GlorotUniform(),
    b=lasagne.init.Constant(0.)
  )

prediction = lasagne.layers.get_output(network)

l2_error = lasagne.objectives.squared_error( prediction, embedding_batch )   
l2_error_mean = l2_error.mean()  # This is a per-element error term

interim_output = lasagne.layers.get_output(sparse_embedding_batch_probs)

# Count the number of positive entries
sparse_flag = theano.tensor.switch( theano.tensor.ge(interim_output, 0.0001), 1.0, 0.0)

#sparsity_mean  = sparse_flag.mean() / sparsity_goal  # This is a number 0..1, where 1.0 = perfect = on-target
sparsity_mean  = sparse_flag.mean() * 100.  # This is realised sparsity 

sparsity_std  = (sparse_flag.mean(axis=1) / sparsity_goal).std()     # assess the 'quality' of the sparsity per-row

# This is to monitor learning (not direct it)
sparsity_probe = sparse_flag.mean(axis=1) / sparsity_goal # sparsity across rows may not be ===1.0
#sparsity_probe = sparse_flag.mean(axis=0) / sparsity_goal # sparsity across columns should be ===1.0 (if approx works)
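
To see how the annealing above behaves : sigma_ starts at 0 and is incremented by 0.01 whenever an epoch's max l2 error drops below boil_limit (see the training loop below), so the Gaussian noise decays as 0.2*exp(-0.01*sigma) and the target sparsity blends from 0.50 down towards sparsity_goal. A quick numeric sketch, outside the Theano graph :


In [ ]:
import numpy as np

for sigma_value in [0.0, 0.05, 0.10, 0.20, 0.50]:
  noise_sigma     = 0.2 * np.exp(-0.01 * sigma_value)
  sparsity_blend_ = np.exp(-10. * sigma_value)
  target_sparsity = 0.50 * sparsity_blend_ + sparsity_goal * (1. - sparsity_blend_)
  print("sigma_=%5.2f : noise sigma=%6.4f, target sparsity=%6.4f" % (sigma_value, noise_sigma, target_sparsity))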

In [ ]:
sparsity_cost=0.0
if mode == 'train':
  mix = theano.tensor.scalar(name='mix', dtype='float32')

  sparsity_cost = -mix*sparsity_mean/1000.  # The 1000 factor is because '10' l2 is Ok, and 1 sparsity_mean is Great
  if version==20 or version==21:
    sparsity_cost = mix*0.
  
cost = l2_error_mean + sparsity_cost

params = lasagne.layers.get_all_params(network, trainable=True)

In [ ]:
epoch_base=0
if args.load:
  load_vars = hickle.load(args.load)
  print("Saved file had : Epoch:%4d, sigma:%5.2f" % (load_vars['epoch'], load_vars['sigma'], ) )
  #fraction_of_vocab=fraction_of_vocab
  
  epoch_base = load_vars['epoch']
  
  if 'layer_names' in load_vars:
    layer_names = load_vars['layer_names']
  else:
    i=0
    layer_names=[]
    while "Lasagne%d" % (i,) in load_vars:
      layer_names.append( "Lasagne%d" % (i,) )
      i=i+1
    
  layers = [ load_vars[ ln ] for ln in layer_names ]
  
  lasagne.layers.set_all_param_values(network, layers)

In [ ]:
if mode == 'train':
  updates = lasagne.updates.adam( cost, params )

  iterate_net = theano.function( 
                  [batch_start_index,sigma,mix], 
                  [l2_error_mean,sparsity_mean,sparsity_std,sparsity_probe], 
                  updates=updates, 
                  allow_input_downcast=True,
                  on_unused_input='warn',
                )

  print("Built Theano op graph")
  
  sigma_ = 0.0
  mix_ = 0.0
  if args.param:
    mix_=args.param
  
  t0 = time.time()
  for epoch in range(epoch_base, epoch_base+args.iters):
    t1 = time.time()
    
    fraction_of_vocab = 1.0

    max_l2_error_mean=-1000.0

    batch_list = np.array( range(0, int(embedding.shape[0]*fraction_of_vocab), batchsize) )
    batch_list = np.random.permutation( batch_list )
    
    for b_start in batch_list.astype(int).tolist():
      #l2_error_mean_,sparsity_mean_ = iterate_net(b_start)
      
      l2_error_mean_,sparsity_mean_,sparsity_std_,sparsity_probe_ = iterate_net(b_start, sigma_, mix_)

      print(" epoch:,%4d, b:,%7d, l2:,%9.2f, sparsity_mean_:,%9.4f, sparsity_std_:,%9.4f, sigma:,%5.2f, mix:,%5.2f, " % 
          (epoch, b_start, 1000*l2_error_mean_, sparsity_mean_, sparsity_std_, sigma_, mix_, ))

      if b_start==0:
        #print("Hurdles : " + np_int_list( sparsity_probe_[0:100] ))
        print("  Row-wise sparsity : " + np_int_list( sparsity_probe_[0:30] ))
        #print("  %d, vector_probe : %s" % (epoch, np_int_list( np.sort(sparsity_probe_[0:100]) ), )) 
        #print("  %d, vector_probe : %s" % (epoch, np_int_list( sparsity_probe_[0:100] ), )) 
        #print("  vector_probe : " + np_int_list( sparsity_probe_[0:1000] ))
      
      if max_l2_error_mean<l2_error_mean_:
        max_l2_error_mean=l2_error_mean_

    print("Time per 100k words %6.2fs" % ((time.time() - t1)/embedding.shape[0]/fraction_of_vocab*1000.*100.,  ))
    #exit()

    boil_limit=10.
    
    if pre_normalize:
      boil_limit=40.
    
    if max_l2_error_mean*1000.<boil_limit:
      print("max_l2_error_mean<%6.2f - increasing sparseness emphasis" % (boil_limit,))
      sigma_ += 0.01
      mix_ += 0.1

    if (epoch +1) % 10 == 0:
      save_vars = dict(
        version=version,
        epoch=epoch,
        sigma=sigma_,
        mix=mix_,
        fraction_of_vocab=fraction_of_vocab
      )

      layer_names = []
      for i,p in enumerate(lasagne.layers.get_all_param_values(network)):
        if len(p)>0:
          name = "Lasagne%d" % (i,)
          save_vars[ name ] = p
          layer_names.append( name )
      save_vars[ 'layer_names' ] = layer_names
    
      epoch_thinned = int(epoch/100)*100
      hickle.dump(save_vars, args.save % (epoch_thinned,), mode='w', compression='gzip')

In [ ]:
if args.load and mode == 'predict':
  print("Parameters : ", lasagne.layers.get_all_params(network))
  
  get_sparse_linear = theano.function( [batch_start_index], [ lasagne.layers.get_output(sparse_embedding_batch_linear), ])  # allow_input_downcast=True 
  predict_net = theano.function( [batch_start_index], [l2_error_mean,sparsity_mean], allow_input_downcast=True )
  predict_emb = theano.function( [batch_start_index], [prediction], allow_input_downcast=True )

  predict_bin = theano.function( [batch_start_index], [ lasagne.layers.get_output(sparse_embedding_batch_squashed),])

  print("Built Theano op graph")

  if True:  # Shows the error predictions with hard01 sigmoid
    for b_start in range(0, int(embedding.shape[0]), batchsize):
      l2_error_mean_,sparsity_mean_ = predict_net(b_start)

      print(" epoch:%4d, b:%7d, l2:%12.4f, sparsity:%6.4f - hard01" % 
          (epoch_base, b_start, 1000*l2_error_mean_, sparsity_mean_, ))

  if False:  # Shows the linear range of the sparse layer (pre-squashing)
    for b_start in range(0, int(embedding.shape[0]), batchsize * 5):
      sparse_embedding_batch_linear_, = get_sparse_linear(b_start)

      for row in range(0,100,5):
        print(np_int_list( sparse_embedding_batch_linear_[row][0:1000:50], mult=10, size=4 ))

  if args.output:
    predictions=[]
    for b_start in range(0, int(embedding.shape[0]), batchsize):
      prediction_, = predict_emb(b_start)
      
      predictions.append( np.array( prediction_ ) )

      print(" epoch:%3d, b:%7d, Downloading - reconstructed array" % 
          (epoch_base, b_start, ))
    
    embedding_prediction = np.concatenate(predictions, axis=0)
    predictions=None

    print("About to save to %s" % (args.output,))
    d=dict( 
      vocab=vocab, 
      vocab_orig=vocab_orig,
      embedding=embedding_prediction,
    )
    hickle.dump(d, args.output, mode='w', compression='gzip')
  
  if args.direct:
    predictions=[]
    for b_start in range(0, int(embedding.shape[0]), batchsize):
      binarised_, = predict_bin(b_start)
      
      #predictions.append( np.where( binarised_>0.5, 1., 0. ).astype('float32') )
      predictions.append( binarised_.astype('float32') )

      #print(" epoch:%3d, b:%7d, Downloading - hard01 to binary" % 
      print(" epoch:%3d, b:%7d, Downloading - sparse data" % 
          (epoch_base, b_start, ))
    
    embedding_prediction = np.concatenate(predictions, axis=0)
    predictions=None

    print("About to save sparse version to %s" % (args.direct,))
    d=dict( 
      vocab=vocab, 
      vocab_orig=vocab_orig,
      embedding=embedding_prediction,
    )
    hickle.dump(d, args.direct, mode='w', compression='gzip')
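
If args.direct was set, a quick sanity check of the produced sparse matrix might look like the following sketch (the filename here is the one from the command-line run above - substitute whatever args.direct actually pointed at) :


In [ ]:
# Hypothetical check - adjust the filename to match args.direct
sparse_d = hickle.load('sparse.6B.300d_S-21_2n-shuf-noise-after-norm_.2.01_6-75_4000_GPU-sparse_matrix.hkl')
sparse_matrix = sparse_d['embedding']
print("Sparse matrix :", sparse_matrix.shape, " realised sparsity : %.4f" % ((sparse_matrix > 0).mean(),))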