GloVe implementation speed tests (section 2.1)


In [ ]:
__author__ = 'Nick Dingwall and Christopher Potts'

This notebook requires a compiled version of the official GloVe code release to be in the directory official-glove/build.
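
As an optional sanity check, the following cell (a minimal sketch, assuming the default build location) verifies that the four binaries this notebook invokes are present:


In [ ]:
import os

# Fail early if the compiled tools are missing: the experiments below
# shell out to each of these binaries.
for tool in ('vocab_count', 'cooccur', 'shuffle', 'glove'):
    assert os.path.exists(os.path.join('official-glove/build', tool)), \
        "Missing {}; compile the official GloVe release first.".format(tool)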


In [ ]:
import numpy as np
import os
import timeit
import random
import pandas as pd
import subprocess
import sys
from utils import build_weighted_matrix

# GloVe implementations:
from nonvectorized_glove import GloVeModel
from tf_mittens import Mittens
from vector_glove import VectorGlove

Random corpora

We build simulated corpora by sampling words from a Zipfian distribution, and we build co-occurrence matrices from them using the same methods as for our empirical corpora.


In [ ]:
def generate_corpus(n_words=1000000):
    """Returns a string of integers with a Zipfian distribution."""
    # For `n_words` at 1000 or more, these settings tend
    # to return matrices with about the same sparsity as
    # our empirical matrices.
    words = [str(i) for i in np.random.zipf(1.7, n_words)]
    corpus = " ".join(words)
    return corpus
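
No random seed is set, so samples vary run to run, but with the Zipf exponent at 1.7 a handful of types should dominate any sample. The following cell (illustrative only, outside the timed experiments) shows this:


In [ ]:
from collections import Counter

# Illustrative only: the most frequent few types account for a large
# share of the tokens, as expected under a Zipfian distribution.
tokens = generate_corpus(n_words=10000).split()
print(Counter(tokens).most_common(5))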

In [ ]:
def generate_corpus_and_matrix(vocab_size, window_size):
    """Creates a corpus and associated matrix. This helps
    ensure parity between tests with the official distribution,
    where we start by reading in a corpus, and tests with the
    Python implementations, where we start with a matrix.    
    """
    # Setting `n_words` this way is an attempt to ensure
    # that we get a matrix of the size we want. For large
    # vocabularies, really big corpora are needed.
    n_words = np.min([vocab_size * 10000, int(5e8)])
    corpus = generate_corpus(n_words)    
    tokenizer = lambda x: x.split(' ')
    X = build_weighted_matrix(
        [corpus], 
        tokenizing_func=tokenizer, 
        vocab_size=vocab_size, 
        window_size=window_size)
    return X.values, corpus
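
For a quick look at what this produces (illustrative only; the sizes here are arbitrary and much smaller than the experimental settings below):


In [ ]:
# Illustrative only; sizes are arbitrary.
X_demo, _ = generate_corpus_and_matrix(vocab_size=100, window_size=10)
print("shape: {}; density: {:.1%}".format(
    X_demo.shape, np.count_nonzero(X_demo) / X_demo.size))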

Experiments


In [ ]:
n = 50            # embedding dimension
xmax = 100        # weighting function cut-off
alpha = 0.75      # weighting function exponent
max_iter = 10     # number of training iterations
eta = 0.01        # learning rate
tol = 1e-4        # convergence tolerance
window_size = 10  # co-occurrence window size

Official GloVe distribution
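
This wrapper writes the corpus to disk, runs the official four-stage pipeline (vocab_count, cooccur, shuffle, glove), and times only the final glove step, so that, like the Python wrappers below, we measure the optimization itself rather than corpus and matrix construction.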


In [ ]:
def official_glove_experiment(corpus, vocab_size, verbose=False):
    BUILDDIR = 'official-glove/build'    
    CORPUS_FILE = 'official-glove/speed-test-corpus.txt'
    VOCAB_FILE = 'official-glove/vocab.txt'
    COOCCUR_FILE = 'official-glove/cooccurrence.bin'
    SHUFFLE_FILE = 'official-glove/cooccurrence.shuf.bin'
    VECTORS_FILE = 'official-glove/vectors'
    
    with open(CORPUS_FILE, 'wt') as f:
        f.write(corpus)                
    
    MEMORY = 4.0
    NUM_THREADS = 1
        
    vocab_cmd = [
        '{}/vocab_count'.format(BUILDDIR),
        '-max-vocab', str(vocab_size),
        '-min-count', '1',
        '< {} > {}'.format(CORPUS_FILE, VOCAB_FILE)]    
    
    cooccur_cmd = [
        '{}/cooccur'.format(BUILDDIR), 
        '-memory', str(MEMORY),
        '-verbose', '0',
        '-vocab-file', VOCAB_FILE,
        '-window-size', str(window_size),
        '< {} > {}'.format(CORPUS_FILE, COOCCUR_FILE)]  
    
    shuffle_cmd = [
        '{}/shuffle'.format(BUILDDIR),
        '-memory', str(MEMORY), 
        '-verbose', '0',
        '< {} > {}'.format(COOCCUR_FILE, SHUFFLE_FILE)]
    
    glove_cmd = [
        '{}/glove'.format(BUILDDIR), 
        '-save-file', VECTORS_FILE, 
        '-threads', str(NUM_THREADS), 
        '-input-file', SHUFFLE_FILE,
        '-x-max', str(xmax),
        '-iter', str(max_iter),
        '-vector-size', str(n),
        '-binary', '0',
        '-vocab-file', VOCAB_FILE]
    
    for cmd in [vocab_cmd, cooccur_cmd, shuffle_cmd]:        
        x = subprocess.run(" ".join(cmd), shell=True, check=True, stdout=subprocess.PIPE)
        if verbose:
            print("="*70)
            print(" ".join(cmd) + ";")
            print(x)
        
    def run_test():
        subprocess.run(glove_cmd)
                        
    secs = timeit.timeit(run_test, number=1)
    
    VECTORS_FILE = VECTORS_FILE + ".txt"
            
    X = pd.read_csv(VECTORS_FILE, delim_whitespace=True, index_col=0).values
      
    for f in [CORPUS_FILE, VOCAB_FILE, COOCCUR_FILE, SHUFFLE_FILE, VECTORS_FILE]:
        os.remove(f)
                
    return secs, X
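
An illustrative standalone run, outside the experiment grid below and with arbitrary sizes, might look like this (it requires the compiled binaries):


In [ ]:
# Illustrative smoke test only; the sizes here are arbitrary.
demo_secs, demo_vecs = official_glove_experiment(
    generate_corpus(n_words=200000), vocab_size=500, verbose=True)
print("{:.1f} seconds; vectors shape: {}".format(demo_secs, demo_vecs.shape))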

Vectorized TensorFlow


In [ ]:
def vectorized_tensorflow_experiment(X):
    model = Mittens(
        n=n, 
        xmax=xmax, 
        alpha=alpha, 
        max_iter=max_iter, 
        eta=eta, 
        tol=tol,
        display_progress=0)
    
    def run_test():    
        model.fit(X)

    return timeit.timeit(run_test, number=1)

Non-vectorized TensorFlow

Adapted from Grady Simon's implementation (https://github.com/GradySimon/tensorflow-glove).


In [ ]:
def nonvectorized_tensorflow_experiment(X):    
    model = GloVeModel(
        n=n, 
        alpha=alpha, 
        xmax=xmax,
        eta=eta,
        max_iter=max_iter)
    
    def run_test():
        model.fit(X)
        
    return timeit.timeit(run_test, number=1)

Vectorized NumPy


In [ ]:
def vectorized_numpy_experiment(X):
    model = VectorGlove(
        n=n, 
        xmax=xmax, 
        alpha=alpha, 
        max_iter=max_iter,
        learning_rate=eta, 
        display_progress=False)
    
    def run_test():
        model.fit(X)
        
    return timeit.timeit(run_test, number=1)

Experiment runs


In [ ]:
def timing_experiment(
        n_tests=5, 
        vocab_sizes=(5000, 10000, 20000),
        funcs=(vectorized_numpy_experiment,
               nonvectorized_tensorflow_experiment,
               vectorized_tensorflow_experiment,
               official_glove_experiment)):               
    data = []            
    for vocab_size in vocab_sizes:   
        print("Vocab size: {:,}".format(vocab_size))
        for t in range(1, n_tests+1):
            X, corpus = generate_corpus_and_matrix(vocab_size, window_size) 
            print("\tX has vocab size {:,} and {:,} non-0 entries".format(
                    X.shape[0], np.count_nonzero(X)))
            for func in funcs:
                print("\t", t, func.__name__)                
                if func.__name__ == 'official_glove_experiment':
                    # Bind the returned vectors to a throwaway name so
                    # that `X` isn't overwritten for subsequent functions.
                    secs, _ = func(corpus, vocab_size)
                else:
                    secs = func(X)
                experiment_name = func.__name__.replace("_experiment", "")
                data.append({
                    'test_num': t, 
                    'iterations': max_iter,
                    'vocab_size': X.shape[0],
                    'model': experiment_name, 
                    'seconds': secs})
                
        pd.DataFrame(data).to_csv("tmp-speed-interim_to{}.csv".format(vocab_size))

    df = pd.DataFrame(data)     
    return df
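
The summary below pools across test runs: for each model and vocabulary size, total seconds divided by total iterations gives the mean seconds per iteration.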

In [ ]:
def summarize(data, digits=2):
    results = data.groupby(['model', 'vocab_size']).apply(
        lambda x: x['seconds'].sum() / x['iterations'].sum())
    results = results.to_frame().rename(columns={0: 'mean seconds per iteration'})
    return results.round(digits)

CPU


In [ ]:
cpu_data = timing_experiment()

In [ ]:
cpu_data_20K = timing_experiment(
    funcs=(vectorized_numpy_experiment,
           nonvectorized_tensorflow_experiment,
           vectorized_tensorflow_experiment,
           official_glove_experiment),
    vocab_sizes=(20000,))

In [ ]:
cpu_data.to_csv("results/speed-tests-cpu.csv")

In [ ]:
summarize(cpu_data)

GPU


In [ ]:
gpu_data = timing_experiment(
    funcs=(vectorized_tensorflow_experiment,
           nonvectorized_tensorflow_experiment))

In [ ]:
gpu_data.to_csv("results/speed-tests-gpu.csv")

In [ ]:
summarize(gpu_data)