In [ ]:
__author__ = 'Nick Dingwall and Christopher Potts'
This notebook requires a compiled version of the official GloVe code release to be in the directory official-glove/build.
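One way to set this up is sketched in the optional cell below (it assumes git and make are available on the system; the official release is at https://github.com/stanfordnlp/GloVe, and cloning it into official-glove puts the compiled binaries where this notebook expects them).
In [ ]:
# Optional setup sketch: clone and compile the official GloVe release so that
# its binaries end up in official-glove/build. Run once, before the experiments.
import os
import subprocess

if not os.path.exists('official-glove/build'):
    subprocess.run(
        ['git', 'clone', 'https://github.com/stanfordnlp/GloVe.git', 'official-glove'],
        check=True)
    subprocess.run(['make'], cwd='official-glove', check=True)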
In [ ]:
import numpy as np
import os
import timeit
import random
import pandas as pd
import subprocess
import sys
from utils import build_weighted_matrix
# GloVe implementations:
from nonvectorized_glove import GloVeModel
from tf_mittens import Mittens
from vector_glove import VectorGlove
In [ ]:
def generate_corpus(n_words=1000000):
    """Returns a string of integers with a Zipfian distribution."""
    # For `n_words` at 1000 or more, these settings tend
    # to return matrices with about the sparsity we see in
    # our empirical matrices.
    words = [str(i) for i in np.random.zipf(1.7, n_words)]
    corpus = " ".join(words)
    return corpus
In [ ]:
def generate_corpus_and_matrix(vocab_size, window_size):
    """Creates a corpus and associated matrix. This helps
    ensure parity between tests with the official distribution,
    where we start by reading in a corpus, and tests with the
    Python implementations, where we start with a matrix.
    """
    # Setting `n_words` this way is an attempt to ensure
    # that we get a matrix of the size we want. For
    # large vocabularies, really big corpora are needed.
    n_words = np.min([vocab_size * 10000, int(5e8)])
    corpus = generate_corpus(n_words)
    tokenizer = lambda x: x.split(' ')
    X = build_weighted_matrix(
        [corpus],
        tokenizing_func=tokenizer,
        vocab_size=vocab_size,
        window_size=window_size)
    return X.values, corpus
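As a quick sanity check on the sparsity comment in generate_corpus, the cell below builds a small matrix and reports its non-zero fraction (illustration only; the vocabulary size of 1,000 is chosen to keep it fast and is not one of the sizes used in the experiments).
In [ ]:
# Illustrative check: density of a small generated co-occurrence matrix.
X_demo, _ = generate_corpus_and_matrix(vocab_size=1000, window_size=10)
print("Shape: {}, non-zero fraction: {:.3f}".format(
    X_demo.shape, np.count_nonzero(X_demo) / X_demo.size))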
In [ ]:
n = 50
xmax = 100
alpha = 0.75
max_iter = 10
eta = 0.01
tol = 1e-4
window_size = 10
In [ ]:
def official_glove_experiment(corpus, vocab_size, verbose=False):
    BUILDDIR = 'official-glove/build'
    CORPUS_FILE = 'official-glove/speed-test-corpus.txt'
    VOCAB_FILE = 'official-glove/vocab.txt'
    COOCCUR_FILE = 'official-glove/cooccurrence.bin'
    SHUFFLE_FILE = 'official-glove/cooccurrence.shuf.bin'
    VECTORS_FILE = 'official-glove/vectors'
    with open(CORPUS_FILE, 'wt') as f:
        f.write(corpus)
    VERBOSE = 0
    MEMORY = 4.0
    NUM_THREADS = 1
    vocab_cmd = [
        '{}/vocab_count'.format(BUILDDIR),
        '-max-vocab', str(vocab_size),
        '-min-count', '1',
        '< {} > {}'.format(CORPUS_FILE, VOCAB_FILE)]
    cooccur_cmd = [
        '{}/cooccur'.format(BUILDDIR),
        '-memory', str(MEMORY),
        '-verbose', '0',
        '-vocab-file', VOCAB_FILE,
        '-window-size', str(window_size),
        '< {} > {}'.format(CORPUS_FILE, COOCCUR_FILE)]
    shuffle_cmd = [
        '{}/shuffle'.format(BUILDDIR),
        '-memory', str(MEMORY),
        '-verbose', '0',
        '< {} > {}'.format(COOCCUR_FILE, SHUFFLE_FILE)]
    glove_cmd = [
        '{}/glove'.format(BUILDDIR),
        '-save-file', VECTORS_FILE,
        '-threads', str(NUM_THREADS),
        '-input-file', SHUFFLE_FILE,
        '-x-max', str(xmax),
        '-iter', str(max_iter),
        '-vector-size', str(n),
        '-binary', '0',
        '-vocab-file', VOCAB_FILE]
    # The first three steps are preprocessing; only the glove step is timed.
    for cmd in [vocab_cmd, cooccur_cmd, shuffle_cmd]:
        x = subprocess.run(" ".join(cmd), shell=True, check=True, stdout=subprocess.PIPE)
        if verbose:
            print("="*70)
            print(" ".join(cmd) + ";")
            print(x)
    def run_test():
        subprocess.run(glove_cmd)
    secs = timeit.timeit(run_test, number=1)
    VECTORS_FILE = VECTORS_FILE + ".txt"
    # The vectors file has no header row.
    X = pd.read_csv(VECTORS_FILE, delim_whitespace=True, header=None, index_col=0).values
    for f in [CORPUS_FILE, VOCAB_FILE, COOCCUR_FILE, SHUFFLE_FILE, VECTORS_FILE]:
        os.remove(f)
    return secs, X
In [ ]:
def vectorized_tensorflow_experiment(X):
    model = Mittens(
        n=n,
        xmax=xmax,
        alpha=alpha,
        max_iter=max_iter,
        eta=eta,
        tol=tol,
        display_progress=0)
    def run_test():
        model.fit(X)
    return timeit.timeit(run_test, number=1)
Adapted from Grady Simon's implementation (https://github.com/GradySimon/tensorflow-glove).
In [ ]:
def nonvectorized_tensorflow_experiment(X):
    model = GloVeModel(
        n=n,
        alpha=alpha,
        xmax=xmax,
        eta=eta,
        max_iter=max_iter)
    def run_test():
        model.fit(X)
    return timeit.timeit(run_test, number=1)
In [ ]:
def vectorized_numpy_experiment(X):
    model = VectorGlove(
        n=n,
        xmax=xmax,
        alpha=alpha,
        max_iter=max_iter,
        learning_rate=eta,
        display_progress=False)
    def run_test():
        model.fit(X)
    return timeit.timeit(run_test, number=1)
In [ ]:
def timing_experiment(
        n_tests=5,
        vocab_sizes=(5000, 10000, 20000),
        funcs=(vectorized_numpy_experiment,
               nonvectorized_tensorflow_experiment,
               vectorized_tensorflow_experiment,
               official_glove_experiment)):
    data = []
    for vocab_size in vocab_sizes:
        print("Vocab size: {:,}".format(vocab_size))
        for t in range(1, n_tests+1):
            X, corpus = generate_corpus_and_matrix(vocab_size, window_size)
            print("\tX has vocab size {:,} and {:,} non-0 entries".format(
                X.shape[0], np.count_nonzero(X)))
            for func in funcs:
                print("\t", t, func.__name__)
                if func.__name__ == 'official_glove_experiment':
                    secs, X = func(corpus, vocab_size)
                else:
                    secs = func(X)
                experiment_name = func.__name__.replace("_experiment", "")
                data.append({
                    'test_num': t,
                    'iterations': max_iter,
                    'vocab_size': X.shape[0],
                    'model': experiment_name,
                    'seconds': secs})
            pd.DataFrame(data).to_csv("tmp-speed-interim_to{}.csv".format(vocab_size))
    df = pd.DataFrame(data)
    return df
In [ ]:
def summarize(data, digits=2):
    results = data.groupby(['model', 'vocab_size']).apply(
        lambda x: x['seconds'].sum() / x['iterations'].sum())
    results = results.to_frame().rename(columns={0: 'mean seconds per iteration'})
    return results.round(digits)
In [ ]:
cpu_data = timing_experiment()
In [ ]:
cpu_data_20K = timing_experiment(
    funcs=(vectorized_numpy_experiment,
           nonvectorized_tensorflow_experiment,
           vectorized_tensorflow_experiment,
           official_glove_experiment),
    vocab_sizes=(20000,))
In [ ]:
cpu_data.to_csv("results/speed-tests-cpu.csv")
In [ ]:
summarize(cpu_data)
In [ ]:
gpu_data = timing_experiment(
    funcs=(vectorized_tensorflow_experiment,
           nonvectorized_tensorflow_experiment))
In [ ]:
gpu_data.to_csv("results/speed-tests-gpu.csv")
In [ ]:
summarize(gpu_data)