In [1]:
!git rev-parse HEAD
In [2]:
from copy import deepcopy
from datetime import timedelta
from itertools import product
import logging
from math import floor, ceil, log10
import pickle
from random import sample, seed, shuffle
from time import time
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
def tqdm(iterable, total=None, desc=None):
    if total is None:
        total = len(iterable)
    for num_done, element in enumerate(tqdm_notebook(iterable, total=total)):
        logger.info("%s: %d / %d", desc, num_done, total)
        yield element
from gensim.corpora import Dictionary
import gensim.downloader as api
from gensim.similarities.index import AnnoyIndexer
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import UniformTermSimilarityIndex
from gensim.similarities import LevenshteinSimilarityIndex
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.utils import simple_preprocess
RANDOM_SEED = 12345
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='matrix_speed.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.INFO)
pd.set_option('display.max_rows', None, 'display.max_seq_items', None)
In [3]:
"""Repeatedly run a benchmark callable given various configurations and
get a list of results.
Return a list of results of repeatedly running a benchmark callable.
Parameters
----------
benchmark : callable tuple -> dict
A benchmark callable that accepts a configuration and returns results.
configurations : iterable of tuple
An iterable of configurations that are used for calling the benchmark function.
results_filename : str
A filename of a file that will be used to persistently store the results using
pickle. If the file exists, then the function will load the stored results
instead of calling the benchmark callable.
Returns
-------
iterable of tuple
The return values of the individual invocations of the benchmark callable.
"""
def benchmark_results(benchmark, configurations, results_filename):
try:
with open(results_filename, "rb") as file:
results = pickle.load(file)
except IOError:
configurations = list(configurations)
shuffle(configurations)
results = list(tqdm(
(benchmark(configuration) for configuration in configurations),
total=len(configurations), desc="benchmark"))
with open(results_filename, "wb") as file:
pickle.dump(results, file)
return results
In Gensim PR #1827, we added a base implementation of the soft cosine measure (SCM). The base implementation created term similarity matrices using a single complex procedure. In Gensim PR #2016, we split the procedure into a builder class (TermSimilarityIndex) that produces term similarities and a director class (SparseTermSimilarityMatrix) that consumes the term similarities and constructs the term similarity matrix.
One of the benefits of this separation is that we can easily measure the speed at which a TermSimilarityIndex builder class produces term similarities and compare it with the speed at which the SparseTermSimilarityMatrix director class consumes term similarities. This shows which of the classes is the bottleneck that slows down the construction of term similarity matrices.
In this notebook, we measure all the currently available builder and director classes. For the measurements, we use the Google News word embeddings distributed with the C implementation of Word2Vec. From the word embeddings, we derive a dictionary of 2.01M terms.
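To make the division of labor concrete, here is a minimal sketch of how a builder and the director fit together. We use the dummy uniform index so that the sketch stays self-contained; the toy dictionary and the nonzero_limit value are illustrative only.
from gensim.corpora import Dictionary
from gensim.similarities import SparseTermSimilarityMatrix, UniformTermSimilarityIndex

toy_dictionary = Dictionary([["hello", "world"]])
toy_index = UniformTermSimilarityIndex(toy_dictionary)  # builder: produces term similarities
toy_matrix = SparseTermSimilarityMatrix(toy_index, toy_dictionary, nonzero_limit=100)  # director: consumes them
print(toy_matrix.matrix.todense())  # a 2x2 sparse matrix with ones on the diagonal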
In [4]:
full_model = api.load("word2vec-google-news-300")
try:
    full_dictionary = Dictionary.load("matrix_speed.dictionary")
except IOError:
    full_dictionary = Dictionary([[term] for term in full_model.vocab.keys()])
    full_dictionary.save("matrix_speed.dictionary")
In [5]:
def benchmark(configuration):
    dictionary, nonzero_limit, symmetric, positive_definite, repetition = configuration

    index = UniformTermSimilarityIndex(dictionary)
    start_time = time()
    matrix = SparseTermSimilarityMatrix(
        index, dictionary, nonzero_limit=nonzero_limit, symmetric=symmetric,
        positive_definite=positive_definite, dtype=np.float16).matrix
    end_time = time()
    duration = end_time - start_time

    return {
        "dictionary_size": len(dictionary),
        "nonzero_limit": nonzero_limit,
        "matrix_nonzero": matrix.nnz,
        "repetition": repetition,
        "symmetric": symmetric,
        "positive_definite": positive_definite,
        "duration": duration, }
In [6]:
dictionary_sizes = [10**k for k in range(3, int(ceil(log10(len(full_dictionary)))))]

seed(RANDOM_SEED)
dictionaries = []
for size in tqdm(dictionary_sizes, desc="dictionaries"):
    dictionary = Dictionary([sample(list(full_dictionary.values()), size)])
    dictionaries.append(dictionary)
dictionaries.append(full_dictionary)

nonzero_limits = [1, 10, 100]
symmetry = (True, False)
positive_definiteness = (True, False)
repetitions = range(10)

configurations = product(dictionaries, nonzero_limits, symmetry, positive_definiteness, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.director_results")
The following tables show how long it takes to construct a term similarity matrix (the duration column), how many nonzero elements there are in the matrix (the matrix_nonzero column) and the mean term similarity consumption speed (the consumption_speed column) as we vary the dictionary size (the dictionary_size column), the maximum number of nonzero elements outside the diagonal in every column of the matrix (the nonzero_limit column), the matrix symmetry constraint (the symmetric column), and the matrix positive definiteness constraint (the positive_definite column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
We can see that the symmetry and positive definiteness constraints severely limit the number of nonzero elements in the resulting matrix. This in turn increases the consumption speed, since we end up throwing away most of the elements that we consume. The dictionary size has little to no effect on the mean term similarity consumption speed. The sketch below illustrates the effect of the symmetry constraint on a toy dictionary.
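This is a minimal sketch that counts the nonzero elements of a toy matrix with and without the symmetry constraint; the exact counts it prints are illustrative, not measured results.
from gensim.corpora import Dictionary
from gensim.similarities import SparseTermSimilarityMatrix, UniformTermSimilarityIndex

toy_dictionary = Dictionary([["one", "two", "three", "four"]])
toy_index = UniformTermSimilarityIndex(toy_dictionary)
for symmetric in (False, True):
    toy_matrix = SparseTermSimilarityMatrix(
        toy_index, toy_dictionary, nonzero_limit=2, symmetric=symmetric)
    print(symmetric, toy_matrix.matrix.nnz)  # the symmetric matrix ends up with fewer nonzeros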
In [7]:
df = pd.DataFrame(results)
df["consumption_speed"] = df.dictionary_size * df.nonzero_limit / df.duration
df = df.groupby(["dictionary_size", "nonzero_limit", "symmetric", "positive_definite"])

def display(df):
    df["duration"] = [timedelta(0, duration) for duration in df["duration"]]
    df["matrix_nonzero"] = [int(nonzero) for nonzero in df["matrix_nonzero"]]
    df["consumption_speed"] = ["%.02f Kword pairs / s" % (speed / 1000) for speed in df["consumption_speed"]]
    return df
In [8]:
display(df.mean()).loc[
    [10000, len(full_dictionary)], :, :].loc[
    :, ["duration", "matrix_nonzero", "consumption_speed"]]
Out[8]:
In [9]:
display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [10000, len(full_dictionary)], :, :].loc[
    :, ["duration", "matrix_nonzero", "consumption_speed"]]
Out[9]:
First, we measure the speed at which the UniformTermSimilarityIndex builder class produces term similarities. UniformTermSimilarityIndex is a dummy class that just generates a sequence of constants. It produces far more term similarities per second than SparseTermSimilarityMatrix is capable of consuming, so its results will serve as an upper bound.
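As a quick illustration of why this index serves as an upper bound, it performs no real work; it simply yields a constant similarity (0.5 by default) for every other term in the dictionary. A minimal sketch on a toy dictionary:
from gensim.corpora import Dictionary
from gensim.similarities import UniformTermSimilarityIndex

toy_dictionary = Dictionary([["one", "two", "three"]])
toy_index = UniformTermSimilarityIndex(toy_dictionary)
print(list(toy_index.most_similar("one", topn=2)))  # two term pairs, each with similarity 0.5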
In [10]:
def benchmark(configuration):
    dictionary, nonzero_limit, repetition = configuration

    start_time = time()
    index = UniformTermSimilarityIndex(dictionary)
    end_time = time()
    constructor_duration = end_time - start_time

    start_time = time()
    for term in dictionary.values():
        for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):
            pass
    end_time = time()
    production_duration = end_time - start_time

    return {
        "dictionary_size": len(dictionary),
        "nonzero_limit": nonzero_limit,
        "repetition": repetition,
        "constructor_duration": constructor_duration,
        "production_duration": production_duration, }
In [11]:
nonzero_limits = [1, 10, 100, 1000]
configurations = product(dictionaries, nonzero_limits, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.builder_results.uniform")
The following tables show how long it takes to retrieve the most similar terms for all terms in a dictionary (the production_duration column) and the mean term similarity production speed (the production_speed column) as we vary the dictionary size (the dictionary_size column) and the maximum number of most similar terms that will be retrieved (the nonzero_limit column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
The production_speed is proportional to nonzero_limit: the per-query overhead of most_similar dominates, so retrieving additional term similarities per query is nearly free, and the dictionary size has little effect.
In [12]:
df = pd.DataFrame(results)
df["processing_speed"] = df.dictionary_size ** 2 / df.production_duration
df["production_speed"] = df.dictionary_size * df.nonzero_limit / df.production_duration
df = df.groupby(["dictionary_size", "nonzero_limit"])

def display(df):
    df["constructor_duration"] = [timedelta(0, duration) for duration in df["constructor_duration"]]
    df["production_duration"] = [timedelta(0, duration) for duration in df["production_duration"]]
    df["processing_speed"] = ["%.02f Kword pairs / s" % (speed / 1000) for speed in df["processing_speed"]]
    df["production_speed"] = ["%.02f Kword pairs / s" % (speed / 1000) for speed in df["production_speed"]]
    return df
In [13]:
display(df.mean()).loc[
    [1000, len(full_dictionary)], :, :].loc[
    :, ["production_duration", "production_speed"]]
Out[13]:
In [14]:
display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, len(full_dictionary)], :, :].loc[
    :, ["production_duration", "production_speed"]]
Out[14]:
Next, we measure the speed at which the LevenshteinSimilarityIndex builder class produces term similarities. LevenshteinSimilarityIndex is currently a naïve implementation that produces far fewer term similarities per second than the SparseTermSimilarityMatrix class is capable of consuming, as the sketch below helps explain.
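Internally, the index scores a term pair by the Levenshtein similarity $\alpha \cdot (1 - \mathrm{distance} / \mathrm{maximum\ length})^\beta$ and retrieves the topn best-scoring dictionary terms. Here is a minimal self-contained sketch of the scoring; the alpha${}=1.8$ and beta${}=5.0$ defaults are assumptions taken from the class documentation.
def levenshtein_distance(s, t):
    # Textbook dynamic-programming edit distance.
    previous = list(range(len(t) + 1))
    for i, s_char in enumerate(s, start=1):
        current = [i]
        for j, t_char in enumerate(t, start=1):
            current.append(min(
                previous[j] + 1,  # deletion
                current[j - 1] + 1,  # insertion
                previous[j - 1] + (s_char != t_char)))  # substitution
        previous = current
    return previous[-1]

def levenshtein_similarity(t1, t2, alpha=1.8, beta=5.0):
    # The term similarity that LevenshteinSimilarityIndex assigns to a term pair.
    return alpha * (1.0 - levenshtein_distance(t1, t2) / max(len(t1), len(t2))) ** beta

print(levenshtein_similarity("holiday", "holidays"))
Because most_similar computes this similarity against every term in the dictionary for each query, the production speed drops as the dictionary grows.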
In [15]:
def benchmark(configuration):
    dictionary, nonzero_limit, query_terms, repetition = configuration

    start_time = time()
    index = LevenshteinSimilarityIndex(dictionary)
    end_time = time()
    constructor_duration = end_time - start_time

    start_time = time()
    for term in query_terms:
        for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):
            pass
    end_time = time()
    production_duration = end_time - start_time

    return {
        "dictionary_size": len(dictionary),
        "mean_query_term_length": np.mean([len(term) for term in query_terms]),
        "nonzero_limit": nonzero_limit,
        "repetition": repetition,
        "constructor_duration": constructor_duration,
        "production_duration": production_duration, }
In [16]:
nonzero_limits = [1, 10, 100]
seed(RANDOM_SEED)
min_dictionary = sorted((len(dictionary), dictionary) for dictionary in dictionaries)[0][1]
query_terms = sample(list(min_dictionary.values()), 10)
configurations = product(dictionaries, nonzero_limits, [query_terms], repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.builder_results.levenshtein")
The following tables show how long it takes to retrieve the most similar terms for ten randomly sampled terms from a dictionary (the production_duration column), the mean term similarity production speed (the production_speed column) and the mean term similarity processing speed (the processing_speed column) as we vary the dictionary size (the dictionary_size column) and the maximum number of most similar terms that will be retrieved (the nonzero_limit column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
The production_speed is proportional to nonzero_limit / dictionary_size and the processing_speed is constant: the naïve implementation computes the Levenshtein distance between a query term and every term in the dictionary, so the number of term pairs processed per second stays fixed, while the time to answer a single query grows linearly with the dictionary size.
In [17]:
df = pd.DataFrame(results)
df["processing_speed"] = df.dictionary_size * len(query_terms) / df.production_duration
df["production_speed"] = df.nonzero_limit * len(query_terms) / df.production_duration
df = df.groupby(["dictionary_size", "nonzero_limit"])

def display(df):
    df["constructor_duration"] = [timedelta(0, duration) for duration in df["constructor_duration"]]
    df["production_duration"] = [timedelta(0, duration) for duration in df["production_duration"]]
    df["processing_speed"] = ["%.02f Kword pairs / s" % (speed / 1000) for speed in df["processing_speed"]]
    df["production_speed"] = ["%.02f word pairs / s" % speed for speed in df["production_speed"]]
    return df
In [18]:
display(df.mean()).loc[
    [1000, 1000000, len(full_dictionary)], :].loc[
    :, ["production_duration", "production_speed", "processing_speed"]]
Out[18]:
In [19]:
display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, 1000000, len(full_dictionary)], :].loc[
    :, ["production_duration", "production_speed", "processing_speed"]]
Out[19]:
Lastly, we measure the speed at which the WordEmbeddingSimilarityIndex builder class constructs an instance and produces term similarities. Gensim currently supports exact but slow nearest-neighbor search as well as approximate nearest-neighbor search using ANNOY; we evaluate both options.
In [20]:
def benchmark(configuration):
    (model, dictionary), nonzero_limit, annoy_n_trees, query_terms, repetition = configuration
    use_annoy = annoy_n_trees > 0
    model.init_sims()

    start_time = time()
    if use_annoy:
        annoy = AnnoyIndexer(model, annoy_n_trees)
        kwargs = {"indexer": annoy}
    else:
        kwargs = {}
    index = WordEmbeddingSimilarityIndex(model, kwargs=kwargs)
    end_time = time()
    constructor_duration = end_time - start_time

    start_time = time()
    for term in query_terms:
        for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):
            pass
    end_time = time()
    production_duration = end_time - start_time

    return {
        "dictionary_size": len(dictionary),
        "mean_query_term_length": np.mean([len(term) for term in query_terms]),
        "nonzero_limit": nonzero_limit,
        "use_annoy": use_annoy,
        "annoy_n_trees": annoy_n_trees,
        "repetition": repetition,
        "constructor_duration": constructor_duration,
        "production_duration": production_duration, }
In [21]:
models = []
for dictionary in tqdm(dictionaries, desc="models"):
    if dictionary == full_dictionary:
        models.append(full_model)
        continue
    model = full_model.__class__(full_model.vector_size)
    model.vocab = {word: deepcopy(full_model.vocab[word]) for word in dictionary.values()}
    model.index2entity = []
    vector_indices = []
    for index, word in enumerate(full_model.index2entity):
        if word in model.vocab.keys():
            model.index2entity.append(word)
            model.vocab[word].index = len(vector_indices)
            vector_indices.append(index)
    model.vectors = full_model.vectors[vector_indices]
    models.append(model)

annoy_n_trees = [0] + [10**k for k in range(3)]
seed(RANDOM_SEED)
query_terms = sample(list(min_dictionary.values()), 1000)

configurations = product(zip(models, dictionaries), nonzero_limits, annoy_n_trees, [query_terms], repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.builder_results.wordembeddings")
The following tables show how long it takes to construct an ANNOY index and the builder class instance (the constructor_duration column), how long it takes to retrieve the most similar terms for 1,000 randomly sampled terms from a dictionary (the production_duration column), the mean term similarity production speed (the production_speed column) and the mean term similarity processing speed (the processing_speed column) as we vary the dictionary size (the dictionary_size column), the maximum number of most similar terms that will be retrieved (the nonzero_limit column), and the number of constructed ANNOY trees (the annoy_n_trees column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
If we do not use ANNOY (annoy_n_trees${}=0$), then production_speed is proportional to nonzero_limit / dictionary_size. If we do use ANNOY (annoy_n_trees${}>0$), then production_speed is proportional to nonzero_limit / (annoy_n_trees)${}^{1/2}$.
In [22]:
df = pd.DataFrame(results)
df["processing_speed"] = df.dictionary_size * len(query_terms) / df.production_duration
df["production_speed"] = df.nonzero_limit * len(query_terms) / df.production_duration
df = df.groupby(["dictionary_size", "nonzero_limit", "annoy_n_trees"])

def display(df):
    df["constructor_duration"] = [timedelta(0, duration) for duration in df["constructor_duration"]]
    df["production_duration"] = [timedelta(0, duration) for duration in df["production_duration"]]
    df["processing_speed"] = ["%.02f Kword pairs / s" % (speed / 1000) for speed in df["processing_speed"]]
    df["production_speed"] = ["%.02f Kword pairs / s" % (speed / 1000) for speed in df["production_speed"]]
    return df
In [23]:
display(df.mean()).loc[
    [1000000, len(full_dictionary)], [1, 100], [0, 1, 100]].loc[
    :, ["constructor_duration", "production_duration", "production_speed", "processing_speed"]]
Out[23]:
In [24]:
display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000000, len(full_dictionary)], [1, 100], [0, 1, 100]].loc[
    :, ["constructor_duration", "production_duration", "production_speed", "processing_speed"]]
Out[24]:
In Gensim PR #1827, we added a base implementation of the soft cosine measure (SCM). The base implementation computed the SCM between single documents using the softcossim function. In Gensim PR #2016, we introduced the SparseTermSimilarityMatrix.inner_product method, which computes the SCM not only between single documents, but also between a document and a corpus, and between two corpora.
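For document vectors $x$ and $y$ in the bag-of-words space and a term similarity matrix $S$, inner_product computes $\langle x, y\rangle_S = x^\mathrm{T} S y$; with normalized${}={}$True, it computes the soft cosine similarity $x^\mathrm{T} S y / (\sqrt{x^\mathrm{T} S x} \sqrt{y^\mathrm{T} S y})$. Here is a minimal usage sketch on toy documents, with the dummy uniform index standing in for a real term similarity index:
from gensim.corpora import Dictionary
from gensim.similarities import SparseTermSimilarityMatrix, UniformTermSimilarityIndex

texts = [["hello", "world"], ["hi", "world"]]
toy_dictionary = Dictionary(texts)
toy_index = UniformTermSimilarityIndex(toy_dictionary)
toy_matrix = SparseTermSimilarityMatrix(toy_index, toy_dictionary)
vec1, vec2 = (toy_dictionary.doc2bow(text) for text in texts)
print(toy_matrix.inner_product(vec1, vec2, normalized=True))  # a single soft cosine similarity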
For the measurements, we use the Google News word embeddings distributed with the C implementation of Word2Vec. From the word embeddings, we derive a dictionary of 2.01M terms. As a corpus, we use a random sample of 100K articles from the 4.92M English Wikipedia articles.
In [25]:
full_model = api.load("word2vec-google-news-300")

try:
    with open("matrix_speed.corpus", "rb") as file:
        full_corpus = pickle.load(file)
except IOError:
    original_corpus = list(tqdm(api.load("wiki-english-20171001"), desc="original_corpus", total=4924894))
    seed(RANDOM_SEED)
    full_corpus = [
        simple_preprocess(u'\n'.join(article["section_texts"]))
        for article in tqdm(sample(original_corpus, 10**5), desc="full_corpus", total=10**5)]
    del original_corpus
    with open("matrix_speed.corpus", "wb") as file:
        pickle.dump(full_corpus, file)

try:
    full_dictionary = Dictionary.load("matrix_speed.dictionary")
except IOError:
    full_dictionary = Dictionary([[term] for term in full_model.vocab.keys()])
    full_dictionary.save("matrix_speed.dictionary")
In [26]:
def benchmark(configuration):
    (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration

    corpus_size = len(corpus)
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    corpus = [vec for vec in corpus if len(vec) > 0]

    start_time = time()
    for vec1 in corpus:
        for vec2 in corpus:
            matrix.inner_product(vec1, vec2, normalized=normalized)
    end_time = time()
    duration = end_time - start_time

    return {
        "dictionary_size": matrix.matrix.shape[0],
        "matrix_nonzero": matrix.matrix.nnz,
        "nonzero_limit": nonzero_limit,
        "normalized": normalized,
        "corpus_size": corpus_size,
        "corpus_actual_size": len(corpus),
        "corpus_nonzero": sum(len(vec) for vec in corpus),
        "mean_document_length": np.mean([len(doc) for doc in corpus]),
        "repetition": repetition,
        "duration": duration, }
In [27]:
seed(RANDOM_SEED)
dictionary_sizes = [1000, 100000]
dictionaries = []
for size in tqdm(dictionary_sizes, desc="dictionaries"):
    dictionary = Dictionary([sample(list(full_dictionary.values()), size)])
    dictionaries.append(dictionary)
min_dictionary = sorted((len(dictionary), dictionary) for dictionary in dictionaries)[0][1]

corpus_sizes = [100, 1000]
corpora = []
for size in tqdm(corpus_sizes, desc="corpora"):
    corpus = sample(full_corpus, size)
    corpora.append(corpus)

models = []
for dictionary in tqdm(dictionaries, desc="models"):
    if dictionary == full_dictionary:
        models.append(full_model)
        continue
    model = full_model.__class__(full_model.vector_size)
    model.vocab = {word: deepcopy(full_model.vocab[word]) for word in dictionary.values()}
    model.index2entity = []
    vector_indices = []
    for index, word in enumerate(full_model.index2entity):
        if word in model.vocab.keys():
            model.index2entity.append(word)
            model.vocab[word].index = len(vector_indices)
            vector_indices.append(index)
    model.vectors = full_model.vectors[vector_indices]
    models.append(model)

nonzero_limits = [1, 10, 100]
matrices = []
for (model, dictionary), nonzero_limit in tqdm(
        list(product(zip(models, dictionaries), nonzero_limits)), desc="matrices"):
    annoy = AnnoyIndexer(model, 1)
    index = WordEmbeddingSimilarityIndex(model, kwargs={"indexer": annoy})
    matrix = SparseTermSimilarityMatrix(index, dictionary, nonzero_limit=nonzero_limit)
    matrices.append((matrix, dictionary, nonzero_limit))
    del annoy

normalization = (True, False)
repetitions = range(10)
In [28]:
configurations = product(matrices, corpora, normalization, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.inner-product_results.doc_doc")
The following tables show how long it takes to compute the inner_product method between all document vectors in a corpus (the duration column), how many nonzero elements there are in a corpus matrix (the corpus_nonzero column), how many nonzero elements there are in a term similarity matrix (the matrix_nonzero column) and the mean document similarity production speed (the speed column) as we vary the dictionary size (the dictionary_size column), the size of the corpus (the corpus_size column), the maximum number of nonzero elements in a single column of the matrix (the nonzero_limit column), and whether the inner products are normalized (the normalized column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.
The speed is inversely proportional to the product of the numbers of unique terms in the two document vectors. In our scenario, as in the standard IR scenario, document lengths are bounded, which means the speed is effectively constant. Computing a normalized inner product (normalized${}={}$True) results in a constant speed decrease.
In [29]:
df = pd.DataFrame(results)
df["speed"] = df.corpus_actual_size**2 / df.duration
del df["corpus_actual_size"]
df = df.groupby(["dictionary_size", "corpus_size", "nonzero_limit", "normalized"])

def display(df):
    df["duration"] = [timedelta(0, duration) for duration in df["duration"]]
    df["speed"] = ["%.02f Kdoc pairs / s" % (speed / 1000) for speed in df["speed"]]
    return df
In [30]:
display(df.mean()).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]
Out[30]:
In [31]:
display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]
Out[31]:
In [32]:
def benchmark(configuration):
    (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration

    corpus_size = len(corpus)
    corpus = [dictionary.doc2bow(doc) for doc in corpus if doc]

    start_time = time()
    for vec in corpus:
        matrix.inner_product(vec, corpus, normalized=normalized)
    end_time = time()
    duration = end_time - start_time

    return {
        "dictionary_size": matrix.matrix.shape[0],
        "matrix_nonzero": matrix.matrix.nnz,
        "nonzero_limit": nonzero_limit,
        "normalized": normalized,
        "corpus_size": corpus_size,
        "corpus_actual_size": len(corpus),
        "corpus_nonzero": sum(len(vec) for vec in corpus),
        "mean_document_length": np.mean([len(doc) for doc in corpus]),
        "repetition": repetition,
        "duration": duration, }
In [33]:
configurations = product(matrices, corpora, normalization, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.inner-product_results.doc_corpus")
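The following tables show how long it takes to compute the inner_product method between a single document vector and a corpus matrix for every document in a corpus (the duration column), how many nonzero elements there are in a corpus matrix (the corpus_nonzero column), how many nonzero elements there are in a term similarity matrix (the matrix_nonzero column) and the mean document similarity production speed (the speed column) as we vary the dictionary size (the dictionary_size column), the size of the corpus (the corpus_size column), the maximum number of nonzero elements in a single column of the matrix (the nonzero_limit column), and whether the inner products are normalized (the normalized column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.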
The speed is inversely proportional to matrix_nonzero. Computing a normalized inner product (normalized${}={}$True) results in a constant speed decrease.
In [34]:
df = pd.DataFrame(results)
df["speed"] = df.corpus_actual_size**2 / df.duration
del df["corpus_actual_size"]
df = df.groupby(["dictionary_size", "corpus_size", "nonzero_limit", "normalized"])

def display(df):
    df["duration"] = [timedelta(0, duration) for duration in df["duration"]]
    df["speed"] = ["%.02f Kdoc pairs / s" % (speed / 1000) for speed in df["speed"]]
    return df
In [35]:
display(df.mean()).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]
Out[35]:
In [36]:
display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]
Out[36]:
In [37]:
def benchmark(configuration):
    (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration

    corpus_size = len(corpus)
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    corpus = [vec for vec in corpus if len(vec) > 0]

    start_time = time()
    matrix.inner_product(corpus, corpus, normalized=normalized)
    end_time = time()
    duration = end_time - start_time

    return {
        "dictionary_size": matrix.matrix.shape[0],
        "matrix_nonzero": matrix.matrix.nnz,
        "nonzero_limit": nonzero_limit,
        "normalized": normalized,
        "corpus_size": corpus_size,
        "corpus_actual_size": len(corpus),
        "corpus_nonzero": sum(len(vec) for vec in corpus),
        "mean_document_length": np.mean([len(doc) for doc in corpus]),
        "repetition": repetition,
        "duration": duration, }
In [38]:
nonzero_limits = [1000]

dense_matrices = []
for (model, dictionary), nonzero_limit in tqdm(
        list(product(zip(models, dictionaries), nonzero_limits)), desc="matrices"):
    annoy = AnnoyIndexer(model, 1)
    index = WordEmbeddingSimilarityIndex(model, kwargs={"indexer": annoy})
    matrix = SparseTermSimilarityMatrix(index, dictionary, nonzero_limit=nonzero_limit)
    dense_matrices.append((matrix, dictionary, nonzero_limit))  # append to dense_matrices, not matrices
    del annoy
In [39]:
configurations = product(matrices + dense_matrices, corpora + [full_corpus], normalization, repetitions)
results = benchmark_results(benchmark, configurations, "matrix_speed.inner-product_results.corpus_corpus")
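The following tables show how long it takes to compute the inner_product method between two corpus matrices (the duration column), how many nonzero elements there are in a corpus matrix (the corpus_nonzero column), how many nonzero elements there are in a term similarity matrix (the matrix_nonzero column) and the mean document similarity production speed (the speed column) as we vary the dictionary size (the dictionary_size column), the size of the corpus (the corpus_size column), the maximum number of nonzero elements in a single column of the matrix (the nonzero_limit column), and whether the inner products are normalized (the normalized column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.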
In [40]:
df = pd.DataFrame(results)
df["speed"] = df.corpus_actual_size**2 / df.duration
del df["corpus_actual_size"]
df = df.groupby(["dictionary_size", "corpus_size", "nonzero_limit", "normalized"])

def display(df):
    df["duration"] = [timedelta(0, duration) for duration in df["duration"]]
    df["speed"] = ["%.02f Kdoc pairs / s" % (speed / 1000) for speed in df["speed"]]
    return df
In [41]:
display(df.mean()).loc[
    [1000, 100000], :, [1, 10, 100, 1000], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]
Out[41]:
In [42]:
display(df.apply(lambda x: (x - x.mean()).std())).loc[
    [1000, 100000], :, [1, 100], :].loc[
    :, ["duration", "corpus_nonzero", "matrix_nonzero", "speed"]]
Out[42]: