This is a test of a Python implementation of Kernelized Locality Sensitive Hashing (KLSH), first reported in Kulis & Grauman 2009. The Python code is available at http://github.com/jakevdp/klsh/
In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
from klsh import KernelLSH
from klsh.utils import timeit
Let's search for the $k=1$ nearest neighbor among 10000 points, using 1000 query points.
In [2]:
# Build a reproducible synthetic dataset: a 10000-point database X and a
# 1000-point query set Y, both drawn from a 10-D standard normal.
rng = np.random.RandomState(0)
X = rng.standard_normal((10000, 10))
Y = rng.standard_normal((1000, 10))
In [3]:
# Time the training phase (building the hash tables over X).
# NOTE: indentation of this cell was lost in export; the print is placed
# after the timed region so the timing message appears first.
with timeit("Time to train hashing: {0:.2g} sec"):
    klsh = KernelLSH(nbits=64, epsilon=0.5,
                     subspace_size=300, sample_size=30,
                     random_state=0)
    klsh.fit(X)
print("kernel evaluations: {0:.2g}".format(klsh.kernel_evaluations))
In [4]:
# Time the approximate (hash-based) query, and report only the kernel
# evaluations triggered by the query itself (delta from the running count).
with timeit("Time for approximate query on 1000 pts: {0:.2g} sec"):
    evals = klsh.kernel_evaluations
    nbrs_approx = klsh.query(Y, 1)
print("kernel evaluations: {0:.2g}".format(klsh.kernel_evaluations - evals))
In [5]:
# Time the exact query (linear scan over all of X) for comparison,
# again reporting only the kernel evaluations used by the query.
with timeit("Time for linear scan query on 1000 pts: {0:.2g} sec"):
    evals = klsh.kernel_evaluations
    nbrs_true = klsh.query_brute(Y, 1)
print("kernel evaluations: {0:.2g}".format(klsh.kernel_evaluations - evals))
In [6]:
# Accuracy = fraction of queries whose approximate neighbor matches
# the exact (brute-force) neighbor.
acc = np.mean(nbrs_approx == nbrs_true)
print("Accuracy of approximate neighbors: {0:.2g}".format(acc))
Let's see how the various tuning parameters affect the accuracy and the number of kernel evaluations
In [7]:
def test_variation(X, Y, nbits, epsilon, subspace_size, sample_size,
                   kernel='linear'):
    """Sweep KLSH tuning parameters and measure cost vs. accuracy.

    The four hashing parameters are broadcast against one another, so any
    one of them may be a sequence while the rest are scalars.

    Parameters
    ----------
    X, Y : array_like
        Database points and query points.
    nbits, epsilon, subspace_size, sample_size : scalar or sequence
        KernelLSH tuning parameters (broadcast together).
    kernel : str or callable, default 'linear'
        Kernel passed through to KernelLSH.

    Returns
    -------
    kernel_evals : ndarray of int
        Kernel evaluations used by the approximate query, per setting.
    accuracy : ndarray of float
        Fraction of queries matching the brute-force neighbor, per setting.
    """
    params = np.broadcast(nbits, epsilon, subspace_size, sample_size)
    kernel_evals = np.zeros(params.shape, dtype=int)
    accuracy = np.zeros(params.shape, dtype=float)

    # Ground truth from a single brute-force scan (depends only on kernel).
    nbrs_true = KernelLSH(kernel=kernel).fit(X).query_brute(Y, 1)

    for idx, (nb, eps, ssize, samp) in enumerate(params):
        model = KernelLSH(kernel=kernel, nbits=nb, epsilon=eps,
                          subspace_size=ssize, sample_size=samp,
                          random_state=0)
        model.fit(X)
        model.kernel_evaluations = 0  # count query-time evaluations only
        nbrs_approx = model.query(Y, 1)
        kernel_evals.flat[idx] = model.kernel_evaluations
        accuracy.flat[idx] = np.mean(nbrs_approx == nbrs_true)
    return kernel_evals, accuracy
In [8]:
# Baseline hashing parameters; each sweep below varies exactly one of
# these while holding the others at their default value.
labels = ['epsilon', 'nbits', 'subspace_size', 'sample_size']
default_kwds = {'nbits': 64, 'epsilon': 0.5,
                'subspace_size': 300, 'sample_size': 30}
varying_kwds = {'nbits': [8, 16, 32, 64, 128, 256, 300],
                'epsilon': [0.0, 0.1, 0.2, 0.5, 1.0, 1.5],
                'subspace_size': [150, 200, 250, 300, 350, 400],
                'sample_size': [10, 20, 30, 40, 50, 60]}
def compute_results(kernel='linear'):
    """Run every one-parameter sweep for a given kernel.

    Returns a dict mapping each parameter label to the
    (kernel_evals, accuracy) pair from ``test_variation``, plus the
    kernel under the 'kernel' key.
    """
    results = {'kernel': kernel}
    for label in labels:
        # Start from the defaults, then override the swept parameter.
        kwds = dict(default_kwds, kernel=kernel)
        kwds[label] = varying_kwds[label]
        with timeit("Time to test varying " + label + ": {0:.2g} sec"):
            results[label] = test_variation(X, Y, **kwds)
    return results
def plot_results(results):
    """Plot each parameter sweep: kernel evaluations (top row, log scale)
    and accuracy (bottom row), one column per swept parameter."""
    fig, ax = plt.subplots(2, 4, figsize=(10.5, 4),
                           sharex='col', sharey='row')
    for i, label in enumerate(labels):
        xvals = varying_kwds[label]
        evals, acc = results[label]
        ax[0, i].semilogy(xvals, evals, '-k')
        ax[1, i].plot(xvals, acc, '-k')
        for row in (0, 1):
            # Mark the default value held fixed in the other sweeps.
            ax[row, i].axvline(default_kwds[label],
                               linestyle='dotted', color='gray')
        ax[1, i].set_xlabel(label)
        ax[1, i].xaxis.set_major_locator(plt.MaxNLocator(5))
    ax[0, 0].set_ylim(1E5, 1E7)
    ax[0, 0].set_xlim(0, 1.5)
    ax[0, 0].set_ylabel('kernel evaluations')
    ax[1, 0].set_ylabel('accuracy')
    fig.suptitle("{0} kernel".format(results['kernel']))
In [9]:
# Sweep each tuning parameter with the linear kernel and plot the results.
results_linear = compute_results(kernel='linear')
plot_results(results_linear)
Evidently, epsilon and nbits are the most important parameters for the accuracy of the result; epsilon in particular encodes the tradeoff between compute time (number of kernel evaluations) and accuracy.
Let's check the same benchmark for some other common kernels.
In [10]:
# Repeat the sweep with an RBF (Gaussian) kernel.
results_rbf = compute_results('rbf')
plot_results(results_rbf)
Very similar to the linear kernel.
In [11]:
# Repeat the sweep with a polynomial kernel.
results_poly = compute_results('poly')
plot_results(results_poly)
Again, very similar behavior.
In [12]:
# Repeat the sweep with a cosine-similarity kernel.
results_cosine = compute_results('cosine')
plot_results(results_cosine)
The method is much more accurate for the cosine kernel: this is likely because the hashing algorithm is itself based on a cosine similarity.
In [13]:
# Repeat the sweep with a sigmoid kernel.
results_sigmoid = compute_results('sigmoid')
plot_results(results_sigmoid)
Again, similar results, but I'm not entirely sure what causes the strange artifact in the subspace_size panel.
In [14]:
# Sweep with the project's cross-correlation kernel. Store the result
# under its own name (consistent with results_linear, results_rbf, ...)
# instead of a generic `results` that a later cell would shadow.
from klsh.kernels import crosscorr_kernel
results_crosscorr = compute_results(crosscorr_kernel)
plot_results(results_crosscorr)
In [15]:
# Sweep with the cross-correlation similarity kernel. As above, use a
# distinct result name so it does not shadow earlier sweep results.
from klsh.kernels import crosscorr_similarity
results_crosscorr_sim = compute_results(crosscorr_similarity)
plot_results(results_crosscorr_sim)