This contains the implementations of the benchmarks described at http://jakevdp.github.com/blog/2012/08/08/memoryview-benchmarks.
Here we'll use ipython's cython magic to compile and run the benchmarks.
In [25]:
%load_ext cythonmagic
# Define our test array
import numpy as np
X = np.random.random((500, 3))
In [3]:
import numpy as np
def euclidean_distance(x1, x2):
x1 = np.asarray(x1)
x2 = np.asarray(x2)
return np.sqrt(np.sum((x1 - x2) ** 2))
def pairwise_v1(X, metric=euclidean_distance):
X = np.asarray(X)
n_samples, n_dim = X.shape
D = np.empty((n_samples, n_samples))
for i in range(n_samples):
for j in range(n_samples):
D[i, j] = metric(X[i], X[j])
return D
In [4]:
%timeit pairwise_v1(X)
In [4]:
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
# define a function pointer to a metric
ctypedef double (*metric_ptr)(np.ndarray, np.ndarray)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(np.ndarray[double, ndim=1, mode='c'] x1,
np.ndarray[double, ndim=1, mode='c'] x2):
cdef double tmp, d
cdef np.intp_t i, N
d = 0
N = x1.shape[0]
# assume x2 has the same shape as x1. This could be dangerous!
for i in range(N):
tmp = x1[i] - x2[i]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v2(np.ndarray[double, ndim=2, mode='c'] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples
n_samples = X.shape[0]
cdef np.ndarray[double, ndim=2, mode='c'] D = np.empty((n_samples,
n_samples))
for i in range(n_samples):
for j in range(n_samples):
D[i, j] = dist_func(X[i], X[j])
return D
In [5]:
%timeit pairwise_v2(X)
In [6]:
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
# define a function pointer to a metric
ctypedef double (*metric_ptr)(double[::1], double[::1])
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(double[::1] x1,
double[::1] x2):
cdef double tmp, d
cdef np.intp_t i, N
d = 0
N = x1.shape[0]
# assume x2 has the same shape as x1. This could be dangerous!
for i in range(N):
tmp = x1[i] - x2[i]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v3(double[:, ::1] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples
n_samples = X.shape[0]
cdef double[:, ::1] D = np.empty((n_samples, n_samples))
for i in range(n_samples):
for j in range(n_samples):
D[i, j] = dist_func(X[i], X[j])
return D
In [7]:
%timeit pairwise_v3(X)
In [8]:
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
# define a function pointer to a metric
ctypedef double (*metric_ptr)(double*, double*, int)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(double* x1,
double* x2,
int N):
cdef double tmp, d
cdef np.intp_t i
d = 0
for i in range(N):
tmp = x1[i] - x2[i]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v4(double[:, ::1] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples, n_dim
n_samples = X.shape[0]
n_dim = X.shape[1]
cdef double[:, ::1] D = np.empty((n_samples, n_samples))
cdef double* Dptr = &D[0, 0]
cdef double* Xptr = &X[0, 0]
for i in range(n_samples):
for j in range(n_samples):
Dptr[i * n_samples + j] = dist_func(Xptr + i * n_dim,
Xptr + j * n_dim,
n_dim)
return D
In [9]:
%timeit pairwise_v4(X)
In [14]:
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
# define a function pointer to a metric
ctypedef double (*metric_ptr)(double[:, ::1], np.intp_t, np.intp_t)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(double[:, ::1] X,
np.intp_t i1, np.intp_t i2):
cdef double tmp, d
cdef np.intp_t j
d = 0
for j in range(X.shape[1]):
tmp = X[i1, j] - X[i2, j]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v5(double[:, ::1] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples, n_dim
n_samples = X.shape[0]
n_dim = X.shape[1]
cdef double[:, ::1] D = np.empty((n_samples, n_samples))
for i in range(n_samples):
for j in range(n_samples):
D[i, j] = dist_func(X, i, j)
return D
In [28]:
%timeit pairwise_v5(X)
for more: http://docs.cython.org/src/userguide/parallelism.html
In [26]:
%%cython --compile-args=-fopenmp --link-args=-fopenmp --force
from cython.parallel import prange
cimport numpy as np
from libc.math cimport sqrt
cimport cython
import numpy as np
# define a function pointer to a metric
ctypedef double (*metric_ptr)(double[:, ::1], np.intp_t, np.intp_t) nogil
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double euclidean_distance(double[:, ::1] X,
np.intp_t i1, np.intp_t i2) nogil:
cdef double tmp, d
cdef np.intp_t j
d = 0
for j in range(X.shape[1]):
tmp = X[i1, j] - X[i2, j]
d += tmp * tmp
return sqrt(d)
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v6(double[:, ::1] X not None,
metric = 'euclidean'):
cdef metric_ptr dist_func
if metric == 'euclidean':
dist_func = &euclidean_distance
else:
raise ValueError("unrecognized metric")
cdef np.intp_t i, j, n_samples, n_dim
n_samples = X.shape[0]
n_dim = X.shape[1]
cdef double[:, ::1] D = np.empty((n_samples, n_samples))
for i in prange(n_samples, nogil=True, schedule='guided'):
for j in range(n_samples):
D[i, j] = dist_func(X, i, j)
return D
In [27]:
%timeit pairwise_v6(X)
Here we'll compare the benchmark to two similar routines from scipy
and scikit-learn
In [12]:
from scipy.spatial.distance import cdist
%timeit cdist(X, X)
In [13]:
from sklearn.metrics import pairwise_distances
%timeit pairwise_distances(X, X)