A comparison of dense and sparse matrix-vector multiplication routines


In [28]:
import numpy as np
import scipy.sparse
%load_ext cython

Setup


In [2]:
p = 0.01
Nc, Na = 10000, 200
c = np.ones(Nc)
a = np.ones(Na)
K = np.random.random((Nc, Na)) < p
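
K is a random boolean "connectivity" matrix: each entry is nonzero with probability p, so it holds roughly p * Nc * Na = 20,000 nonzeros out of 2,000,000 entries. A quick sanity check (exact counts vary run to run):

print(K.sum(), K.mean())  # roughly 20000 nonzeros, mean close to p = 0.01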

Dense matrix-vector multiplication


In [3]:
%timeit K.dot(a)


100 loops, best of 3: 3.06 ms per loop

In [4]:
%timeit c.dot(K)


100 loops, best of 3: 3.16 ms per loop

Sparse matrix-vector multiplication


In [5]:
Ksp = scipy.sparse.csr_matrix(K)

In [6]:
%timeit scipy.sparse.csr_matrix(K)


100 loops, best of 3: 6.15 ms per loop
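
Note that the conversion to CSR alone costs about twice a dense product (6.15 ms vs 3.06 ms), so it only pays off when the matrix is reused. For reference, CSR stores a matrix as three flat arrays: the nonzero values (data), their column indices (indices), and row pointers (indptr). A minimal sketch on a toy matrix:

M = scipy.sparse.csr_matrix(np.array([[1., 0., 2.],
                                      [0., 0., 3.]]))
print(M.data)     # [ 1.  2.  3.]
print(M.indices)  # [0 2 2]  -- column index of each stored value
print(M.indptr)   # [0 2 3]  -- row i occupies data[indptr[i]:indptr[i+1]]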

In [7]:
np.all(Ksp.dot(a) == K.dot(a))


Out[7]:
True

In [8]:
%timeit Ksp.dot(a)


10000 loops, best of 3: 101 µs per loop
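
The roughly 30x speed-up over the dense product (101 µs vs 3.06 ms) matches the nonzero count: the CSR product only touches the ~20,000 stored entries instead of all 2,000,000:

print(Ksp.nnz, Nc * Na)  # ~20000 stored entries vs 2000000 total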

In [9]:
np.all(Ksp.transpose(copy=False).dot(c) == c.dot(K))


Out[9]:
True

In [10]:
%timeit Ksp.transpose(copy=False).dot(c)


10000 loops, best of 3: 137 µs per loop
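
Here Ksp.transpose(copy=False) does not move any data: scipy reinterprets the CSR arrays as a CSC matrix of the transposed shape, so the left-multiplication c.dot(K) costs essentially one more sparse product:

KspT = Ksp.transpose(copy=False)
print(type(KspT))  # a CSC matrix sharing the underlying CSR arrays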

In [11]:
csp = scipy.sparse.csr_matrix(c)
%timeit csp.dot(Ksp)


1000 loops, best of 3: 333 µs per loop
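
Representing the vector itself as a 1 x Nc sparse matrix is slower here: this goes through the sparse-sparse multiplication path, and the result comes back as a sparse matrix rather than an ndarray, so a dense conversion usually follows anyway:

out = csp.dot(Ksp)
print(type(out))           # a sparse matrix of shape (1, Na)
dense_out = out.toarray()  # extra step that the dense-vector path avoids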

Sparse matrix-vector multiplication using MKL

Intel's Math Kernel Library (MKL) provides BLAS routines for sparse matrices. Below we call these routines via ctypes and compare their speed against scipy's sparse routines (scipy implements its sparse products itself and does not call MKL for them, even when scipy is linked against MKL).
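
To see which BLAS/LAPACK libraries your own numpy and scipy builds link against, the config printers are handy (output format varies across versions):

np.__config__.show()
scipy.__config__.show()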


In [22]:
# inspired by http://stackoverflow.com/questions/17158893/does-scipy-support-multithreading-for-sparse-matrix-multiplication-when-using-mk
# and https://github.com/afedynitch/MCEq/blob/master/MCEq/kernels.py

from ctypes import POINTER, c_int, c_char, c_double, byref, cdll

def SpMV_viaMKL(A, x, trans=False):
    """
    Wrapper around Intel's sparse matrix-vector multiplication routine
    (mkl_dcsrmv). Handles rectangular CSR matrices.
    """
    mkl = cdll.LoadLibrary("libmkl_rt.so")
    mkl.mkl_set_num_threads(byref(c_int(4)))

    SpMV = mkl.mkl_dcsrmv
    (m, k) = A.shape

    # CSR arrays as C pointers; pb/pe are MKL's row-begin/row-end pointers
    data    = A.data.ctypes.data_as(POINTER(c_double))
    pb = A.indptr[:-1].ctypes.data_as(POINTER(c_int))
    pe = A.indptr[1:].ctypes.data_as(POINTER(c_int))
    indices = A.indices.ctypes.data_as(POINTER(c_int))

    # Allocate output, using same conventions as input
    insize = m if trans else k
    outsize = k if trans else m
    y = np.empty(outsize, dtype=np.double)
    if x.size != insize:
        raise ValueError("x has %d entries, expected %d" % (x.size, insize))

    # MKL expects double precision; convert the input if necessary
    if x.dtype.type is not np.double:
        x = x.astype(np.double, copy=True)

    np_x = x.ctypes.data_as(POINTER(c_double))
    np_y = y.ctypes.data_as(POINTER(c_double))
    # Call MKL; the result is written into y through np_y
    alpha = c_double(1.0)
    beta = c_double(0.0)
    # matdescra: 'G' = general matrix, 'C' = zero-based (C-style) indexing
    npmatd = np.chararray(6)
    npmatd[0] = 'G'
    npmatd[3] = 'C'
    matdescra = npmatd.ctypes.data_as(POINTER(c_char))
    SpMV(byref(c_char("T" if trans else "N")), byref(c_int(m)), byref(c_int(k)),
         byref(alpha), matdescra, data, indices, pb, pe, np_x, byref(beta), np_y)
    return y
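
One caveat for the timings below: the wrapper calls cdll.LoadLibrary and mkl_set_num_threads on every invocation. dlopen reference-counts already-loaded libraries, so the repeated load is cheap, but hoisting both to module level is the natural micro-optimization:

# load once at module level (assumes libmkl_rt.so is on the loader path)
_mkl = cdll.LoadLibrary("libmkl_rt.so")
_mkl.mkl_set_num_threads(byref(c_int(4)))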

In [13]:
Kfloat = K.astype(np.float64)  # MKL's dcsrmv expects double precision
Kfloatsp = scipy.sparse.csr_matrix(Kfloat)

In [14]:
np.all(SpMV_viaMKL(Kfloatsp, a) == Ksp.dot(a))


Out[14]:
True

In [23]:
%timeit SpMV_viaMKL(Kfloatsp, a)


The slowest run took 9.17 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 190 µs per loop

In [16]:
np.all(SpMV_viaMKL(Kfloatsp, c, True)  == c.dot(K))


Out[16]:
True

In [24]:
%timeit SpMV_viaMKL(Kfloatsp, c, True)


1000 loops, best of 3: 143 µs per loop

In [25]:
%prun [SpMV_viaMKL(Kfloatsp, c, True) for i in range(1000)]



In [79]:
%%cython -l mkl_core -l mkl_intel_lp64
cimport numpy as np
import numpy as np

cdef extern from "mkl_types.h":
    ctypedef MKL_INT

cdef extern from "mkl.h" nogil:
    double cblas_dasum (MKL_INT n, double *x, MKL_INT incx);

def cythonSpMV_viaMKL(np.ndarray[np.double_t] x):
    """
    Wrapper to Intel's Sparse Matrix-Vector multiplaction routine.
    Handles rectangular matrices
    """
    
    #cdef MKL_INT n = x.shape[0]
    #cdef MKL_INT incx = 1
    return 2#cblas_dasum(n, &x[0], incx)


---------------------------------------------------------------------------
CompileError                              Traceback (most recent call last)
<ipython-input-79-e080b41e1e68> in <module>()
----> 1 get_ipython().run_cell_magic(u'cython', u'-l mkl_core -l mkl_intel_lp64', ...)

[... IPython and distutils build frames omitted ...]

CompileError: command 'gcc' failed with exit status 1
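
The failure above is most likely missing MKL header and library paths rather than a problem with the approach itself (the timing in In [48] below is apparently from an earlier, successfully compiled version of this cell). A sketch of how the cell might compile, with explicit paths (the /opt/intel locations are assumptions; adjust for your installation), the single mkl_rt dispatch library, and a complete ctypedef for MKL_INT:

%%cython -I /opt/intel/mkl/include -L /opt/intel/mkl/lib/intel64 -l mkl_rt
cimport numpy as np
import numpy as np

cdef extern from "mkl.h" nogil:
    ctypedef int MKL_INT  # 32-bit under the LP64 interface
    double cblas_dasum(MKL_INT n, double *x, MKL_INT incx)

def mkl_dasum(np.ndarray[np.double_t] x):
    """Sum of absolute values via MKL's cblas_dasum, as a linkage test."""
    cdef MKL_INT n = x.shape[0]
    cdef MKL_INT incx = 1
    return cblas_dasum(n, &x[0], incx)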

In [48]:
%timeit cythonSpMV_viaMKL(Kfloatsp, c, True)


The slowest run took 42.08 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 177 µs per loop

Conclusions

Using sparse matrices can provide huge speed-ups even at moderate density in high dimensions: at 1% density, scipy's CSR product takes ~100 µs against ~3 ms for the dense product, a roughly 30x gain. Comparing implementations, there is a cross-over, with scipy's own sparse routines performing better for smaller matrices (as here) and the MKL routines winning for larger ones.
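
A minimal sketch to probe that cross-over, reusing the definitions above and sweeping the number of rows (the cross-over point is machine and MKL-version dependent):

import timeit

for Nc_test in (1000, 10000, 100000):
    Kt = scipy.sparse.csr_matrix(
        (np.random.random((Nc_test, Na)) < p).astype(np.float64))
    at = np.ones(Na)
    t_scipy = min(timeit.repeat(lambda: Kt.dot(at), number=100, repeat=3)) / 100
    t_mkl = min(timeit.repeat(lambda: SpMV_viaMKL(Kt, at), number=100, repeat=3)) / 100
    print(Nc_test, t_scipy, t_mkl)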