Running faster your code

  • Discrete signal energy can be computed as a particular case of the dot product when both signals are the same: $$ ~\\ E_{s} \ \ = \ \ \langle x(n), x(n)\rangle \ \ = \sum_{n}{|x(n)|^2} = \sum_{n}{x(n)y(n)}$$ Let's see how to speed up this operation.

1 Vectorize


In [ ]:
import numpy as np

def non_vectorized_dot_product(x, y):
    """Return the sum of x[i] * y[j] for all pairs of indices i, j.

    Example:
    
        >>> my_dot_product(np.arange(20), np.arange(20))
    
    """
    result = 0
    for i in range(len(x)):
        result += x[i] * y[i]
    return result

signal = np.random.random(1000)
#print(signal)

In [ ]:
%timeit non_vectorized_dot_product(signal, signal)

In [ ]:
non_vectorized_dot_product(signal, signal)

Now, using Numpy's array multiplication and sum:


In [ ]:
%timeit np.sum(signal*signal)

In [ ]:
np.sum(signal*signal)
  • Another example to see that vectorization not only involves pure computation:

In [ ]:
# https://softwareengineering.stackexchange.com/questions/254475/how-do-i-move-away-from-the-for-loop-school-of-thought
def cleanup(x, missing=-1, value=0):
    """Return an array that's the same as x, except that where x ==
    missing, it has value instead.

    >>> cleanup(np.arange(-3, 3), value=10)
    ... # doctest: +NORMALIZE_WHITESPACE
    array([-3, -2, 10, 0, 1, 2])

    """
    result = []
    for i in range(len(x)):
        if x[i] == missing:
            result.append(value)
        else:
            result.append(x[i])
    return np.array(result)

array = np.arange(-8,8)
print(array)
print(cleanup(array, value=10, missing=0))

In [ ]:
array = np.arange(-1000,1000)
%timeit cleanup(array, value=10, missing=0)
print(array[995:1006])
print(cleanup(array, value=10, missing=0)[995:1006])

In [ ]:
# http://www.secnetix.de/olli/Python/list_comprehensions.hawk
# https://docs.python.org/3/library/functions.html#zip
value = [10]*2000
%timeit [xv if c else yv for (c,xv,yv) in zip(array == 0, value, array)]
print([xv if c else yv for (c,xv,yv) in zip(array == 0, value, array)][995:1006])

In [ ]:
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.where.html
%timeit np.where(array == 0, 10, array)
print(np.where(array == 0, 10, array)[995:1006])

In [ ]:
from math import sin
import numpy as np

arr = np.arange(1000000)
%timeit [sin(i)**2 for i in arr]

In [ ]:
%timeit np.sin(arr)**2

2 Use in-place operations


In [ ]:
a = np.random.random(500000)
print(a[0:10])
b = np.copy(a)
%timeit global a; a = 10*a
a = 10*a
print(a[0:10])

In [ ]:
a = np.copy(b)
print(a[0:10])
%timeit global a ; a *= 10
a *= 10
print(a[0:10])

3 Maximize locality in memory acess


In [ ]:
a = np.random.rand(100,50)
b = np.copy(a)

In [ ]:
def mult(x, val):
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            x[i][j] /= val
%timeit -n 1 -r 1 mult(a, 10)

In [ ]:
a = np.copy(b)

def mult2(x, val):
    for j in range(x.shape[1]):
        for i in range(x.shape[0]):
            x[i][j] /= val
            
%timeit -n 1 -r 1 mult2(a, 10)

In [ ]:
# http://www.scipy-lectures.org/advanced/optimizing/
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sum.html

In [ ]:
c = np.zeros((1000, 1000), order='C')

In [ ]:
%timeit c.sum(axis=0)
c.sum(axis=0).shape

In [ ]:
%timeit c.sum(axis=1)
c.sum(axis=1).shape

4 Delegate in C

When you want to speed-up your code or simply when you need to reuse C code, it is possible to use it from Python. There are several alternatives:

  1. Cython: A superset of Python to allow you call C functions and load Python variables with C ones.
  2. SWIG (Simplified Wrapper Interface Generator): A software development tool to connect C/C++ programs with other languages (included Python).
  3. Ctypes: A Python package that can be used to call shared libraries (.ddl/.so/.dylib) from Python.
  4. Python-C-API: A low-level interface between (compiled) C code and Python.

A function to optimize:


In [ ]:
!cat sum_array_lib.py

In [ ]:
# Please, restart the kernel to ensure that the module sum_array_lib is re-loaded
!rm -f sum_array_lib.cpython*.so
import sum_array_lib
import array as arr
a = arr.array('d', [i for i in range(100000)])
#a = [1 for i in range(100000)]
%timeit sum_array_lib.sum_array(a, len(a))
sum = sum_array_lib.sum_array(a, len(a))
print(sum)

Working flow:

      Cython compiler        C compiler
.pyx -----------------> .c --------------> .so

Installation

$ pip install Cython

Compilation of pure Python code:


In [ ]:
!cp sum_array_lib.py sum_array_lib.pyx

In [ ]:
!cat sum_array_lib.pyx

In [ ]:
!cat Cython/basic/setup.py

In [13]:
!rm -f sum_array_lib.cpython*.so
!python Cython/basic/setup.py build_ext --inplace


running build_ext
building 'sum_array_lib' extension
gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/home/vruiz/.pyenv/versions/3.6.4/include/python3.6m -c sum_array_lib.c -o build/temp.linux-x86_64-3.6/sum_array_lib.o
gcc -pthread -shared -L/home/vruiz/.pyenv/versions/3.6.4/lib -L/home/vruiz/.pyenv/versions/3.6.4/lib build/temp.linux-x86_64-3.6/sum_array_lib.o -o /home/vruiz/YAPT/sum_array_lib.cpython-36m-x86_64-linux-gnu.so

In [ ]:
# Please, restart the kernel to ensure that the module sum_array_lib is re-loaded
import sum_array_lib
import array as arr
a = arr.array('d', [i for i in range(100000)])
#a = [1.1 for i in range(100000)]
%timeit sum_array_lib.sum_array(a, len(a))
sum = sum_array_lib.sum_array(a, len(a))
print(sum)

Defining C types:


In [ ]:
!cat Cython/cdef/sum_array_lib.pyx

In [17]:
!cat Cython/cdef/setup.py


from distutils.core import setup
from Cython.Build import cythonize
#import numpy

setup(
    ext_modules=cythonize("Cython/cdef/sum_array_lib.pyx"),
    #include_dirs=[numpy.get_include()]
)

In [18]:
# Please, restart the kernel to ensure that the module sum_array_lib is re-loaded
!rm sum_array_lib.cpython*.so
!python Cython/cdef/setup.py build_ext --inplace


running build_ext
building 'sum_array_lib' extension
gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/home/vruiz/.pyenv/versions/3.6.4/include/python3.6m -c Cython/cdef/sum_array_lib.c -o build/temp.linux-x86_64-3.6/Cython/cdef/sum_array_lib.o
gcc -pthread -shared -L/home/vruiz/.pyenv/versions/3.6.4/lib -L/home/vruiz/.pyenv/versions/3.6.4/lib build/temp.linux-x86_64-3.6/Cython/cdef/sum_array_lib.o -o /home/vruiz/YAPT/sum_array_lib.cpython-36m-x86_64-linux-gnu.so

In [1]:
# Please, restart the kernel to ensure that the module sum_array_lib is re-loaded
import array as arr
import sum_array_lib
#import numpy as np
#a = np.arange(100000)
a = arr.array('d', [i for i in range(100000)])
%timeit sum_array_lib.sum_array(a, len(a))
print(sum)


193 µs ± 27.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
<built-in function sum>

4.2 Python-C

Python-C-API is the most flexible and efficient alternative, but also the hardest to code.

The C code to reuse in Python


In [16]:
!cat sum_array_lib.c


double sum_array(double* a, int N) {
  int i;
  double sum = 0;
  for(i=0; i<N; i++) {
    sum += *a+i;
  }
  return sum;
}

In [5]:
!cat sum_array.c


#include <stdio.h>
#include <time.h>
#include "sum_array_lib.c"

#define N 100000

int main() {
  double a[N];
  int i;
  clock_t start, end;
  double cpu_time;
  for(i=0; i<N; i++) {
    a[i] = i;
  }
  start = clock();
  double sum = sum_array(a,N);
  end = clock();
  printf("%f ", sum);
  cpu_time = ((double) (end - start)) / CLOCKS_PER_SEC;
  cpu_time *= 1000000;
  printf("%f usegs\n", cpu_time);
}

In [6]:
!gcc -O3 sum_array.c -o sum_array
!./sum_array


4999950000.000000 178.000000 usegs

The module


In [7]:
!cat sum_array_module.c


#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <Python.h>            /* Compulsory in every module */
#include <numpy/arrayobject.h> /* To interact with numpy arrays */
#include "sum_array_lib.c"

static PyObject* sumArray(PyObject* self, PyObject* args) {
  int N;
  double sum;
  //int* a;
  PyArrayObject *in_array;
  
  clock_t start, end;
  double cpu_time;

  /*  parse the input */
  //if (!PyArg_ParseTuple(args, "i", &N))
  if (!PyArg_ParseTuple(args, "O!", &PyArray_Type, &in_array))
    return NULL;
  /* if the above function returns -1, an appropriate Python exception will
   * have been set, and the function simply returns NULL
   */

  N = PyArray_DIM(in_array, 0);
  printf("array size %d\n", N);

  npy_double* data  = (npy_double*)PyArray_DATA(in_array);
  //a = (int*)malloc(N*sizeof(int));
  //if (!a) return NULL;
  
  /*for(i=0; i<N; i++) {
    data[i] = i;
    }*/

  start = clock();
  sum = sum_array(data, N);
  end = clock();
  cpu_time = ((double) (end - start)) / CLOCKS_PER_SEC;
  cpu_time *= 1000000;
  printf("%f usegs\n", cpu_time);
  
  /*  construct the output (https://docs.python.org/3/c-api/arg.html) */
  return Py_BuildValue("d", sum);
}

/*  define functions in module */
static PyMethodDef module_methods[] = {
  {"sumArray", sumArray, METH_VARARGS, "Computes the sum of all elements of an array"},
  {NULL, NULL, 0, NULL}
};

/* module initialization */
/* Python version 3*/
static struct PyModuleDef cModPyDem = {
    PyModuleDef_HEAD_INIT,
    "sumArray", "Computes the sum of all elements of an array",
    -1,
    module_methods
};

PyMODINIT_FUNC
PyInit_sum_array_module(void) {
  import_array();
  return PyModule_Create(&cModPyDem);
}

Module compilation


In [8]:
!cat setup.py


from distutils.core import setup, Extension
import numpy.distutils.misc_util

# define the extension module
sum_array_module = Extension(
    'sum_array_module',
    sources=['sum_array_module.c'],
    include_dirs=numpy.distutils.misc_util.get_numpy_include_dirs()
)

# run the setup
setup(
    ext_modules=[sum_array_module],
)

In [2]:
!python setup.py build_ext --inplace


running build_ext

In [10]:
import sum_array_module
import numpy as np
a = np.arange(100000)
%timeit sum_array_module.sumArray(a)
print(sum)


177 µs ± 1.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
<built-in function sum>

However, remember: vectorize when possible!


In [11]:
%timeit np.sum(a)
print(sum)


80.2 µs ± 3.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
<built-in function sum>