In [1]:
from __future__ import division, print_function
Please compile the shared libraries by running the Makefile:
In [2]:
!make
Import the multiprecision library mpmath as a reference for accuracy benchmarks:
In [3]:
import mpmath
Import the rest of the usual lot:
In [4]:
import numpy as np
import scipy
import sys
import time
from collections import OrderedDict
%load_ext Cython
In [5]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LogNorm
%matplotlib inline
sns.set_style("white")
cmap = matplotlib.colors.ListedColormap(sns.color_palette("Blues", 256))
cmap2 = matplotlib.colors.ListedColormap(sns.cubehelix_palette(8, 256))
matplotlib.rcParams['mathtext.fontset'] = 'custom'
matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
A Timer class for more convenient timing:
In [6]:
class Timer(object):
    timing_fct = time.time
    def __init__(self):
        self.timing_fct = Timer.timing_fct  # saves one lookup to class
    def __enter__(self):
        self.start = Timer.timing_fct()
        return self
    def __exit__(self, *args):
        self.end = Timer.timing_fct()
        self.interval_s = self.end - self.start
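A minimal usage sketch of the class (illustrative workload only, not part of the benchmark):
with Timer() as t:
    _ = sum(xrange(1000000))  # arbitrary work to be timed
print('this took {0:.6f} s'.format(t.interval_s))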
The dictionary wofz_impl accesses the various function implementations.
The input interface is wofz_impl[<implementation language>](x, y):
x is the real part and y the imaginary part of the input; both should be numpy arrays (i.e. provide the ctypes field).
In [7]:
wofz_impl = OrderedDict()
set the precision:
In [8]:
mpmath.mp.dps = 50
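With dps = 50, mpmath carries roughly 50 significant decimal digits; a quick illustration (not part of the benchmark):
print(mpmath.sqrt(2))  # printed with ~50 significant digits at dps = 50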
prepare the wofz_impl entry:
In [9]:
def wofz(x, y):
    z = mpmath.mpc(x, y)
    w = mpmath.exp(-z**2) * mpmath.erfc(z * -1j)
    return w.real, w.imag
wofz_impl['mp'] = np.vectorize(wofz)
In [10]:
from scipy.special import wofz as scipy_wofz
prepare the wofz_impl entry:
In [11]:
def wofz(x, y):
    z = scipy_wofz(x + 1j*y)
    return z.real, z.imag
wofz_impl['scipy'] = wofz
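All registered implementations can then be called through the same interface; a small sketch using the entries defined so far (illustrative input values):
xt, yt = np.atleast_2d(1.5), np.atleast_2d(0.5)
for name, fct in wofz_impl.items():
    wr, wi = fct(xt, yt)
    print(name, wr, wi)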
embedding the file in cython:
(N.B.: our implementation here relies on 2D numpy.ndarray inputs)
In [12]:
%%cython --name cernlib_c
# distutils: sources = ./cernlib_c/ErrorFunctions.c
import numpy as np
cimport numpy as np
cdef extern void cerrf(double in_real, double in_imag, double* out_real, double* out_imag)
cpdef tuple wofz(np.ndarray[double, ndim=2, mode="c"] in_real,
                 np.ndarray[double, ndim=2, mode="c"] in_imag):
    cdef np.ndarray[double, ndim=2, mode="c"] out_real = \
        np.empty_like(in_real)
    cdef np.ndarray[double, ndim=2, mode="c"] out_imag = \
        np.empty_like(in_real)
    cdef int i = 0
    cdef int j = 0
    cdef int m = in_real.shape[0]
    cdef int n = in_real.shape[1]
    for i in xrange(m):
        for j in xrange(n):
            cerrf(in_real[i, j], in_imag[i, j], &out_real[i, j], &out_imag[i, j])
    return (out_real, out_imag)
prepare the wofz_impl entry:
In [13]:
import cernlib_c
wofz_impl['c'] = cernlib_c.wofz
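Since the Cython wrapper only accepts 2D C-contiguous double arrays, scalar or 1D inputs need to be promoted first; a hedged convenience sketch (the helper name is ours, not part of the library):
def wofz_c_any_shape(x, y):
    # promote to 2D C-contiguous float64 before calling the Cython wrapper
    x2 = np.ascontiguousarray(np.atleast_2d(x), dtype=np.float64)
    y2 = np.ascontiguousarray(np.atleast_2d(y), dtype=np.float64)
    wr, wi = cernlib_c.wofz(x2, y2)
    return wr.reshape(np.shape(x)), wi.reshape(np.shape(y))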
check whether PyCUDA is available for the CUDA FADDEEVA version:
In [14]:
i_pycuda = False
try:
from pycuda.autoinit import context
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel
i_pycuda = True
except ImportError as e:
context = None
print ('No PyCUDA available, as per error message:')
print (e.message)
define special GPU Timer:
In [15]:
class GPUTimer(object):
    '''Use time.time() for GPU as timer.clock() doesn't work...'''
    timing_fct = time.time
    def __init__(self):
        self.timing_fct = Timer.timing_fct  # saves one lookup to class
    def __enter__(self):
        self.start = Timer.timing_fct()
        return self
    def __exit__(self, *args):
        context.synchronize()
        self.end = Timer.timing_fct()
        self.interval_s = self.end - self.start
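Kernel launches are asynchronous, so the context.synchronize() in __exit__ is what makes the measured interval meaningful; a usage sketch (only runs if PyCUDA was found, the workload is arbitrary):
if i_pycuda:
    a_gpu = gpuarray.to_gpu(np.ones(1000000, dtype=np.float64))
    with GPUTimer() as t:
        b_gpu = a_gpu * 2.0  # some asynchronous GPU work
    print('GPU work took {0:.6f} s (incl. synchronisation)'.format(t.interval_s))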
prepare the CUDA kernel for the wofz function:
In [16]:
if i_pycuda:
    kernel = ElementwiseKernel(
        'double* in_real, double* in_imag, double* out_real, double* out_imag',
        # 'out_real[i] = in_real[i]; out_imag[i] = in_imag[i]',
        'wofz(in_real[i], in_imag[i], &out_real[i], &out_imag[i]);',
        'wofz_kernel',
        preamble=open('cernlib_cuda/wofz.cu', 'r').read()
    )
prepare the wofz_impl entry:
(N.B.: the function call will include the transfers to and from the GPU!)
In [17]:
if i_pycuda:
    def wofz(x, y):
        in_real = gpuarray.to_gpu(np.atleast_1d(x).astype(np.float64))
        in_imag = gpuarray.to_gpu(np.atleast_1d(y).astype(np.float64))
        out_real = gpuarray.empty(in_real.shape, dtype=np.float64)
        out_imag = gpuarray.empty(in_imag.shape, dtype=np.float64)
        kernel(in_real, in_imag, out_real, out_imag)
        return out_real.get(), out_imag.get()
    wofz_impl['cuda'] = wofz
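To time the kernel alone (excluding the host-device transfers mentioned above), the device arrays can be allocated once and reused; a hedged sketch, not registered in wofz_impl:
if i_pycuda:
    xr_gpu = gpuarray.to_gpu(np.linspace(1e-8, 1e8, 10000))
    xi_gpu = gpuarray.to_gpu(np.linspace(1e-8, 1e8, 10000))
    wr_gpu = gpuarray.empty_like(xr_gpu)
    wi_gpu = gpuarray.empty_like(xr_gpu)
    with GPUTimer() as t:
        kernel(xr_gpu, xi_gpu, wr_gpu, wi_gpu)
    print('kernel only: {0:.3e} s for {1} evaluations'.format(t.interval_s, xr_gpu.size))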
import and numpy-vectorise the f90 version:
(N.B.: You need to have run the Makefile beforehand so that the compiled Fortran shared libraries are available!)
In [19]:
sys.path.append('cernlib_f90')
from wwerf import ccperrfr
wofz_impl['f90'] = np.vectorize(ccperrfr)
In [20]:
%%cython --name cernlib_root_adapted
# distutils: include_dirs = ./cernlib_root_adapted/erfc.h
# distutils: sources = ./cernlib_root_adapted/erfc.c
# distutils: extra_compile_args = -std=c99 -W -Wall -Wextra -pedantic -O3 -ftree-vectorize
import numpy as np
cimport numpy as np
cdef extern void wofz (double in_re, double in_im, double* out_re, double* out_im)
# cdef extern void wofzf(float in_re, float in_im, float* out_re, float* out_im)
cpdef tuple wofz_cython(np.ndarray[double, ndim=2, mode="c"] in_real,
                        np.ndarray[double, ndim=2, mode="c"] in_imag):
    cdef np.ndarray[double, ndim=2, mode="c"] out_real = \
        np.empty_like(in_real)
    cdef np.ndarray[double, ndim=2, mode="c"] out_imag = \
        np.empty_like(in_real)
    cdef int i = 0
    cdef int j = 0
    cdef int m = in_real.shape[0]
    cdef int n = in_real.shape[1]
    for i in xrange(m):
        for j in xrange(n):
            wofz(in_real[i, j], in_imag[i, j], &out_real[i, j], &out_imag[i, j])
    return (out_real, out_imag)
In [21]:
from cernlib_root_adapted import wofz_cython
wofz_impl['c-root-adapt'] = wofz_cython
In [22]:
%%cython --name cernlib_root_adapted_dfastimpl
# distutils: include_dirs = ./cernlib_root_adapted/erfc.h
# distutils: sources = ./cernlib_root_adapted/erfc.c
# distutils: extra_compile_args = -std=c99 -W -Wall -Wextra -pedantic -O3 -ftree-vectorize -DFAST_IMPL
import numpy as np
cimport numpy as np
cdef extern void wofz(double in_re, double in_im, double* out_re, double* out_im)
cpdef tuple wofz_cython(np.ndarray[double, ndim=2, mode="c"] in_real,
                        np.ndarray[double, ndim=2, mode="c"] in_imag):
    cdef np.ndarray[double, ndim=2, mode="c"] out_real = \
        np.empty_like(in_real)
    cdef np.ndarray[double, ndim=2, mode="c"] out_imag = \
        np.empty_like(in_real)
    cdef int i = 0
    cdef int j = 0
    cdef int m = in_real.shape[0]
    cdef int n = in_real.shape[1]
    for i in xrange(m):
        for j in xrange(n):
            wofz(in_real[i, j], in_imag[i, j], &out_real[i, j], &out_imag[i, j])
    return (out_real, out_imag)
In [23]:
from cernlib_root_adapted_dfastimpl import wofz_cython
wofz_impl['c-root-adapt-fast'] = wofz_cython
In [24]:
%%cython --name cernlib_root_adapted_dsincos
# distutils: include_dirs = ./cernlib_root_adapted/erfc.h
# distutils: sources = ./cernlib_root_adapted/erfc.c
# distutils: extra_compile_args = -std=c99 -W -Wall -Wextra -pedantic -O3 -ftree-vectorize -DSINCOS
import numpy as np
cimport numpy as np
cdef extern void wofz(double in_re, double in_im, double* out_re, double* out_im)
cpdef tuple wofz_cython(np.ndarray[double, ndim=2, mode="c"] in_real,
                        np.ndarray[double, ndim=2, mode="c"] in_imag):
    cdef np.ndarray[double, ndim=2, mode="c"] out_real = \
        np.empty_like(in_real)
    cdef np.ndarray[double, ndim=2, mode="c"] out_imag = \
        np.empty_like(in_real)
    cdef int i = 0
    cdef int j = 0
    cdef int m = in_real.shape[0]
    cdef int n = in_real.shape[1]
    for i in xrange(m):
        for j in xrange(n):
            wofz(in_real[i, j], in_imag[i, j], &out_real[i, j], &out_imag[i, j])
    return (out_real, out_imag)
In [25]:
from cernlib_root_adapted_dsincos import wofz_cython
wofz_impl['c-root-adapt-sincos'] = wofz_cython
Laurent implemented the algorithm from www.ccsenet.org/journal/index.php/jmr/article/viewFile/41877/24151 for ROOT:
In [204]:
%%cython --name cernlib_root_extended5
# distutils: include_dirs = ./cernlib_root_extended/fadf.h
# distutils: sources = ./cernlib_root_extended/fadf.c
# distutils: extra_compile_args = -std=c99 -W -Wall -Wextra -pedantic -O3 -ffast-math -lm -ftree-vectorize -flto
ctypedef double complex cnum_t
cdef extern double creal(cnum_t z)
cdef extern double cimag(cnum_t z)
import numpy as np
cimport numpy as np
cdef extern cnum_t fadf(cnum_t z)
cpdef tuple wofz_cython(np.ndarray[double, ndim=2, mode="c"] in_real,
                        np.ndarray[double, ndim=2, mode="c"] in_imag):
    cdef cnum_t zin
    cdef cnum_t z
    cdef np.ndarray[double, ndim=2, mode="c"] out_real = \
        np.empty_like(in_real)
    cdef np.ndarray[double, ndim=2, mode="c"] out_imag = \
        np.empty_like(in_real)
    cdef int i = 0
    cdef int j = 0
    cdef int m = in_real.shape[0]
    cdef int n = in_real.shape[1]
    for i in xrange(m):
        for j in xrange(n):
            zin = in_real[i, j] + 1.0j * in_imag[i, j]
            z = fadf(zin)
            out_real[i, j] = creal(z)
            out_imag[i, j] = cimag(z)
    return (out_real, out_imag)
In [211]:
from cernlib_root_extended5 import wofz_cython
wofz_impl['c-root-extended'] = wofz_cython
In [79]:
from StringIO import StringIO
sixtrack_output = open('./sixtrack_f90_table/sixtrack-3', 'r').read()
sixtrack_output = sixtrack_output.replace('D', 'e')
sixtrack_buffer = StringIO(sixtrack_output)
sixtrack_res = {}
(sixtrack_res['exp_re'], sixtrack_res['exp_im'],
sixtrack_res['inp_re'], sixtrack_res['inp_im'],
sixtrack_res['res_re'], sixtrack_res['res_im']) = \
np.genfromtxt(sixtrack_buffer, unpack=True, dtype=np.float64)
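The replace('D', 'e') is needed because Fortran prints double-precision exponents with a D, which numpy's parser does not recognise; a one-line illustration with a made-up value:
print(float('1.2345D-03'.replace('D', 'e')))  # -> 0.0012345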
define range:
In [26]:
exp_min = -8
exp_max = 8
r = 10**np.linspace(exp_min, exp_max, 101)
x, y = np.meshgrid(r, r)
In [99]:
np.save('input_real.npy', x)
np.save('input_imag.npy', y)
compute the reference values via mpmath:
In [27]:
# takes ~10sec
wr_ref, wi_ref = wofz_impl['mp'](x, y)
In [197]:
cmap_std = matplotlib.colors.ListedColormap(sns.color_palette("RdBu_r", 16))

def plot_error(x, y, z, title, vextr=None, n_orders_div=1):
    fig, ax = plt.subplots(figsize=(8, 6))
    if vextr is not None:
        z = np.clip(a=z, a_min=vextr[0], a_max=vextr[1])
        n_orders = int(np.ceil(np.abs(np.log10(vextr[0]) -
                                      np.log10(vextr[1])))) // n_orders_div  #// 2 + 1
        cmap_idv = matplotlib.colors.ListedColormap(sns.color_palette("RdBu_r", n_orders))
        im = ax.imshow(
            #np.vectorize(mpmath.log10)(z).astype(np.float64),
            z.astype(np.float64),
            origin="bottom", extent=[exp_min, exp_max, exp_min, exp_max], aspect='auto',
            norm=LogNorm(vmin=vextr[0], vmax=vextr[1]),
            cmap=cmap_idv)
    else:
        im = ax.imshow(np.vectorize(mpmath.log10)(z).astype(np.float64),
                       origin="bottom", extent=[exp_min, exp_max, exp_min, exp_max], aspect='auto',
                       cmap=cmap_std)
    # Plot look: axes, title, ticks, ...
    cbar = plt.colorbar(im)
    cbar.set_label('relative error', labelpad=-18, y=1.05, rotation=0)
    ax.set_title(title)
    ax.set_xlabel('real')
    ax.set_ylabel('imaginary')
    ax.set_xticklabels(["$10^{" + str(i) + "}$" for i in range(exp_min, exp_max + 1, 2)])
    ax.set_yticklabels(["$10^{" + str(i) + "}$" for i in range(exp_min, exp_max + 1, 2)])
    ax.grid(True, c='black', ls='--')
    return fig, ax

def get_extr(z):
    return [10**np.floor(np.log10(np.min(z.astype(np.float64)))),
            10**np.ceil(np.log10(np.max(z.astype(np.float64))))]

test = True
if test is True:
    wr, wi = wofz_impl['c'](x, y)
    z = abs(wr - wr_ref) / wr_ref
    #z = abs(wi - wi_ref)/wi_ref
    fig, ax = plot_error(x, y, z, title='c',
                         vextr=get_extr(z))
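get_extr merely widens the minimum and maximum of the error map to the enclosing powers of ten, so that the colour scale starts and ends on full decades; a quick check with made-up values:
print(get_extr(np.array([[3.2e-16, 4.7e-11], [2.0e-9, 8.9e-13]])))  # -> [1e-16, 1e-08]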
In [206]:
real_error = dict()
imag_error = dict()
for implementation, function in wofz_impl.iteritems():
    if implementation == 'mp':
        continue
    wr, wi = function(x, y)
    zr = np.abs(wr - wr_ref)/wr_ref
    zi = np.abs(wi - wi_ref)/wi_ref
    real_error[implementation] = zr
    imag_error[implementation] = zi
adding sixtrack:
In [207]:
# takes ~10sec
sixtrack_wr_ref, sixtrack_wi_ref = wofz_impl['mp'](
sixtrack_res['inp_re'].reshape(x.shape).T,
sixtrack_res['inp_im'].reshape(x.shape).T
)
sixtrack_wr = sixtrack_res['res_re'].reshape(x.shape).T
sixtrack_wi = sixtrack_res['res_im'].reshape(x.shape).T
real_error['sixtrack_f90_table'] = np.abs(sixtrack_wr - sixtrack_wr_ref)/sixtrack_wr_ref
imag_error['sixtrack_f90_table'] = np.abs(sixtrack_wi - sixtrack_wi_ref)/sixtrack_wi_ref
In [208]:
vextr = []
err_min = np.inf
err_max = -np.inf
for implementation in real_error:
    zr = real_error[implementation]
    zi = imag_error[implementation]
    vextr += get_extr(zr) + get_extr(zi)
    err_min = min(vextr + [err_min])
    err_max = max(vextr + [err_max])
In [209]:
for implementation in real_error:
    re_err = real_error[implementation]
    im_err = imag_error[implementation]
    plot_error(x, y, re_err, title=implementation,
               vextr=[err_min, 10**-8])
    plt.suptitle('real error')
    plot_error(x, y, im_err, title=implementation,
               vextr=[err_min, 10**-8])
    plt.suptitle('imaginary error')
In [127]:
import mystyle as ms
ms.mystyle_arial(fontsz=17, dist_tick_lab=6)
In [199]:
def plot_error_ax(ax_re, ax_im, x, y, err_re, err_im, title, vextr, n_orders_div=1):
    n_orders = int(np.ceil(np.abs(np.log10(vextr[0]) -
                                  np.log10(vextr[1])))) // n_orders_div  #// 2 + 1
    im = []
    ax = ax_re
    z = np.clip(a=err_re, a_min=vextr[0], a_max=vextr[1])
    cmap_idv = matplotlib.colors.ListedColormap(sns.color_palette("RdBu_r", n_orders))
    im.append(ax.imshow(
        z.astype(np.float64),
        origin="bottom",
        extent=[exp_min, exp_max, exp_min, exp_max],
        aspect='auto',
        norm=LogNorm(vmin=vextr[0], vmax=vextr[1]),
        cmap=cmap_idv
    ))
    ax.set_title(title + ' real error', y=1.05)
    ax.set_aspect(1)
    ax = ax_im
    z = np.clip(a=err_im, a_min=vextr[0], a_max=vextr[1])
    cmap_idv = matplotlib.colors.ListedColormap(sns.color_palette("RdBu_r", n_orders))
    im.append(ax.imshow(
        z.astype(np.float64),
        origin="bottom",
        extent=[exp_min, exp_max, exp_min, exp_max],
        aspect='auto',
        norm=LogNorm(vmin=vextr[0], vmax=vextr[1]),
        cmap=cmap_idv
    ))
    ax.set_title(title + ' imag. error', y=1.05)
    ax.set_aspect(1)
    # Plot look: axes, title, ticks, ...
    for ax in (ax_re, ax_im):
        ax.set_xlabel('real input')
        ax.set_ylabel('imaginary input')
        ax.set_xticklabels(["$10^{" + str(i) + "}$" for i in range(exp_min, exp_max + 1, 2)])
        ax.set_yticklabels(["$10^{" + str(i) + "}$" for i in range(exp_min, exp_max + 1, 2)])
        ax.grid(True, c='black', ls='--')
    return im
In [210]:
n_impl_per_col = 2
n_rows = 3 * 2
n = 8 # colspan
vextr = (err_min, 10**-8)
n_cols = n_impl_per_col * 2 # * re / im errors
# last addition = spacings, first 1 = colorbar:
grid_shape = (n_rows, 1 + n_cols*n + n_impl_per_col-1)
fig = plt.figure(figsize=(11*n_cols//2, 11*n_rows//4))
fig.patch.set_facecolor('w')
ax_scipy_re = plt.subplot2grid(grid_shape, (0, 0), colspan=n, rowspan=2)
ax_scipy_im = plt.subplot2grid(grid_shape, (0, n), colspan=n, rowspan=2)
ax_f90_re = plt.subplot2grid(grid_shape, (0, 2*n+1), colspan=n, rowspan=2)
ax_f90_im = plt.subplot2grid(grid_shape, (0, 3*n+1), colspan=n, rowspan=2)
ax_c_re = plt.subplot2grid(grid_shape, (2, 0), colspan=n, rowspan=2)
ax_c_im = plt.subplot2grid(grid_shape, (2, n), colspan=n, rowspan=2)
ax_cuda_re = plt.subplot2grid(grid_shape, (2, 2*n+1), colspan=n, rowspan=2)
ax_cuda_im = plt.subplot2grid(grid_shape, (2, 3*n+1), colspan=n, rowspan=2)
ax_root_re = plt.subplot2grid(grid_shape, (4, 0), colspan=n, rowspan=2)
ax_root_im = plt.subplot2grid(grid_shape, (4, n), colspan=n, rowspan=2)
ax_rootex_re = plt.subplot2grid(grid_shape, (4, 2*n+1), colspan=n, rowspan=2)
ax_rootex_im = plt.subplot2grid(grid_shape, (4, 3*n+1), colspan=n, rowspan=2)
im = plot_error_ax(
ax_scipy_re, ax_scipy_im, x, y,
real_error['scipy'], imag_error['scipy'],
'SciPy (MIT):', vextr=vextr)
im = plot_error_ax(
ax_f90_re, ax_f90_im, x, y,
real_error['f90'], imag_error['f90'],
'CERNlib f90:', vextr=vextr)
im = plot_error_ax(
ax_c_re, ax_c_im, x, y,
real_error['c'], imag_error['c'],
'CERNlib C:', vextr=vextr)
im = plot_error_ax(
ax_cuda_re, ax_cuda_im, x, y,
real_error['cuda'], imag_error['cuda'],
'CERNlib CUDA:', vextr=vextr)
im = plot_error_ax(
ax_root_re, ax_root_im, x, y,
real_error['c-root-adapt'], imag_error['c-root-adapt'],
'CERN ROOT:', vextr=vextr)
im = plot_error_ax(
ax_rootex_re, ax_rootex_im, x, y,
real_error['c-root-extended'], imag_error['c-root-extended'],
'CERN ROOT ext:', vextr=vextr)
ax_cbar = plt.subplot2grid(grid_shape, (1, grid_shape[1] - 1), colspan=1, rowspan=n_rows-2)
cbar = plt.colorbar(im[0], cax=ax_cbar)
cbar.set_label('relative error', labelpad=-60, y=1.05, rotation=0)
ticklabs = cbar.ax.get_yticklabels()
ticklabs = [ "$10^{" + str(i) + "}$" for i in range(
int(np.round(np.log10(vextr[0]))),
int(np.round(np.log10(vextr[0]))) + len(ticklabs))]
cbar.ax.set_yticklabels(ticklabs, ha='left', fontsize=22)
cbar.ax.yaxis.set_tick_params(pad=60)
plt.tight_layout()
plt.savefig('accuracies.pdf', bbox_inches='tight')
In [44]:
def plot_timing(x, y, z, title, unify=False):
    fig, ax = plt.subplots(figsize=(8, 6))
    z = np.clip(a=z, a_min=1e-17, a_max=1e0)
    z *= 1e6  # convert from s to us
    im = ax.imshow(
        z.astype(np.float64),
        origin="bottom", extent=[exp_min, exp_max, exp_min, exp_max], aspect='auto',
    )
    # Plot look: axes, title, ticks, ...
    cbar = plt.colorbar(im)
    cbar.set_label('t [us] per call', labelpad=-18, y=1.05, rotation=0)
    ax.set_title(title)
    ax.set_xlabel('real')
    ax.set_ylabel('imaginary')
    ax.set_xticklabels(["$10^{" + str(i) + "}$" for i in range(exp_min, exp_max + 1, 2)])
    ax.set_yticklabels(["$10^{" + str(i) + "}$" for i in range(exp_min, exp_max + 1, 2)])
    return fig, ax
In [65]:
import gc
exp_min = -8
exp_max = 8
r = 10**np.linspace(exp_min, exp_max, 21)
x, y = np.meshgrid(r, r)

def dummy(a, b):
    pass

# def timeit(x,y, function, ones_array, nrep_timing=3, n_entries_array=50, Timer=Timer):
def timeit(x, y, function, ones_array, nrep_timing=10, n_entries_array=5000, Timer=Timer):
    '''Time the implementation on the arrays x,y = x + iy'''
    T = np.zeros_like(x)
    timings = np.zeros(nrep_timing, dtype=np.float32)
    for el in xrange(len(x)):
        for el2 in xrange(len(x[0])):
            xx = ones_array * x[el][el2]
            yy = ones_array * y[el][el2]
            for i in xrange(nrep_timing):
                gcold = gc.isenabled()
                gc.disable()
                try:
                    with Timer() as t:
                        wr, wi = function(xx, yy)
                finally:
                    if gcold:
                        gc.enable()
                timings[i] = t.interval_s
            T[el, el2] = np.median(timings)/n_entries_array
            dummy(wr, wi)  # use wr, wi, otherwise they might get optimized away
    return T

if test is True:
    fn = wofz_impl['c']
    z = timeit(x, y, fn, np.ones((1, timeit.func_defaults[1]), dtype=np.float64))
    plot_timing(x, y, z, 'c')
In [66]:
n_entries_array = 500
timings_avg = OrderedDict()
for implementation, function in wofz_impl.iteritems():
    if implementation != 'mp':
        print (implementation)
        # CUDA should be timed without host-device memory transfers:
        if implementation == 'cuda':
            # z = timeit(gpuarray.to_gpu(x), gpuarray.to_gpu(y), kernel,
            #            gpuarray.empty((1,int(1e8)), dtype=np.float64).fill(1.),
            #            n_entries_array=int(1e8), Timer=GPUTimer).get()
            continue
        else:
            z = timeit(x, y, function, np.ones((1, n_entries_array), dtype=np.float64))
        timings_avg[implementation] = np.mean(z)
        plot_timing(x, y, z, title=implementation)
        # plot_timing(x,y,z, title=implementation)
In [67]:
name_len = max(map(len, wofz_impl))
for impl in sorted(timings_avg, key=timings_avg.__getitem__):
    print (('{impl:' + str(name_len) +
            's}: {avg_time:.3f} us/call').format(
                impl=impl, avg_time=timings_avg[impl]*1e6))
The memory footprint is computed by hand by counting the number of variables and lookup tables. Memory needed by calls to pow, exp etc. is neglected, integers are assumed to take 4 bytes, and the memory for input and output is not counted. Constants are treated as variables of the corresponding size. Variables: all declared variables, even if they might be eliminated by the compiler. Tables: all memory used to store fixed constants, lookup tables, etc. Some values are estimates because they depend on the program flow (if/else branches).
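For illustration of the bookkeeping (purely hypothetical counts, not taken from any of the implementations):
n_double_vars, n_int_vars, n_table_doubles = 12, 3, 100  # hypothetical counts
variables_bytes = n_double_vars * 8 + n_int_vars * 4     # 8 bytes per double, 4 per int
tables_bytes = n_table_doubles * 8
print(variables_bytes, 'bytes of variables,', tables_bytes, 'bytes of tables')  # -> 108, 800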
Cython allows calling the C function in a less involved way, which leads to much less overhead and therefore more realistic C performance. This is also partly because the numpy.ndarray output creation happens on the C side rather than in Python (as it does in the ctypes case).
The ctypes approach relies on the Makefile having been run before this section.
Defining the cython interface:
In [34]:
%%cython --name cernlib_c_cython
# distutils: sources = ./cernlib_c/ErrorFunctions.c
import numpy as np
cimport numpy as np
cdef extern void cerrf(double in_real, double in_imag, double* out_real, double* out_imag)
cpdef tuple wofz(np.ndarray[double, ndim=2, mode="c"] in_real,
                 np.ndarray[double, ndim=2, mode="c"] in_imag):
    cdef np.ndarray[double, ndim=2, mode="c"] out_real = \
        np.empty_like(in_real)
    cdef np.ndarray[double, ndim=2, mode="c"] out_imag = \
        np.empty_like(in_real)
    cdef int i = 0
    cdef int j = 0
    cdef int m = in_real.shape[0]
    cdef int n = in_real.shape[1]
    for i in xrange(m):
        for j in xrange(n):
            cerrf(in_real[i, j], in_imag[i, j], &out_real[i, j], &out_imag[i, j])
    return (out_real, out_imag)
In [35]:
from cernlib_c_cython import wofz as test_cython
Defining the ctypes interface:
In [36]:
import ctypes
from numpy.ctypeslib import ndpointer
np_double_p = ndpointer(dtype=np.float64)
dll = ctypes.cdll.LoadLibrary('cernlib_c/wofz.so')
dll.cerrf.restype = None
dll.cerrf.argtypes = [ctypes.c_double, ctypes.c_double, np_double_p, np_double_p]
def test_ctypes(x, y):
    in_real = ctypes.c_double(x)
    in_imag = ctypes.c_double(y)
    out_real = np.empty(1, dtype=np.float64)
    out_imag = np.empty(1, dtype=np.float64)
    dll.cerrf(in_real, in_imag, out_real, out_imag)
    return out_real[0], out_imag[0]
test_ctypes = np.vectorize(test_ctypes)
For single values, there is already a huge difference (half of which comes from the numpy allocation and ctypes conversion):
In [37]:
a, b = np.atleast_2d(1.), np.atleast_2d(2.)
In [38]:
print ('ctypes implementation:')
t_ctypes = %timeit -o -c test_ctypes(a, b)
print ('\n\ncython implementation:')
t_cython = %timeit -o -c test_cython(a, b)
print ('\n\nratio between ctypes and cython:\n{0:.2f}'.format(
t_ctypes.best / t_cython.best))
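To get a feeling for how much of the single-value gap is due to the per-call numpy allocation and ctypes conversion alone, one can time just those steps; a rough sketch (the overhead of np.vectorize itself is not captured):
def overhead_only(x, y):
    # same per-call allocations/conversions as test_ctypes, but without the C call
    ctypes.c_double(x), ctypes.c_double(y)
    np.empty(1, dtype=np.float64), np.empty(1, dtype=np.float64)
t_overhead = %timeit -o -c overhead_only(1., 2.)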
For large arrays it becomes even more evident:
In [39]:
exp_min = -8
exp_max = 8
r = 10**np.linspace(exp_min, exp_max, 101)
x, y = np.meshgrid(r, r)
In [40]:
print ('ctypes implementation:')
t_ctypes = %timeit -o -c test_ctypes(x, y)
print ('\n\ncython implementation:')
t_cython = %timeit -o -c test_cython(x, y)
print ('\n\nratio between ctypes and cython:\n{0:.2f}'.format(
t_ctypes.best / t_cython.best))
By using more aggressive compiler options for gcc one can try to improve the speed of the cythonised functions:
In [41]:
%%cython --name cernlib_c_cython_aggr
# distutils: sources = ./cernlib_c/ErrorFunctions.c
# distutils: extra_compile_args = -std=c99 -W -Wall -Wextra -pedantic -O3 -ftree-vectorize
import numpy as np
cimport numpy as np
cdef extern void cerrf(double in_real, double in_imag, double* out_real, double* out_imag)
cpdef tuple wofz(np.ndarray[double, ndim=2, mode="c"] in_real,
                 np.ndarray[double, ndim=2, mode="c"] in_imag):
    cdef np.ndarray[double, ndim=2, mode="c"] out_real = \
        np.empty_like(in_real)
    cdef np.ndarray[double, ndim=2, mode="c"] out_imag = \
        np.empty_like(in_real)
    cdef int i = 0
    cdef int j = 0
    cdef int m = in_real.shape[0]
    cdef int n = in_real.shape[1]
    for i in xrange(m):
        for j in xrange(n):
            cerrf(in_real[i, j], in_imag[i, j], &out_real[i, j], &out_imag[i, j])
    return (out_real, out_imag)
In [42]:
from cernlib_c_cython_aggr import wofz as test_cython_aggr
The corresponding timing as in the previous section shows no influence from the more aggressive compiler options:
In [43]:
print ('default cython implementation:')
t_cython_def = %timeit -o -c test_cython(a, b)
print ('\n\ncompiler flag optimised cython implementation:')
t_cython_aggr = %timeit -o -c test_cython_aggr(a, b)
print ('\n\nratio between default and optimised:\n{0:.2f}'.format(
t_cython_def.best / t_cython_aggr.best))
The results are still the same:
In [44]:
res_def = np.array(test_cython(x, y))
res_aggr = np.array(test_cython_aggr(x, y))
print ('Identical arrays? -->', np.all(res_def == res_aggr))
In [45]:
%%cython --name cernlib_c_cython_fastmath
# distutils: sources = ./cernlib_c/ErrorFunctions.c
# distutils: extra_compile_args = -ffast-math
import numpy as np
cimport numpy as np
cdef extern void cerrf(double in_real, double in_imag, double* out_real, double* out_imag)
cpdef tuple wofz(np.ndarray[double, ndim=2, mode="c"] in_real,
                 np.ndarray[double, ndim=2, mode="c"] in_imag):
    cdef np.ndarray[double, ndim=2, mode="c"] out_real = \
        np.empty_like(in_real)
    cdef np.ndarray[double, ndim=2, mode="c"] out_imag = \
        np.empty_like(in_real)
    cdef int i = 0
    cdef int j = 0
    cdef int m = in_real.shape[0]
    cdef int n = in_real.shape[1]
    for i in xrange(m):
        for j in xrange(n):
            cerrf(in_real[i, j], in_imag[i, j], &out_real[i, j], &out_imag[i, j])
    return (out_real, out_imag)
In [46]:
from cernlib_c_cython_fastmath import wofz as test_cython_fastmath
The corresponding timing as in the previous section shows a difference of a few percent for small array lengths:
In [47]:
print ('default cython implementation:')
t_cython_def = %timeit -o -c test_cython(a, b)
print ('\n\nfastmath optimised cython implementation:')
t_cython_fastmath = %timeit -o -c test_cython_fastmath(a, b)
print ('\n\nratio between default and optimised:\n{0:.2f}'.format(
t_cython_def.best / t_cython_fastmath.best))
... which is negligible for large arrays:
In [48]:
print ('default cython implementation:')
t_cython_def = %timeit -o -c test_cython(x, y)
print ('\n\nfastmath optimised cython implementation:')
t_cython_fastmath = %timeit -o -c test_cython_fastmath(x, y)
print ('\n\nratio between default and optimised:\n{0:.2f}'.format(
t_cython_def.best / t_cython_fastmath.best))
-ffast-math introduces small errors, though:
In [49]:
res_def = np.array(test_cython(x, y))
res_fastmath = np.array(test_cython_fastmath(x, y))
print ('Identical arrays? -->', np.all(res_def == res_fastmath))
with the maximal absolute error being
In [50]:
np.max(res_def - res_fastmath)
Out[50]:
In [ ]:
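As a complementary check to the absolute deviation above, one can also look at the relative deviation; a short sketch that skips grid points where the reference value underflows to zero:
with np.errstate(divide='ignore', invalid='ignore'):
    rel = np.abs(res_def - res_fastmath) / np.abs(res_def)
print(np.max(rel[np.isfinite(rel)]))  # maximum relative deviation on the grid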