In [1]:
%matplotlib notebook
#NOTA:
# Manylinux1 wheels have been used on the system, making *numpy* implementation underperforming by ~20%

In [2]:
import sys
print(sys.version)
import platform
print(platform.system())


3.5.3 (default, Jan 19 2017, 14:11:04) 
[GCC 6.3.0 20170118]
Linux

In [3]:
!uname -r
!lscpu
!free


4.16.0-0.bpo.2-amd64
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                16
On-line CPU(s) list:   0-15
Thread(s) per core:    1
Core(s) per socket:    8
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 85
Model name:            Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz
Stepping:              4
CPU MHz:               1200.482
BogoMIPS:              6400.00
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              1024K
L3 cache:              25344K
NUMA node0 CPU(s):     0,2,4,6,8,10,12,14
NUMA node1 CPU(s):     1,3,5,7,9,11,13,15
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single pti intel_ppin mba tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm ida arat pln pts pku ospke
              total        used        free      shared  buff/cache   available
Mem:      196743352    74273772   114975592       14840     7493988   121033496
Swap:       9764860         268     9764592

In [4]:
!pip freeze|grep numpy
!pip freeze|grep numexpr
!pip freeze|grep numba
!pip freeze|grep pythran
!pip freeze|grep Cython


numpy==1.15.1
numpydoc==0.7.0
numexpr==2.6.8
numba==0.40.1
pythran==0.9.0
Cython==0.29

In [5]:
import math
import cmath
import numpy as np
import numba as nb
import numexpr as ne
from matplotlib.pyplot import subplots
from matplotlib.colors import LogNorm
from time import time
H = 0 #Miller index of reflection
K = 4 #Miller index of reflection
N = 100 #Number of units cells per direction
oversampling = 6 #Defines how much points are needed to describe a single Laue fringe (2 = Nyquist frequency)
e0 = 0.1 #Maximum strain at surface
w = 20. #Depth of strain profile below the surface

In [6]:
#Generate real and reciprocal space coordinates
n = np.arange(N)
m = np.arange(N)
h = np.arange(H-0.5, H+0.5, 1./(oversampling*N))
k = np.arange(K-0.5, K+0.5, 1./(oversampling*N))

In [7]:
def Circ_numpy(n, m, h, k, e0, w):
    N = len(n)
    k = k[:, np.newaxis]
    n = n[:, np.newaxis, np.newaxis]
    m = m[:, np.newaxis, np.newaxis, np.newaxis]
    radius = np.sqrt((n - N/2)**2 + (m - N/2)**2)
    support = np.where(radius > N/2, 0, 1)
    strain = e0 * (1 + np.tanh((radius-N/2) / w))
    return np.abs((support * np.exp(2j*np.pi*(h*(n+strain*(n-N/2)) + k*(m+strain*(m-N/2))))).sum(axis=0).sum(axis=0))**2

%time  intensity = Circ_numpy(n, m, h, k, e0, w)


CPU times: user 4min 32s, sys: 1min 46s, total: 6min 18s
Wall time: 6min 19s

In [8]:
ref_intensity = intensity
fig, ax = subplots()
ax.imshow(ref_intensity, extent=(h.min(), h.max(), k.min(), k.max()), norm=LogNorm(), origin = 'lower')
ax.set_xlabel('H')
ax.set_ylabel('K')


Out[8]:
Text(0, 0.5, 'K')

In [9]:
def Circ_hybrid(n, m, h, k, e0, w):
    result = np.zeros((h.size, k.size))
    N = len(n)
    n=np.atleast_2d(n).T
    m=np.atleast_2d(m)
    n_c = (n - N/2)
    m_c = (m - N/2)
    radius = np.sqrt(n_c**2 + m_c**2)
    strain = e0 * (1 + np.tanh((radius-N/2) / w))
    support = np.where(radius > N/2, 0, 1)
    
    n_s = n + strain * (n_c)
    m_s = m + strain * (m_c)
    #loop over the reciprocal space coordinates
    for i_k, v_k in enumerate(k):
        for i_h, v_h in enumerate(h): 
            tmp = (support * np.exp(2j*np.pi*(v_h*n_s + v_k*m_s))).sum()
            result[i_k, i_h] = tmp.real*tmp.real + tmp.imag*tmp.imag
    return result

#Compute and plot
%time  intensity = Circ_hybrid(n, m, h, k, e0, w)
print("Error:", abs(np.array(intensity)-np.array(ref_intensity)).max())


CPU times: user 4min 27s, sys: 10 ms, total: 4min 27s
Wall time: 4min 27s
Error: 1.862645149230957e-09

In [10]:
fig, ax = subplots()
ax.imshow(intensity, extent=(h.min(), h.max(), k.min(), k.max()), norm=LogNorm(), origin = 'lower')
ax.set_xlabel('H')
ax.set_ylabel('K')


Out[10]:
Text(0, 0.5, 'K')

In [11]:
def Circ_numexpr(n, m, h, k, e0, w):
    N = len(n)
    h = h[np.newaxis, :, np.newaxis, np.newaxis]
    k = k[:, np.newaxis, np.newaxis, np.newaxis]
    n = n[np.newaxis, np.newaxis, :, np.newaxis]
    m = m[np.newaxis, np.newaxis, np.newaxis, :]
    radius = ne.evaluate("sqrt((n - N/2)**2 + (m - N/2)**2)")
    strain = ne.evaluate("e0 * (1 + tanh((radius-N/2) / w))")
    j2pi = np.pi*2j
    #loop over the reciprocal space coordinates
    tmp = ne.evaluate("where(radius > N/2, 0, exp(j2pi*(h*(n+strain*(n-N/2)) + k*(m+strain*(m-N/2)))))")
    tmp.shape = k.size, h.size, -1
    result = abs(tmp.sum(axis=-1))**2
    return result

#Compute and plot
%time  intensity = Circ_numexpr(n, m, h, k, e0, w)
print("Error:", abs(np.array(intensity)-np.array(ref_intensity)).max())


CPU times: user 4min 46s, sys: 20 s, total: 5min 6s
Wall time: 51 s
Error: 2.0954757928848267e-09

In [12]:
def Circ_numexpr_hybrid(n, m, h, k, e0, w):
    result = np.zeros((h.size, k.size))
    N = len(n)
    n=np.atleast_2d(n).T
    m=np.atleast_2d(m)
    n_c = (n - N/2)
    m_c = (m - N/2)
    radius = np.sqrt(n_c**2 + m_c**2)
    strain = e0 * (1 + np.tanh((radius-N/2) / w))
    support = np.where(radius > N/2, 0, 1)
    j2pi = np.pi*2j
    n_s = n + strain * (n_c)
    m_s = m + strain * (m_c)
    #loop over the reciprocal space coordinates
    for i_h, v_h in enumerate(h): 
        for i_k, v_k in enumerate(k):
            tmp = ne.evaluate("where(radius > N/2, 0, exp(j2pi*(v_h*n_s + v_k*m_s)))").sum()
            result[i_k, i_h] = tmp.real*tmp.real + tmp.imag*tmp.imag
    return result

#Compute and plot
%time  intensity = Circ_numexpr_hybrid(n, m, h, k, e0, w)
print("Error:", abs(np.array(intensity)-np.array(ref_intensity)).max())


CPU times: user 7min 38s, sys: 21.9 s, total: 8min
Wall time: 2min
Error: 1.862645149230957e-09

In [13]:
@nb.jit(nb.float64[:,:](nb.int64[:],nb.int64[:],nb.float64[:],nb.float64[:],nb.float64, nb.float64), 
        nopython=True, parallel=False, fastmath=False)
def Circ_numba_noparallel_slow(n, m, h, k, e0, w):
    result = np.zeros(len(h)*len(k), dtype=nb.float64)
    N = len(n)
    for i in nb.prange(len(h)*len(k)):
        tmp = 0j
        for i_n in n:
            for i_m in n:
                support = 1.0
                radius = math.sqrt((i_n - N/2)** 2 + (i_m - N/2)** 2)
                if (radius > (N/2)):
                    support = 0.0
                strain = e0 * (1 + math.tanh((radius-N/2)/w))
                tmp += support * cmath.exp(2j*cmath.pi*(h[i%len(h)]*(i_n+strain*(i_n-N/2)) + k[i//len(h)]*(i_m+strain*(i_m-N/2))))
        result[i] = abs(tmp)**2
    return result.reshape((len(k),len(h)))

In [14]:
#Compute and plot
%time  intensity = Circ_numba_noparallel_slow(n, m, h, k, e0, w)
print("Error:", abs(np.array(intensity)-np.array(ref_intensity)).max())


CPU times: user 7min 37s, sys: 2.55 s, total: 7min 40s
Wall time: 7min 37s
Error: 1.979060471057892e-08

In [15]:
@nb.jit(nb.float64[:,:](nb.int64[:],nb.int64[:],nb.float64[:],nb.float64[:],nb.float64, nb.float64), 
        nopython=True, parallel=False, fastmath=True)
def Circ_numba_noparallel_fast(n, m, h, k, e0, w):
    result = np.zeros(len(h)*len(k), dtype=nb.float64)
    N = len(n)
    for i in nb.prange(len(h)*len(k)):
        tmp = 0j
        for i_n in n:
            for i_m in n:
                support = 1.0
                radius = math.sqrt((i_n - N/2)** 2 + (i_m - N/2)** 2)
                if (radius > (N/2)):
                    support = 0.0
                strain = e0 * (1 + math.tanh((radius-N/2)/w))
                tmp += support * cmath.exp(2j*cmath.pi*(h[i%len(h)]*(i_n+strain*(i_n-N/2)) + k[i//len(h)]*(i_m+strain*(i_m-N/2))))
        result[i] = abs(tmp)**2
    return result.reshape((len(k),len(h)))

In [16]:
#Compute and plot
%time  intensity = Circ_numba_noparallel_fast(n, m, h, k, e0, w)
print("Error:", abs(np.array(intensity)-np.array(ref_intensity)).max())


CPU times: user 6min 48s, sys: 2.07 s, total: 6min 50s
Wall time: 6min 48s
Error: 4.959292709827423e-08

In [17]:
@nb.jit(nb.float64[:,:](nb.int64[:],nb.int64[:],nb.float64[:],nb.float64[:],nb.float64, nb.float64), 
        nopython=True, parallel=True, fastmath=False)
def Circ_numba_parallel(n, m, h, k, e0, w):
    result = np.zeros(len(h)*len(k), dtype=nb.float64)
    N = len(n)
    for i in nb.prange(len(h)*len(k)):
        tmp = 0j
        for i_n in n:
            for i_m in n:
                support = 1.0
                radius = math.sqrt((i_n - N/2)** 2 + (i_m - N/2)** 2)
                if (radius > (N/2)):
                    support = 0.0
                strain = e0 * (1 + math.tanh((radius-N/2)/w))
                tmp += support * cmath.exp(2j*cmath.pi*(h[i%len(h)]*(i_n+strain*(i_n-N/2)) + k[i//len(h)]*(i_m+strain*(i_m-N/2))))
        result[i] = abs(tmp)**2
    return result.reshape((len(k),len(h)))

In [18]:
#Compute and plot



#%time  intensity = Circ_numba_parallel(n, m, h, k, e0, w)

# This did not finish in a night ... there is probably a bug in the numba installation.

#print("Error:", abs(np.array(intensity)-np.array(ref_intensity)).max())

In [19]:
%load_ext pythran.magic

In [20]:
%%pythran -fopenmp

#pythran export Circ_pythran(int[] or float[], int[] or float[], float[], float[], float, float or int)
import numpy as np

def Circ_pythran(n, m, h, k, e0, w):
    result = np.zeros((len(k),len(h)), dtype=np.float64)
    N = len(n)
    "omp parallel for"
    for i_h, v_h in enumerate(h): #loop over the reciprocal space coordinates
        for i_k, v_k in enumerate(k):
            tmp = 0j
            for i_n in n:
                for i_m in n:
                    radius = np.sqrt((i_n - N/2.)** 2 + (i_m - N/2.)** 2)
                    if (radius > (N/2.)):
                        continue
                    strain = e0 * (1 + np.tanh((radius-N/2.)/w))
                    tmp += np.exp(2j*np.pi*(v_h*(i_n+strain*(i_n-N/2.)) + v_k*(i_m+strain*(i_m-N/2.))))
            result[i_k, i_h] = abs(tmp)**2
    return result

In [21]:
#Compute and plot
%time  intensity = Circ_pythran(n, m, h, k, e0, w)
print("Error:", abs(np.array(intensity)-np.array(ref_intensity)).max())


CPU times: user 5min 56s, sys: 0 ns, total: 5min 56s
Wall time: 22.7 s
Error: 2.0023435354232788e-08

In [22]:
%load_ext Cython

In [23]:
%%cython -a --compile-args=-fopenmp --link-args=-fopenmp
import numpy as np
from cython.parallel import prange
import cython

from libc.math cimport sqrt, tanh

cdef extern from "complex.h" nogil:
    double complex cexp(double complex)
    
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
def Circ_cython(long[::1] n, 
                long[::1] m, 
                double[::1] h, 
                double[::1] k,
                double e0,
                int w):
    cdef:
        double[:, ::1] result
        double r_part, i_part, v_h, v_k, radius, strain
        double complex tmp, two_j_pi
        int i_h, i_k, i_m, i_n, size_h, size_k, N
        
    two_j_pi = np.pi*2j
    size_h = h.size
    size_k = k.size
    N = n.size
    result = np.zeros((size_k, size_h))
    #loop over the reciprocal space coordinates
    for i_k in prange(size_k, nogil=True):
        v_k = k[i_k]
        for i_h in range(size_h): 
            v_h = h[i_h]
            tmp = 0
            #loop and sum over unit-cells
            for i_n in range(N):
                for i_m in range(N):
                    radius = sqrt((i_n - N/2.)** 2 + (i_m - N/2.)** 2)
                    if (radius > (N/2.)):
                        continue
                    strain = e0 * (1 + tanh((radius-N/2.)/w))
                    tmp = tmp + cexp(two_j_pi*(v_h*(i_n+strain*(i_n-N/2.)) + v_k*(i_m+strain*(i_m-N/2.))))
            r_part = tmp.real
            i_part = tmp.imag
            result[i_k, i_h] += r_part*r_part + i_part*i_part

    return result


Out[23]:
Cython: _cython_magic_5c489999c47c89346f2303b617aeaa37.pyx

Generated by Cython 0.29

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

+01: import numpy as np
  __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
/* … */
  __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 02: from cython.parallel import prange
 03: import cython
 04: 
 05: from libc.math cimport sqrt, tanh
 06: 
 07: cdef extern from "complex.h" nogil:
 08:     double complex cexp(double complex)
 09: 
 10: @cython.wraparound(False)
 11: @cython.boundscheck(False)
 12: @cython.cdivision(True)
+13: def Circ_cython(long[::1] n,
/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_5c489999c47c89346f2303b617aeaa37_1Circ_cython(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
static PyMethodDef __pyx_mdef_46_cython_magic_5c489999c47c89346f2303b617aeaa37_1Circ_cython = {"Circ_cython", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_46_cython_magic_5c489999c47c89346f2303b617aeaa37_1Circ_cython, METH_VARARGS|METH_KEYWORDS, 0};
static PyObject *__pyx_pw_46_cython_magic_5c489999c47c89346f2303b617aeaa37_1Circ_cython(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
  __Pyx_memviewslice __pyx_v_n = { 0, 0, { 0 }, { 0 }, { 0 } };
  CYTHON_UNUSED __Pyx_memviewslice __pyx_v_m = { 0, 0, { 0 }, { 0 }, { 0 } };
  __Pyx_memviewslice __pyx_v_h = { 0, 0, { 0 }, { 0 }, { 0 } };
  __Pyx_memviewslice __pyx_v_k = { 0, 0, { 0 }, { 0 }, { 0 } };
  double __pyx_v_e0;
  int __pyx_v_w;
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("Circ_cython (wrapper)", 0);
  {
    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_n,&__pyx_n_s_m,&__pyx_n_s_h,&__pyx_n_s_k,&__pyx_n_s_e0,&__pyx_n_s_w,0};
    PyObject* values[6] = {0,0,0,0,0,0};
    if (unlikely(__pyx_kwds)) {
      Py_ssize_t kw_args;
      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
      switch (pos_args) {
        case  6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
        CYTHON_FALLTHROUGH;
        case  5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
        CYTHON_FALLTHROUGH;
        case  4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
        CYTHON_FALLTHROUGH;
        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
        CYTHON_FALLTHROUGH;
        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
        CYTHON_FALLTHROUGH;
        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
        CYTHON_FALLTHROUGH;
        case  0: break;
        default: goto __pyx_L5_argtuple_error;
      }
      kw_args = PyDict_Size(__pyx_kwds);
      switch (pos_args) {
        case  0:
        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_n)) != 0)) kw_args--;
        else goto __pyx_L5_argtuple_error;
        CYTHON_FALLTHROUGH;
        case  1:
        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_m)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("Circ_cython", 1, 6, 6, 1); __PYX_ERR(0, 13, __pyx_L3_error)
        }
        CYTHON_FALLTHROUGH;
        case  2:
        if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("Circ_cython", 1, 6, 6, 2); __PYX_ERR(0, 13, __pyx_L3_error)
        }
        CYTHON_FALLTHROUGH;
        case  3:
        if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_k)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("Circ_cython", 1, 6, 6, 3); __PYX_ERR(0, 13, __pyx_L3_error)
        }
        CYTHON_FALLTHROUGH;
        case  4:
        if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_e0)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("Circ_cython", 1, 6, 6, 4); __PYX_ERR(0, 13, __pyx_L3_error)
        }
        CYTHON_FALLTHROUGH;
        case  5:
        if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("Circ_cython", 1, 6, 6, 5); __PYX_ERR(0, 13, __pyx_L3_error)
        }
      }
      if (unlikely(kw_args > 0)) {
        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "Circ_cython") < 0)) __PYX_ERR(0, 13, __pyx_L3_error)
      }
    } else if (PyTuple_GET_SIZE(__pyx_args) != 6) {
      goto __pyx_L5_argtuple_error;
    } else {
      values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
      values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
      values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
      values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
      values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
      values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
    }
    __pyx_v_n = __Pyx_PyObject_to_MemoryviewSlice_dc_long(values[0], PyBUF_WRITABLE); if (unlikely(!__pyx_v_n.memview)) __PYX_ERR(0, 13, __pyx_L3_error)
    __pyx_v_m = __Pyx_PyObject_to_MemoryviewSlice_dc_long(values[1], PyBUF_WRITABLE); if (unlikely(!__pyx_v_m.memview)) __PYX_ERR(0, 14, __pyx_L3_error)
    __pyx_v_h = __Pyx_PyObject_to_MemoryviewSlice_dc_double(values[2], PyBUF_WRITABLE); if (unlikely(!__pyx_v_h.memview)) __PYX_ERR(0, 15, __pyx_L3_error)
    __pyx_v_k = __Pyx_PyObject_to_MemoryviewSlice_dc_double(values[3], PyBUF_WRITABLE); if (unlikely(!__pyx_v_k.memview)) __PYX_ERR(0, 16, __pyx_L3_error)
    __pyx_v_e0 = __pyx_PyFloat_AsDouble(values[4]); if (unlikely((__pyx_v_e0 == (double)-1) && PyErr_Occurred())) __PYX_ERR(0, 17, __pyx_L3_error)
    __pyx_v_w = __Pyx_PyInt_As_int(values[5]); if (unlikely((__pyx_v_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 18, __pyx_L3_error)
  }
  goto __pyx_L4_argument_unpacking_done;
  __pyx_L5_argtuple_error:;
  __Pyx_RaiseArgtupleInvalid("Circ_cython", 1, 6, 6, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 13, __pyx_L3_error)
  __pyx_L3_error:;
  __Pyx_AddTraceback("_cython_magic_5c489999c47c89346f2303b617aeaa37.Circ_cython", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __Pyx_RefNannyFinishContext();
  return NULL;
  __pyx_L4_argument_unpacking_done:;
  __pyx_r = __pyx_pf_46_cython_magic_5c489999c47c89346f2303b617aeaa37_Circ_cython(__pyx_self, __pyx_v_n, __pyx_v_m, __pyx_v_h, __pyx_v_k, __pyx_v_e0, __pyx_v_w);

  /* function exit code */
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_5c489999c47c89346f2303b617aeaa37_Circ_cython(CYTHON_UNUSED PyObject *__pyx_self, __Pyx_memviewslice __pyx_v_n, CYTHON_UNUSED __Pyx_memviewslice __pyx_v_m, __Pyx_memviewslice __pyx_v_h, __Pyx_memviewslice __pyx_v_k, double __pyx_v_e0, int __pyx_v_w) {
  __Pyx_memviewslice __pyx_v_result = { 0, 0, { 0 }, { 0 }, { 0 } };
  double __pyx_v_r_part;
  double __pyx_v_i_part;
  double __pyx_v_v_h;
  double __pyx_v_v_k;
  double __pyx_v_radius;
  double __pyx_v_strain;
  __pyx_t_double_complex __pyx_v_tmp;
  __pyx_t_double_complex __pyx_v_two_j_pi;
  int __pyx_v_i_h;
  int __pyx_v_i_k;
  int __pyx_v_i_m;
  int __pyx_v_i_n;
  int __pyx_v_size_h;
  int __pyx_v_size_k;
  int __pyx_v_N;
  PyObject *__pyx_r = NULL;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("Circ_cython", 0);
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_2);
  __Pyx_XDECREF(__pyx_t_3);
  __Pyx_XDECREF(__pyx_t_6);
  __Pyx_XDECREF(__pyx_t_7);
  __PYX_XDEC_MEMVIEW(&__pyx_t_8, 1);
  __Pyx_AddTraceback("_cython_magic_5c489999c47c89346f2303b617aeaa37.Circ_cython", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  __pyx_L0:;
  __PYX_XDEC_MEMVIEW(&__pyx_v_result, 1);
  __PYX_XDEC_MEMVIEW(&__pyx_v_n, 1);
  __PYX_XDEC_MEMVIEW(&__pyx_v_m, 1);
  __PYX_XDEC_MEMVIEW(&__pyx_v_h, 1);
  __PYX_XDEC_MEMVIEW(&__pyx_v_k, 1);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple__19 = PyTuple_Pack(22, __pyx_n_s_n, __pyx_n_s_m, __pyx_n_s_h, __pyx_n_s_k, __pyx_n_s_e0, __pyx_n_s_w, __pyx_n_s_result, __pyx_n_s_r_part, __pyx_n_s_i_part, __pyx_n_s_v_h, __pyx_n_s_v_k, __pyx_n_s_radius, __pyx_n_s_strain, __pyx_n_s_tmp, __pyx_n_s_two_j_pi, __pyx_n_s_i_h, __pyx_n_s_i_k, __pyx_n_s_i_m, __pyx_n_s_i_n, __pyx_n_s_size_h, __pyx_n_s_size_k, __pyx_n_s_N); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(0, 13, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_tuple__19);
  __Pyx_GIVEREF(__pyx_tuple__19);
/* … */
  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_5c489999c47c89346f2303b617aeaa37_1Circ_cython, NULL, __pyx_n_s_cython_magic_5c489999c47c89346f); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 13, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_Circ_cython, __pyx_t_1) < 0) __PYX_ERR(0, 13, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_codeobj__20 = (PyObject*)__Pyx_PyCode_New(6, 0, 22, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__19, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mntdirect__scisoft_users_kieffe, __pyx_n_s_Circ_cython, 13, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__20)) __PYX_ERR(0, 13, __pyx_L1_error)
 14:                 long[::1] m,
 15:                 double[::1] h,
 16:                 double[::1] k,
 17:                 double e0,
 18:                 int w):
 19:     cdef:
 20:         double[:, ::1] result
 21:         double r_part, i_part, v_h, v_k, radius, strain
 22:         double complex tmp, two_j_pi
 23:         int i_h, i_k, i_m, i_n, size_h, size_k, N
 24: 
+25:     two_j_pi = np.pi*2j
  __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_pi); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 25, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_t_1 = PyComplex_FromDoubles(0.0, 2.0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_3 = PyNumber_Multiply(__pyx_t_2, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 25, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_t_4 = __Pyx_PyComplex_As___pyx_t_double_complex(__pyx_t_3); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 25, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_v_two_j_pi = __pyx_t_4;
+26:     size_h = h.size
  __pyx_t_3 = __pyx_memoryview_fromslice(__pyx_v_h, 1, (PyObject *(*)(char *)) __pyx_memview_get_double, (int (*)(char *, PyObject *)) __pyx_memview_set_double, 0);; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 26, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 26, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 26, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_v_size_h = __pyx_t_5;
+27:     size_k = k.size
  __pyx_t_1 = __pyx_memoryview_fromslice(__pyx_v_k, 1, (PyObject *(*)(char *)) __pyx_memview_get_double, (int (*)(char *, PyObject *)) __pyx_memview_set_double, 0);; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 27, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_size); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 27, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 27, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_v_size_k = __pyx_t_5;
+28:     N = n.size
  __pyx_t_3 = __pyx_memoryview_fromslice(__pyx_v_n, 1, (PyObject *(*)(char *)) __pyx_memview_get_long, (int (*)(char *, PyObject *)) __pyx_memview_set_long, 0);; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 28, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 28, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 28, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_v_N = __pyx_t_5;
+29:     result = np.zeros((size_k, size_h))
  __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 29, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_zeros); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 29, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_size_k); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 29, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __pyx_t_6 = __Pyx_PyInt_From_int(__pyx_v_size_h); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 29, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_6);
  __pyx_t_7 = PyTuple_New(2); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 29, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_7);
  __Pyx_GIVEREF(__pyx_t_3);
  PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_3);
  __Pyx_GIVEREF(__pyx_t_6);
  PyTuple_SET_ITEM(__pyx_t_7, 1, __pyx_t_6);
  __pyx_t_3 = 0;
  __pyx_t_6 = 0;
  __pyx_t_6 = NULL;
  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) {
    __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_2);
    if (likely(__pyx_t_6)) {
      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
      __Pyx_INCREF(__pyx_t_6);
      __Pyx_INCREF(function);
      __Pyx_DECREF_SET(__pyx_t_2, function);
    }
  }
  __pyx_t_1 = (__pyx_t_6) ? __Pyx_PyObject_Call2Args(__pyx_t_2, __pyx_t_6, __pyx_t_7) : __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_7);
  __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0;
  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
  if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 29, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_t_8 = __Pyx_PyObject_to_MemoryviewSlice_d_dc_double(__pyx_t_1, PyBUF_WRITABLE); if (unlikely(!__pyx_t_8.memview)) __PYX_ERR(0, 29, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_v_result = __pyx_t_8;
  __pyx_t_8.memview = NULL;
  __pyx_t_8.data = NULL;
 30:     #loop over the reciprocal space coordinates
+31:     for i_k in prange(size_k, nogil=True):
  {
      #ifdef WITH_THREAD
      PyThreadState *_save;
      Py_UNBLOCK_THREADS
      __Pyx_FastGIL_Remember();
      #endif
      /*try:*/ {
        __pyx_t_5 = __pyx_v_size_k;
        if (1 == 0) abort();
        {
            #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
                #undef likely
                #undef unlikely
                #define likely(x)   (x)
                #define unlikely(x) (x)
            #endif
            __pyx_t_10 = (__pyx_t_5 - 0 + 1 - 1/abs(1)) / 1;
            if (__pyx_t_10 > 0)
            {
                #ifdef _OPENMP
                #pragma omp parallel
                #endif /* _OPENMP */
                {
                    #ifdef _OPENMP
                    #pragma omp for lastprivate(__pyx_v_i_h) firstprivate(__pyx_v_i_k) lastprivate(__pyx_v_i_k) lastprivate(__pyx_v_i_m) lastprivate(__pyx_v_i_n) lastprivate(__pyx_v_i_part) lastprivate(__pyx_v_r_part) lastprivate(__pyx_v_radius) lastprivate(__pyx_v_strain) lastprivate(__pyx_v_tmp) lastprivate(__pyx_v_v_h) lastprivate(__pyx_v_v_k)
                    #endif /* _OPENMP */
                    for (__pyx_t_9 = 0; __pyx_t_9 < __pyx_t_10; __pyx_t_9++){
                        {
                            __pyx_v_i_k = (int)(0 + 1 * __pyx_t_9);
                            /* Initialize private variables to invalid values */
                            __pyx_v_i_h = ((int)0xbad0bad0);
                            __pyx_v_i_m = ((int)0xbad0bad0);
                            __pyx_v_i_n = ((int)0xbad0bad0);
                            __pyx_v_i_part = ((double)__PYX_NAN());
                            __pyx_v_r_part = ((double)__PYX_NAN());
                            __pyx_v_radius = ((double)__PYX_NAN());
                            __pyx_v_strain = ((double)__PYX_NAN());
                            __pyx_v_v_h = ((double)__PYX_NAN());
                            __pyx_v_v_k = ((double)__PYX_NAN());
/* … */
      /*finally:*/ {
        /*normal exit:*/{
          #ifdef WITH_THREAD
          __Pyx_FastGIL_Forget();
          Py_BLOCK_THREADS
          #endif
          goto __pyx_L5;
        }
        __pyx_L5:;
      }
  }
+32:         v_k = k[i_k]
                            __pyx_t_11 = __pyx_v_i_k;
                            __pyx_v_v_k = (*((double *) ( /* dim=0 */ ((char *) (((double *) __pyx_v_k.data) + __pyx_t_11)) )));
+33:         for i_h in range(size_h):
                            __pyx_t_12 = __pyx_v_size_h;
                            __pyx_t_13 = __pyx_t_12;
                            for (__pyx_t_14 = 0; __pyx_t_14 < __pyx_t_13; __pyx_t_14+=1) {
                              __pyx_v_i_h = __pyx_t_14;
+34:             v_h = h[i_h]
                              __pyx_t_15 = __pyx_v_i_h;
                              __pyx_v_v_h = (*((double *) ( /* dim=0 */ ((char *) (((double *) __pyx_v_h.data) + __pyx_t_15)) )));
+35:             tmp = 0
                              __pyx_v_tmp = __pyx_t_double_complex_from_parts(0, 0);
 36:             #loop and sum over unit-cells
+37:             for i_n in range(N):
                              __pyx_t_16 = __pyx_v_N;
                              __pyx_t_17 = __pyx_t_16;
                              for (__pyx_t_18 = 0; __pyx_t_18 < __pyx_t_17; __pyx_t_18+=1) {
                                __pyx_v_i_n = __pyx_t_18;
+38:                 for i_m in range(N):
                                __pyx_t_19 = __pyx_v_N;
                                __pyx_t_20 = __pyx_t_19;
                                for (__pyx_t_21 = 0; __pyx_t_21 < __pyx_t_20; __pyx_t_21+=1) {
                                  __pyx_v_i_m = __pyx_t_21;
+39:                     radius = sqrt((i_n - N/2.)** 2 + (i_m - N/2.)** 2)
                                  __pyx_v_radius = sqrt((pow((__pyx_v_i_n - (((double)__pyx_v_N) / 2.)), 2.0) + pow((__pyx_v_i_m - (((double)__pyx_v_N) / 2.)), 2.0)));
+40:                     if (radius > (N/2.)):
                                  __pyx_t_22 = ((__pyx_v_radius > (((double)__pyx_v_N) / 2.)) != 0);
                                  if (__pyx_t_22) {
/* … */
                                  }
+41:                         continue
                                    goto __pyx_L14_continue;
+42:                     strain = e0 * (1 + tanh((radius-N/2.)/w))
                                  __pyx_v_strain = (__pyx_v_e0 * (1.0 + tanh(((__pyx_v_radius - (((double)__pyx_v_N) / 2.)) / ((double)__pyx_v_w)))));
+43:                     tmp = tmp + cexp(two_j_pi*(v_h*(i_n+strain*(i_n-N/2.)) + v_k*(i_m+strain*(i_m-N/2.))))
                                  __pyx_v_tmp = __Pyx_c_sum_double(__pyx_v_tmp, cexp(__Pyx_c_prod_double(__pyx_v_two_j_pi, __pyx_t_double_complex_from_parts(((__pyx_v_v_h * (__pyx_v_i_n + (__pyx_v_strain * (__pyx_v_i_n - (((double)__pyx_v_N) / 2.))))) + (__pyx_v_v_k * (__pyx_v_i_m + (__pyx_v_strain * (__pyx_v_i_m - (((double)__pyx_v_N) / 2.)))))), 0))));
                                  __pyx_L14_continue:;
                                }
                              }
+44:             r_part = tmp.real
                              __pyx_t_23 = __Pyx_CREAL(__pyx_v_tmp);
                              __pyx_v_r_part = __pyx_t_23;
+45:             i_part = tmp.imag
                              __pyx_t_23 = __Pyx_CIMAG(__pyx_v_tmp);
                              __pyx_v_i_part = __pyx_t_23;
+46:             result[i_k, i_h] += r_part*r_part + i_part*i_part
                              __pyx_t_24 = __pyx_v_i_k;
                              __pyx_t_25 = __pyx_v_i_h;
                              *((double *) ( /* dim=1 */ ((char *) (((double *) ( /* dim=0 */ (__pyx_v_result.data + __pyx_t_24 * __pyx_v_result.strides[0]) )) + __pyx_t_25)) )) += ((__pyx_v_r_part * __pyx_v_r_part) + (__pyx_v_i_part * __pyx_v_i_part));
                            }
                        }
                    }
                }
            }
        }
        #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
            #undef likely
            #undef unlikely
            #define likely(x)   __builtin_expect(!!(x), 1)
            #define unlikely(x) __builtin_expect(!!(x), 0)
        #endif
      }
 47: 
+48:     return result
  __Pyx_XDECREF(__pyx_r);
  __pyx_t_1 = __pyx_memoryview_fromslice(__pyx_v_result, 2, (PyObject *(*)(char *)) __pyx_memview_get_double, (int (*)(char *, PyObject *)) __pyx_memview_set_double, 0);; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 48, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_r = __pyx_t_1;
  __pyx_t_1 = 0;
  goto __pyx_L0;

In [24]:
#Compute and plot
%time  intensity = Circ_cython(n, m, h, k, e0, w)
print("Error:", abs(np.array(intensity)-np.array(ref_intensity)).max())


CPU times: user 6min 4s, sys: 4.79 ms, total: 6min 4s
Wall time: 23.4 s
Error: 2.0023435354232788e-08

In [ ]: