Fortran cython i numpy

Porównanie różnych podejść - do rozwiązywanie równania dyfuzji jawnym algorytmem.

Obliczanie operatora Laplace'a na siatce z f2py, stosując wektorowy kod w f90 działa najszybciej!


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
%load_ext Cython

In [3]:
%%cython
cimport cython
cimport numpy as np

@cython.wraparound(False)
@cython.boundscheck(False)
def cython_diff2d(np.ndarray[double, ndim=2] u,np.ndarray[double, ndim=2] v, double dx2, double dy2, double c):
    cdef unsigned int i, j
    for i in xrange(1,u.shape[0]-1):
        for j in xrange(1, u.shape[1]-1):
            v[i,j] = u[i,j] + c*( (u[i+1, j] + u[i-1, j]-2.0*u[i,j])/dy2 +
                                  (u[i, j+1] + u[i, j-1]-2.0*u[i,j])/dx2 )

In [4]:
def numpy_diff2d(u,v,dx2,dy2,c):
    v[1:-1,1:-1] =u[1:-1,1:-1] + c*((u[2:,1:-1]+u[:-2,1:-1]-2.0*u[1:-1,1:-1])/dy2 + 
                    (u[1:-1,2:] + u[1:-1,:-2]-2.0*u[1:-1,1:-1])/dx2)

In [5]:
def numpy_diff2d_a(u,v,dx2,dy2,c):
    A = (1.0-2.0*(c/dx2+c/dy2))
    v[1:-1,1:-1] =A*u[1:-1,1:-1] + c/dy2*(u[2:,1:-1] + u[:-2,1:-1]) + \
                                   c/dx2*(u[1:-1,2:] + u[1:-1,:-2])

In [6]:
def numpy_diff2d_b(u,v,dx2,dy2,c):
    
    v[1:-1,1:-1] =u[1:-1,1:-1] + c/dx2*(np.diff(u,2,axis=0)[:,1:-1] + np.diff(u,2,axis=1)[1:-1,:])

In [7]:
def calc(N, Niter, func, dx2,dy2,c):
    u = np.zeros([N, N])
    v = np.zeros_like(u)
    u[u.shape[0]//2,u.shape[1]//2] = 1.0/np.sqrt(dx2*dy2)
    for i in range(Niter//2):
        func(u,v,dx2,dy2,c)
        func(v,u,dx2,dy2,c)
    return u

Parametry symulacji


In [8]:
N = 100
dx = 0.1
dy = 0.1
dx2 = dx*dx
dy2 = dy*dy
dt = 0.01
D = 0.1
c = D*dt
print("CLF = ",c/dx2,c/dy2)


CLF =  0.09999999999999998 0.09999999999999998

Walidacja wyników


In [9]:
u = calc(N,125,numpy_diff2d_b,dx2,dy2,c)
plt.imshow(u)


Out[9]:
<matplotlib.image.AxesImage at 0x7f62fb883438>

In [10]:
u = calc(N,125,cython_diff2d,dx2,dy2,c)
plt.imshow(u)


Out[10]:
<matplotlib.image.AxesImage at 0x7f62f8792128>

znamy rozwiązanie równania dyfuzji na nieskończonym obszarze startujące z punktu $$ u(x,0) = \delta (x)$$

$$ u(x,t) = \frac{1}{4 \, \pi D t}e^{\left(-\frac{x^{2} + y^{2}}{4 D t}\right)}$$

In [11]:
Lx,Ly = N/2*dx,N/2*dy
x  = np.linspace(-Lx,Lx,N)
y  = np.linspace(-Ly,Ly,N)

X,Y = np.meshgrid(x,y )
Niter = 125
t = dt*Niter
P = 1/(4*np.pi*D*t)*np.exp(-(X**2+Y**2)/(4*D*t) )
plt.contourf(X,Y,P)
plt.axes().set_aspect('equal')



In [12]:
u = calc(N,Niter,cython_diff2d,dx2,dy2,c)
np.sum(P)*dx2,np.sum(u)*dx2


Out[12]:
(0.98010000000000008, 1.0000000000000002)

In [13]:
plt.plot(X[X.shape[0]//2,:],P[X.shape[0]//2,:],'b')
plt.plot(X[X.shape[0]//2,:],u[X.shape[0]//2,:],'r')


Out[13]:
[<matplotlib.lines.Line2D at 0x7f62f87dcc50>]

Benchmarks


In [14]:
%%time
u = calc(1000,200,cython_diff2d,dx2,dy2,c)


CPU times: user 1.19 s, sys: 0 ns, total: 1.19 s
Wall time: 1.19 s

In [15]:
%%time
u = calc(1000,200,numpy_diff2d,dx2,dy2,c)


CPU times: user 3.44 s, sys: 3.79 s, total: 7.24 s
Wall time: 7.23 s

In [16]:
%%time
u = calc(1000,200,numpy_diff2d_a,dx2,dy2,c)


CPU times: user 2.67 s, sys: 1.05 s, total: 3.72 s
Wall time: 3.72 s

In [17]:
%%time
u = calc(1000,200,numpy_diff2d_b,dx2,dy2,c)


CPU times: user 2.15 s, sys: 1.7 s, total: 3.85 s
Wall time: 3.85 s

In [33]:


In [18]:
N = 1000
fortran_source = """

subroutine fortran_diff2d(u, v, dx2, dy2, c)
real(8), intent(inout) :: u({0}, {1})
real(8), intent(inout) :: v({0}, {1})
real(8), intent(in) :: dx2, dy2, c
v(2:{0}-1,2:{1}-1) = u(2:{0}-1,2:{1}-1)+ c*( (u(3:,2:{1}-1)+u(:{0}-2,2:{1}-1))/dy2 + &
        (u(2:{0}-1,3:) + u(2:{0}-1,:{1}-2))/dx2)
end subroutine 

""".format(N,N)

fp = open("myfile.f90", "w")
fp.write(fortran_source)
fp.close()

In [19]:
!cat myfile.f90



subroutine fortran_diff2d(u, v, dx2, dy2, c)
real(8), intent(inout) :: u(1000, 1000)
real(8), intent(inout) :: v(1000, 1000)
real(8), intent(in) :: dx2, dy2, c
v(2:1000-1,2:1000-1) = u(2:1000-1,2:1000-1)+ c*( (u(3:,2:1000-1)+u(:1000-2,2:1000-1))/dy2 + &
        (u(2:1000-1,3:) + u(2:1000-1,:1000-2))/dx2)
end subroutine 


In [20]:
%%capture f2py.log
!f2py -c -m my_fortran_module myfile.f90

In [21]:
!ls -l


total 916
-rw-r--r-- 1 marcin.kostur staff   8487 Nov  2 15:37 Jacobi.ipynb
-rw-r--r-- 1 marcin.kostur staff 126910 Nov  2 15:33 PR_Diffusion1d_basics.ipynb
-rw-r--r-- 1 marcin.kostur staff  56835 Oct 12 13:18 PR_MPI_Diffusion2d.ipynb
-rw-r--r-- 1 marcin.kostur staff  13580 Nov  2 15:32 PR_MPI_Jupyter.ipynb
-rw-r--r-- 1 marcin.kostur staff   2763 Oct 13 09:19 PR_MPI_Setup_ipython_mpi_profile.ipynb
-rw-r--r-- 1 marcin.kostur staff   7428 Oct 13 14:26 PR_MPI_collective.ipynb
-rw-r--r-- 1 marcin.kostur staff  21641 Nov  2 15:38 PR_MPI_game_of_life.ipynb
-rw-r--r-- 1 marcin.kostur staff  20814 Nov  2 15:40 PR_MPI_p2p.ipynb
-rw-r--r-- 1 marcin.kostur staff  61903 Nov  2 15:39 PR_MPI_scatter_examples.ipynb
-rw-r--r-- 1 marcin.kostur staff 104143 Oct 21 09:24 PR_cython_numpy_fortran_diff2d.ipynb
drwxr-xr-x 2 marcin.kostur staff   4096 Nov  2 14:43 __pycache__
-rwxr-xr-x 1 marcin.kostur staff 107944 Oct 21 09:24 fortran_module00013.cpython-35m-x86_64-linux-gnu.so
-rwxr-xr-x 1 marcin.kostur staff 108264 Oct 21 09:24 fortran_module00014.cpython-35m-x86_64-linux-gnu.so
-rwxr-xr-x 1 marcin.kostur staff 107712 Oct 21 09:24 fortran_module00015.cpython-35m-x86_64-linux-gnu.so
-rw-r--r-- 1 marcin.kostur staff   1898 Nov  2 12:11 game.py
-rw-r--r-- 1 marcin.kostur staff   2567 Nov  2 14:43 game_of_life_utils.py
-rw-r--r-- 1 marcin.kostur staff  11805 Oct 21 12:48 ipyparallel_gpu4.ipynb
-rw-r--r-- 1 marcin.kostur staff  19680 Oct 20 15:18 ipyparallel_mpi_tests.ipynb
-rw-r--r-- 1 marcin.kostur staff    191 Oct 13 08:27 mpi001.py
-rwxr-xr-x 1 marcin.kostur staff  96232 Nov  2 15:41 my_fortran_module.cpython-35m-x86_64-linux-gnu.so
-rw-r--r-- 1 marcin.kostur staff    325 Nov  2 15:41 myfile.f90
-rw-r--r-- 1 marcin.kostur staff   2848 Nov  2 15:39 slice_object_2d.ipynb
-rw-r--r-- 1 marcin.kostur staff   1895 Nov  2 12:12 tet001.py

In [56]:
from my_fortran_module import fortran_diff2d

In [59]:
def calcF(N, Niter, func, dx2,dy2,c):
    u = np.zeros([N, N],order='F')
    v = np.zeros_like(u)
    u[u.shape[0]//2,u.shape[1]//2] = 1.0/np.sqrt(dx2*dy2)
    for i in range(Niter//2):
        func(u,v,dx2,dy2,c)
        func(v,u,dx2,dy2,c)
    return u

In [60]:
%%time
u = calcF(1000,200,fortran_diff2d,dx2,dy2,c)


CPU times: user 544 ms, sys: 20 ms, total: 564 ms
Wall time: 561 ms

Najszybsza wersja - wektorowy Fortran

Warto zauważyć, że:

  • nie zawsze jest taka duża różnica miedzy numpy/Cython/Fortran, sprawdź dla np. mapowania funkcji na tablicy wartości
  • numpy jest kilka razy wolniejsze bo przy każdym działaniu stosuje wartości pośrednie
  • istnieje możliwośc obejścia z pomocą modułu numexpr, który kompiluje całe wyrażenie na tablicy numpy, sprawdź!
  • wektorowy Fortran najefektywniej kompiluje wyrażenia typu $u_i+u_{i+1}$

Wniosek - warto stosować Fortran do pewnych operacji!

Automatyczna generacja kodu

Problem: kompilowane moduły nie mogą być ponownie ładowane.

Przeanalizujmy prostą metodę generacji kodu


In [22]:
import subprocess
subprocess.check_output(["pwd"])


Out[22]:
b'/home/users/marcin.kostur/ProgramowanieRownolegle/MPI\n'

In [23]:
import subprocess
import importlib

counter = 12

In [24]:
def prepare_fortran_module(N=100):
    
    global counter
    fortran_source = """

subroutine fortran_diff2d(u, v, dx2, dy2, c)
real(8), intent(in) :: u({0}, {1})
real(8), intent(inout) :: v({0}, {1})
real(8), intent(in) :: dx2, dy2, c
v(2:{0}-1,2:{1}-1) = u(2:{0}-1,2:{1}-1)+ c*( (u(3:,2:{1}-1)+u(:{0}-2,2:{1}-1))/dy2 + &
        (u(2:{0}-1,3:) + u(2:{0}-1,:{1}-2))/dx2)
end subroutine 

subroutine fortran_diff2d_a(u, dx2, dy2, c)
real(8), intent(inout) :: u({0}, {1})
real(8), intent(in) :: dx2, dy2, c
u(2:{0}-1,2:{1}-1) = u(2:{0}-1,2:{1}-1)+ c*( (u(3:,2:{1}-1)+u(:{0}-2,2:{1}-1))/dy2 + &
        (u(2:{0}-1,3:) + u(2:{0}-1,:{1}-2))/dx2)
end subroutine 

""".format(N,N)

    fp = open("myfile.f90", "w")
    fp.write(fortran_source)
    fp.close()
    counter=counter+1
    try:
        output = subprocess.check_output(["f2py", "-c","-m", "fortran_module%05d"%counter, "myfile.f90"])
        m = importlib.import_module("fortran_module%05d"%counter)
    except:
        print ("problem z kompilacja!")
        return output    
    
    return m

In [64]:
fortran_module = prepare_fortran_module(N=1000)

In [65]:
fortran_diff2d, fortran_diff2d_a = fortran_module.fortran_diff2d, fortran_module.fortran_diff2d_a

In [66]:
def calcF(N, Niter, func, dx2,dy2,c):
    u = np.zeros([N, N],order='F')
    v = np.zeros_like(u)
    u[u.shape[0]//2,u.shape[1]//2] = 1.0/np.sqrt(dx2*dy2)
    for i in range(Niter//2):
        func(u,v,dx2,dy2,c)
        func(v,u,dx2,dy2,c)
    return u

teraz możemy skompilować wielokrotnie ten sam kod i załadować modluł


In [67]:
N = 1000

fortran_module = prepare_fortran_module(N=N)
fortran_diff2d, fortran_diff2d_a = fortran_module.fortran_diff2d, fortran_module.fortran_diff2d_a

u = calcF(N,1225,fortran_diff2d,dx2,dy2,c)
plt.imshow(u)


Out[67]:
<matplotlib.image.AxesImage at 0x7fe6bc57e5f8>

In [71]:
def calcF_a(N, Niter, func, dx2,dy2,c):
    u = np.zeros([N, N],order='F')
    v = np.zeros_like(u)
    u[u.shape[0]//2,u.shape[1]//2] = 1.0/np.sqrt(dx2*dy2)
    for i in range(Niter):
        func(u,dx2,dy2,c)
    return u

In [72]:
u = calcF_a(1000,1225,fortran_diff2d_a,dx2,dy2,c)
plt.imshow(u)


Out[72]:
<matplotlib.image.AxesImage at 0x7fe65e352278>

In [73]:
%%time
u = calc(1000,200,numpy_diff2d_a,dx2,dy2,c)


CPU times: user 2.53 s, sys: 132 ms, total: 2.66 s
Wall time: 2.54 s

In [74]:
%%time
u = calc(1000,200,cython_diff2d,dx2,dy2,c)


CPU times: user 1.08 s, sys: 8 ms, total: 1.09 s
Wall time: 1.09 s

In [75]:
fortran_diff2d = prepare_fortran_module(N=1000).fortran_diff2d

In [76]:
%%time
u = calcF(1000,200,fortran_diff2d,dx2,dy2,c)


CPU times: user 508 ms, sys: 0 ns, total: 508 ms
Wall time: 507 ms

In [77]:
%%time
u = calcF_a(1000,200,fortran_diff2d_a,dx2,dy2,c)


CPU times: user 672 ms, sys: 0 ns, total: 672 ms
Wall time: 670 ms

In [53]: