In [ ]:
%matplotlib inline

In [ ]:
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt

In [ ]:
from sempy.tensormesh import HexCubePoisson
from sempy.maps import LinearIsopMap
from sempy.topology import CubicTopology
from sempy.poisson import PoissonProblem

In [ ]:
import pyopencl as cl
import pyopencl.array
import loopy as lp

In [ ]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

In [ ]:
mnorm = lambda x: np.max(np.abs(x))
def kron3(A, B, C):
    return sps.kron(A, sps.kron(B, C))

In [ ]:
N  = 8
NN = N

Ex = 17
Ey = Ex
Ez = Ex

periodic = True

Set Up Poisson Problem


In [ ]:
lmap = LinearIsopMap()

topo = CubicTopology(N, (Ex, Ey, Ez),
                    periodic=periodic)
topo.build()

etn = topo.elem_to_vertex
Q, etd = topo.Q, topo.elem_to_dof
R = topo.R
boundary_dofs = topo.boundary_dofs

vertex_ref = topo.get_vertex_ref()

vertex_phys = vertex_ref.copy()
vertex_phys[:,0] *= 1
vertex_phys[:,1] *= 1
vertex_phys[:,2] *= 1

shift = 0.5*0
chi, eta, zeta = vertex_ref.T+shift
sx = sy = sz = 0.1
vp = vertex_phys
sin3  = np.sin(np.pi*chi)*np.sin(np.pi*eta)*np.sin(np.pi*zeta)
vp[:,0] = chi +sx*sin3
vp[:,1] = eta +sy*sin3
vp[:,2] = zeta+sz*sin3

In [ ]:
poisson = PoissonProblem(topo, lmap)
poisson.build(vertex_phys)

Loo.py apply_A


In [ ]:
p = poisson

x0 = np.random.rand(p.n_dofs)
x  = Q.dot(x0)
x  = x.reshape((p.n_elem, -1))
x_d = cl.array.to_device(queue, x)

G11_d = cl.array.to_device(queue, p.G11)
G12_d = cl.array.to_device(queue, p.G12)
G13_d = cl.array.to_device(queue, p.G13)

G21_d = cl.array.to_device(queue, p.G21)
G22_d = cl.array.to_device(queue, p.G22)
G23_d = cl.array.to_device(queue, p.G23)

G31_d = cl.array.to_device(queue, p.G31)
G32_d = cl.array.to_device(queue, p.G32)
G33_d = cl.array.to_device(queue, p.G33)

D_d   = cl.array.to_device(queue, p.D.copy())

In [ ]:
knl = lp.make_kernel(
        ["{ [i,j,k]: 0<=i,j,k<n }",
         "{ [ii,jj,kk]: 0<=ii,jj,kk<n }",
         "{ [a1,a2,a3]: 0<=a1,a2,a3<n }",
         "{ [b1,b2,b3]: 0<=b1,b2,b3<n }",
         "{ [m,mm]:     0<=m,mm<N }",
         "{ [ielem]:    0<=ielem<n_elem}"],
        """
        # Compute initial tensor products
        D1[i*n*n+j*n+k]=sum(a1, D[k,a1]*A[ielem,i*n*n+j*n+a1])
        D2[i*n*n+j*n+k]=sum(a2, D[j,a2]*A[ielem,i*n*n+a2*n+k])
        D3[i*n*n+j*n+k]=sum(a3, D[i,a3]*A[ielem,a3*n*n+j*n+k])
        
        # Multiply by diagonal factors
        tmp1[m] = G11[ielem,m]*D1[m]+G12[ielem,m]*D2[m]+G13[ielem,m]*D3[m] {id=t1}
        tmp2[m] = G21[ielem,m]*D1[m]+G22[ielem,m]*D2[m]+G23[ielem,m]*D3[m] {id=t2}
        tmp3[m] = G31[ielem,m]*D1[m]+G32[ielem,m]*D2[m]+G33[ielem,m]*D3[m] {id=t3}
        
        # Apply Transposed tensor products
        DT1[ii*n*n+jj*n+kk]=sum(b1, D[b1,kk]*tmp1[ii*n*n+jj*n+b1]) {dep=t1}
        DT2[ii*n*n+jj*n+kk]=sum(b2, D[b2,jj]*tmp2[ii*n*n+b2*n+kk]) {dep=t2}
        DT3[ii*n*n+jj*n+kk]=sum(b3, D[b3,ii]*tmp3[b3*n*n+jj*n+kk]) {dep=t3}
        
        # Sum into out
        out[ielem,mm] = DT1[mm]+DT2[mm]+DT3[mm]
        
        """,
        [lp.GlobalArg("A",  np.double, shape="n_elem,N"),
         lp.GlobalArg("D",  np.double, "n,n"),
         lp.GlobalArg("G11",  np.double, "n_elem,N"),
         lp.GlobalArg("G12",  np.double, "n_elem,N"),
         lp.GlobalArg("G13",  np.double, "n_elem,N"),
         lp.GlobalArg("G21",  np.double, "n_elem,N"),
         lp.GlobalArg("G22",  np.double, "n_elem,N"),
         lp.GlobalArg("G23",  np.double, "n_elem,N"),
         lp.GlobalArg("G31",  np.double, "n_elem,N"),
         lp.GlobalArg("G32",  np.double, "n_elem,N"),
         lp.GlobalArg("G33",  np.double, "n_elem,N"),
         lp.GlobalArg("D1", np.double, "N"),
         lp.GlobalArg("D2", np.double, "N"),
         lp.GlobalArg("D3", np.double, "N"),
         lp.GlobalArg("DT1", np.double, "N"),
         lp.GlobalArg("DT2", np.double, "N"),
         lp.GlobalArg("DT3", np.double, "N"),
         lp.GlobalArg("tmp1", np.double, "N"),
         lp.GlobalArg("tmp2", np.double, "N"),
         lp.GlobalArg("tmp3", np.double, "N"),
         lp.GlobalArg("out",  np.double, "n_elem,N"),
         lp.ValueArg("n", np.int, ),
         lp.ValueArg("N", np.int),
         lp.ValueArg("n_elem", np.int)])

knl = lp.prioritize_loops(knl, "ielem,k,j,i")
#knl = lp.set_temporary_scope(knl, "tmp1,tmp2,tmp3", "local")

#print lp.generate_code_v2(knl).device_code()

In [ ]:
n = NN+1
N = n**3

b_str = """
        # Compute initial tensor products
        D1[i*n*n+j*n+k]=sum(a1, D[k,a1]*A[ielem,i*n*n+j*n+a1])
        D2[i*n*n+j*n+k]=sum(a2, D[j,a2]*A[ielem,i*n*n+a2*n+k])
        D3[i*n*n+j*n+k]=sum(a3, D[i,a3]*A[ielem,a3*n*n+j*n+k])
        
        # Multiply by diagonal factors
        tmp1[m] = G11[ielem,m]*D1[m]+G12[ielem,m]*D2[m]+G13[ielem,m]*D3[m] {id=t1}
        tmp2[m] = G21[ielem,m]*D1[m]+G22[ielem,m]*D2[m]+G23[ielem,m]*D3[m] {id=t2}
        tmp3[m] = G31[ielem,m]*D1[m]+G32[ielem,m]*D2[m]+G33[ielem,m]*D3[m] {id=t3}
        
        # Apply Transposed tensor products
        DT1[ii*n*n+jj*n+kk]=sum(b1, D[b1,kk]*tmp1[ii*n*n+jj*n+b1]) {dep=t1}
        DT2[ii*n*n+jj*n+kk]=sum(b2, D[b2,jj]*tmp2[ii*n*n+b2*n+kk]) {dep=t2}
        DT3[ii*n*n+jj*n+kk]=sum(b3, D[b3,ii]*tmp3[b3*n*n+jj*n+kk]) {dep=t3}
        
        # Sum into out
        out[ielem,mm] = DT1[mm]+DT2[mm]+DT3[mm]
        
        """
b_str = b_str.replace("n", str(n))
knl = lp.make_kernel(
        ["{ [i,j,k]: 0<=i,j,k<%i }"%n,
         "{ [ii,jj,kk]: 0<=ii,jj,kk<%i }"%n,
         "{ [a1,a2,a3]: 0<=a1,a2,a3<%i }"%n,
         "{ [b1,b2,b3]: 0<=b1,b2,b3<%i }"%n,
         "{ [m,mm]:     0<=m,mm<%i }"%N,
         "{ [ielem]:    0<=ielem<n_elem}"],
        b_str
        ,
        [lp.GlobalArg("A",  np.double, shape="n_elem,%i"%N),
         lp.GlobalArg("D",  np.double, "%i,%i"%(n,n)),
         lp.GlobalArg("G11",  np.double, "n_elem,%i"%N),
         lp.GlobalArg("G12",  np.double, "n_elem,%i"%N),
         lp.GlobalArg("G13",  np.double, "n_elem,%i"%N),
         lp.GlobalArg("G21",  np.double, "n_elem,%i"%N),
         lp.GlobalArg("G22",  np.double, "n_elem,%i"%N),
         lp.GlobalArg("G23",  np.double, "n_elem,%i"%N),
         lp.GlobalArg("G31",  np.double, "n_elem,%i"%N),
         lp.GlobalArg("G32",  np.double, "n_elem,%i"%N),
         lp.GlobalArg("G33",  np.double, "n_elem,%i"%N),
         lp.TemporaryVariable("D1", np.double, "%i"%N),
         lp.TemporaryVariable("D2", np.double, "%i"%N),
         lp.TemporaryVariable("D3", np.double, "%i"%N),
         lp.TemporaryVariable("DT1", np.double, "%i"%N),
         lp.TemporaryVariable("DT2", np.double, "%i"%N),
         lp.TemporaryVariable("DT3", np.double, "%i"%N),
         lp.TemporaryVariable("tmp1", np.double, "%i"%N),
         lp.TemporaryVariable("tmp2", np.double, "%i"%N),
         lp.TemporaryVariable("tmp3", np.double, "%i"%N),
         lp.GlobalArg("out",  np.double, "n_elem,%i"%N),
         lp.ValueArg("n_elem", np.int)])

knl = lp.prioritize_loops(knl, "ielem,k,j,i")
knl = lp.tag_inames(knl, dict(ielem='g.0'))

print lp.generate_code_v2(knl).device_code()

In [ ]:
evt, res = knl(queue, A=x_d, D=D_d, 
               G11=G11_d, G12=G12_d, G13=G13_d,
               G21=G21_d, G22=G22_d, G23=G23_d,
               G31=G31_d, G32=G32_d, G33=G33_d)
evt.wait()
out = res[-1].get().ravel()
out = Q.T.dot(out)

In [ ]:
mnorm(p.apply_A(x0.ravel())-out)

In [ ]:
%timeit knl(queue, A=x_d, D=D_d, \
            G11=G11_d, G12=G12_d, G13=G13_d,\
            G21=G21_d, G22=G22_d, G23=G23_d,\
            G31=G31_d, G32=G32_d, G33=G33_d)[0].wait()

In [ ]:
%timeit p.apply_A(x0.ravel())

In [ ]: