In [ ]:
%matplotlib inline

In [ ]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
import loopy.transform.iname

In [ ]:
tol = 1e-10
mnorm = lambda x: np.max(np.abs(x))

In [ ]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

In [ ]:
n = 16*10**3
# x_d = cl.clrandom.rand(queue, n, dtype=np.double)
# A_d = cl.clrandom.rand(queue, (n, n), dtype=np.double)

x_d = cl.array.arange(queue, n, dtype=np.double)/n
A_d = cl.array.arange(queue, n*n, dtype=np.double).reshape((n,n))/(n*n)
x = x_d.get()
A = A_d.get()
b = A.dot(x)

Tiled Kernel


In [ ]:
from loopy.transform.data import reduction_arg_to_subst_rule

In [ ]:
knl = lp.make_kernel(
        "{ [i,j]: 0<=i,j<n }",
        "out[i] = sum(j, A[i,j]*x[j])",
         assumptions="n>=0")

In [ ]:
# Andreas's transform

tknl = knl
tile_size = 16
tknl = lp.split_iname(tknl, "i", tile_size, 
                      outer_tag="g.0",
                      inner_tag="ilp.seq")
tknl = lp.split_iname(tknl, "j", tile_size)

# Slight issue: It won't  let you pick an order between i_inner and j_inner.
# https://github.com/inducer/loopy/issues/82

tknl = lp.add_and_infer_dtypes(tknl, {"A,x": np.float64})
print(lp.generate_code_v2(tknl).device_code())

In [ ]:
evt, (b_d,) = tknl(queue, A=A_d, x=x_d); evt.wait()
mnorm(b-b_d.get())

In [ ]:
%timeit evt, _ = tknl(queue, A=A_d, x=x_d, n=n); evt.wait()

In [ ]:
%timeit A.dot(x)

In [ ]:
# # Scott's old transform
# tile_size = 16
# knl = lp.split_iname(knl, "j", tile_size)

# knl = lp.split_reduction_inward(knl, "j_inner", within="iname:i")
# #knl = lp.split_reduction_inward(knl, "j_inner")
# knl = lp.split_iname(knl, "i", tile_size)
# knl = reduction_arg_to_subst_rule(knl, "j_outer")
# knl = lp.precompute(knl, "red_j_outer_arg", "j_outer,i_inner",
#                     temporary_scope=lp.temp_var_scope.PRIVATE)
# knl = lp.realize_reduction(knl)
# knl = lp.rename_iname(knl, "red_j_outer_arg_i_inner", "i_inner_")
# knl = lp.set_loop_priority(knl, "j_outer_0,i_inner_")

# knl = lp.assume(knl, "n mod %i = 0"%(tile_size,))

# print(knl)

# knl_tile = knl

In [ ]:
# cgr = lp.generate_code_v2(knl)
# print(cgr.device_code())
# evt, (b_d,) = knl_tile(queue, A=A_d, x=x_d); evt.wait()

In [ ]: