In [ ]:
%matplotlib inline
In [ ]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
import loopy.transform.iname
In [ ]:
tol = 1e-10
mnorm = lambda x: np.max(np.abs(x))
In [ ]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
In [ ]:
n = 16*10**3
# x_d = cl.clrandom.rand(queue, n, dtype=np.double)
# A_d = cl.clrandom.rand(queue, (n, n), dtype=np.double)
x_d = cl.array.arange(queue, n, dtype=np.double)/n
A_d = cl.array.arange(queue, n*n, dtype=np.double).reshape((n,n))/(n*n)
x = x_d.get()
A = A_d.get()
b = A.dot(x)
In [ ]:
from loopy.transform.data import reduction_arg_to_subst_rule
In [ ]:
knl = lp.make_kernel(
"{ [i,j]: 0<=i,j<n }",
"out[i] = sum(j, A[i,j]*x[j])",
assumptions="n>=0")
In [ ]:
# Andreas's transform
tknl = knl
tile_size = 16
tknl = lp.split_iname(tknl, "i", tile_size,
outer_tag="g.0",
inner_tag="ilp.seq")
tknl = lp.split_iname(tknl, "j", tile_size)
# Slight issue: It won't let you pick an order between i_inner and j_inner.
# https://github.com/inducer/loopy/issues/82
tknl = lp.add_and_infer_dtypes(tknl, {"A,x": np.float64})
print(lp.generate_code_v2(tknl).device_code())
In [ ]:
evt, (b_d,) = tknl(queue, A=A_d, x=x_d); evt.wait()
mnorm(b-b_d.get())
In [ ]:
%timeit evt, _ = tknl(queue, A=A_d, x=x_d, n=n); evt.wait()
In [ ]:
%timeit A.dot(x)
In [ ]:
# # Scott's old transform
# tile_size = 16
# knl = lp.split_iname(knl, "j", tile_size)
# knl = lp.split_reduction_inward(knl, "j_inner", within="iname:i")
# #knl = lp.split_reduction_inward(knl, "j_inner")
# knl = lp.split_iname(knl, "i", tile_size)
# knl = reduction_arg_to_subst_rule(knl, "j_outer")
# knl = lp.precompute(knl, "red_j_outer_arg", "j_outer,i_inner",
# temporary_scope=lp.temp_var_scope.PRIVATE)
# knl = lp.realize_reduction(knl)
# knl = lp.rename_iname(knl, "red_j_outer_arg_i_inner", "i_inner_")
# knl = lp.set_loop_priority(knl, "j_outer_0,i_inner_")
# knl = lp.assume(knl, "n mod %i = 0"%(tile_size,))
# print(knl)
# knl_tile = knl
In [ ]:
# cgr = lp.generate_code_v2(knl)
# print(cgr.device_code())
# evt, (b_d,) = knl_tile(queue, A=A_d, x=x_d); evt.wait()
In [ ]: