In [ ]:
using OpenCL
In [ ]:
# OpenCL C source for the `sum` kernel, kept as a plain string so it can be handed to
# clCreateProgramWithSource. Each work-item takes its global id along dimension 0 and,
# if that id is below `count`, writes c[gid] = a[gid] + b[gid]; ids at or beyond `count`
# do nothing.
test_source = "
__kernel void sum(__global const float *a,
__global const float *b,
__global float *c,
const unsigned int count)
{
int gid = get_global_id(0);
if (gid < count) {
c[gid] = a[gid] + b[gid];
}
}";
In [1]:
# Run everything on the first device the OpenCL runtime reports.
device = first(cl.devices())

# Number of elements in every host array and device buffer used by this script.
# NOTE(review): this global is named `length`, which shadows `Base.length` in this
# session; renaming it would need a coordinated change across the whole script.
length = 1024

# Host arrays that feed the kernels; they are filled with random values below.
h_a = Array(cl.CL_float, length)
h_b = Array(cl.CL_float, length)
h_e = Array(cl.CL_float, length)
h_g = Array(cl.CL_float, length)

# Host array that receives the final result read back from the device.
h_f = Array(cl.CL_float, length)

# NOTE(review): h_c and h_d are allocated but not referenced again in the visible script.
h_c = Array(cl.CL_float, length)
h_d = Array(cl.CL_float, length)

# Fill the four inputs element by element with uniform random single-precision values.
for idx in 1:length
    h_a[idx] = cl.cl_float(rand())
    h_b[idx] = cl.cl_float(rand())
    h_e[idx] = cl.cl_float(rand())
    h_g[idx] = cl.cl_float(rand())
end
# One-element array used as the status out-parameter of the cl.api.clCreate* calls;
# each call overwrites slot 1 with its result code.
err_code = Array(cl.CL_int, 1)

# Create a compute context holding only the selected device.
# TODO: context creation fails if the callback function pointers are not passed.
ctx_id = cl.api.clCreateContext(C_NULL, 1, [device.id],
                                cl.ctx_callback_ptr, cl.raise_context_error, err_code)
err_code[1] == cl.CL_SUCCESS || error("Failed to create context")

# Create a command queue for that device; the 0 requests no queue properties.
q_id = cl.api.clCreateCommandQueue(ctx_id, device.id, 0, err_code)
err_code[1] == cl.CL_SUCCESS || error("Failed to create command queue")
# Wrap the kernel source held in `test_source` in a program object (one source string,
# no explicit string lengths).
bytesource = bytestring(test_source)
prg_id = cl.api.clCreateProgramWithSource(ctx_id, 1, [bytesource], C_NULL, err_code)
err_code[1] == cl.CL_SUCCESS || error("Failed to create program")

# Build the program; no device list, build options, or notification callback are given.
# Unlike the clCreate* calls, the status comes back as the return value.
err = cl.api.clBuildProgram(prg_id, 0, C_NULL, C_NULL, C_NULL, C_NULL)
err == cl.CL_SUCCESS || error("Failed to build program")

# Look up the `sum` entry point in the built program.
k_id = cl.api.clCreateKernel(prg_id, "sum", err_code)
err_code[1] == cl.CL_SUCCESS || error("Failed to create compute kernel")
# Create the input arrays in device memory, each initialized from its host array
# (CL_MEM_COPY_HOST_PTR). All four are only ever bound to the kernel's `const` input
# parameters, so they are created CL_MEM_READ_ONLY. E and G were previously created
# CL_MEM_WRITE_ONLY, and reading a write-only buffer inside a kernel is undefined.
Aid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                            sizeof(cl.CL_float) * length, h_a, err_code)
err_code[1] == cl.CL_SUCCESS || error("Error creating buffer A")

Bid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                            sizeof(cl.CL_float) * length, h_b, err_code)
err_code[1] == cl.CL_SUCCESS || error("Error creating buffer B")

Eid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                            sizeof(cl.CL_float) * length, h_e, err_code)
err_code[1] == cl.CL_SUCCESS || error("Error creating buffer E")

Gid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                            sizeof(cl.CL_float) * length, h_g, err_code)
err_code[1] == cl.CL_SUCCESS || error("Error creating buffer G")

# Create the output arrays in device memory (no host data attached).
# C and D are written by one kernel launch and read by the next, hence READ_WRITE.
Cid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_WRITE,
                            sizeof(cl.CL_float) * length, C_NULL, err_code)
err_code[1] == cl.CL_SUCCESS || error("Error creating buffer C")

Did = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_READ_WRITE,
                            sizeof(cl.CL_float) * length, C_NULL, err_code)
err_code[1] == cl.CL_SUCCESS || error("Error creating buffer D")

# F is only written by a kernel; the host reads it back afterwards.
Fid = cl.api.clCreateBuffer(ctx_id, cl.CL_MEM_WRITE_ONLY,
                            sizeof(cl.CL_float) * length, C_NULL, err_code)
err_code[1] == cl.CL_SUCCESS || error("Error creating buffer F")
# Launch 1: C = A + B.
# Argument 3 (the element count) is set only here; launches 2 and 3 do not set it again
# and rely on the value bound in this step.
err = cl.api.clSetKernelArg(k_id, 0, sizeof(cl.CL_mem), [Aid]) |
      cl.api.clSetKernelArg(k_id, 1, sizeof(cl.CL_mem), [Bid]) |
      cl.api.clSetKernelArg(k_id, 2, sizeof(cl.CL_mem), [Cid]) |
      cl.api.clSetKernelArg(k_id, 3, sizeof(cl.CL_uint), cl.CL_uint[length])
err == cl.CL_SUCCESS || error("Error setting kernel 1 args")

# One-dimensional global work size: one work-item per element. The local work size is
# passed as C_NULL in every launch below.
nglobal = Csize_t[length]
err = cl.api.clEnqueueNDRangeKernel(q_id, k_id, 1, C_NULL,
                                    nglobal, C_NULL, 0, C_NULL, C_NULL)
err == cl.CL_SUCCESS || error("Failed to execute kernel 1")

# Launch 2: D = E + C.
err = cl.api.clSetKernelArg(k_id, 0, sizeof(cl.CL_mem), [Eid]) |
      cl.api.clSetKernelArg(k_id, 1, sizeof(cl.CL_mem), [Cid]) |
      cl.api.clSetKernelArg(k_id, 2, sizeof(cl.CL_mem), [Did])
err == cl.CL_SUCCESS || error("Error setting kernel 2 args")

err = cl.api.clEnqueueNDRangeKernel(q_id, k_id, 1, C_NULL,
                                    nglobal, C_NULL, 0, C_NULL, C_NULL)
err == cl.CL_SUCCESS || error("Failed to execute kernel 2")

# Launch 3: F = G + D.
err = cl.api.clSetKernelArg(k_id, 0, sizeof(cl.CL_mem), [Gid]) |
      cl.api.clSetKernelArg(k_id, 1, sizeof(cl.CL_mem), [Did]) |
      cl.api.clSetKernelArg(k_id, 2, sizeof(cl.CL_mem), [Fid])
err == cl.CL_SUCCESS || error("Error setting kernel 3 args")

err = cl.api.clEnqueueNDRangeKernel(q_id, k_id, 1, C_NULL,
                                    nglobal, C_NULL, 0, C_NULL, C_NULL)
err == cl.CL_SUCCESS || error("Failed to execute kernel 3")
# Read the final result (buffer F) back into h_f. The read is blocking (CL_TRUE), so
# the call only returns once the copy has finished or failed.
err = cl.api.clEnqueueReadBuffer(q_id, Fid, cl.CL_TRUE, 0,
                                 sizeof(cl.CL_float) * length, h_f, 0, C_NULL, C_NULL)

# Release every OpenCL object this script created. Previously nothing was released, so
# each run of the cell leaked a context, queue, program, kernel and seven buffers.
# This is done before the status check below so the objects are freed even when the
# read-back or the verification fails. Return codes are deliberately not checked:
# nothing useful can be done about a failed release at teardown.
for buf in [Aid, Bid, Cid, Did, Eid, Fid, Gid]
    cl.api.clReleaseMemObject(buf)
end
cl.api.clReleaseKernel(k_id)
cl.api.clReleaseProgram(prg_id)
cl.api.clReleaseCommandQueue(q_id)
cl.api.clReleaseContext(ctx_id)

if err != cl.CL_SUCCESS
    error("Failed to read output array")
end

# Verify on the host: every element of h_f should approximately equal the sum of the
# four input arrays at the same index.
ncorrect = 0
for i in 1:length
    expected = h_a[i] + h_b[i] + h_e[i] + h_g[i]
    if isapprox(expected, h_f[i])
        ncorrect += 1
    end
end

if ncorrect == length
    info("Success!")
else
    error("Results are incorrect!")
end
In [ ]: