In [ ]:
using OpenCL
In [1]:
using PyPlot
In [1]:
BLOCK_SIZE = 16;
In [1]:
naive_transpose = "
__kernel void transpose(__global float *a_t,
                        __global float *a,
                        unsigned a_width,
                        unsigned a_height)
{
    int read_idx  = get_global_id(0) + get_global_id(1) * a_width;
    int write_idx = get_global_id(1) + get_global_id(0) * a_height;

    a_t[write_idx] = a[read_idx];
}";
In [1]:
block_transpose = "
#define BLOCK_SIZE $(BLOCK_SIZE)
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1)))
void transpose(__global float *a_t,
               __global float *a,
               unsigned a_width,
               unsigned a_height,
               __local float *a_local)
{
    int base_idx_a   = get_group_id(0) * BLOCK_SIZE + get_group_id(1) * (BLOCK_SIZE * a_width);
    int base_idx_a_t = get_group_id(1) * BLOCK_SIZE + get_group_id(0) * (BLOCK_SIZE * a_height);

    int glob_idx_a   = base_idx_a   + get_local_id(0) + a_width  * get_local_id(1);
    int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1);

    a_local[get_local_id(1) * BLOCK_SIZE + get_local_id(0)] = a[glob_idx_a];

    barrier(CLK_LOCAL_MEM_FENCE);

    a_t[glob_idx_a_t] = a_local[get_local_id(0) * BLOCK_SIZE + get_local_id(1)];
}";
In [2]:
function enqueue_naive_kernel{T}(queue::cl.CmdQueue,
                                 k::cl.Kernel,
                                 dst::cl.Buffer{T},
                                 src::cl.Buffer{T},
                                 dims::Dims)
    h, w = dims
    @assert w % BLOCK_SIZE == 0
    @assert h % BLOCK_SIZE == 0
    cl.set_args!(k, dst, src, uint32(h), uint32(w))
    cl.enqueue_kernel(queue, k, (h, w), (BLOCK_SIZE, BLOCK_SIZE))
end
Out[2]:
In [3]:
function enqueue_block_kernel{T}(queue::cl.CmdQueue,
                                 k::cl.Kernel,
                                 dst::cl.Buffer{T},
                                 src::cl.Buffer{T},
                                 dims::Dims)
    h, w = dims
    @assert w % BLOCK_SIZE == 0
    @assert h % BLOCK_SIZE == 0
    lmem = cl.LocalMem(Float32, BLOCK_SIZE * (BLOCK_SIZE + 1))
    cl.set_args!(k, dst, src, uint32(h), uint32(w), lmem)
    cl.enqueue_kernel(queue, k, (h, w), (BLOCK_SIZE, BLOCK_SIZE))
end
Out[3]:
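Before benchmarking it is easy to check that a kernel really produces a transpose. The function below is a sketch that is not part of the benchmark: it assumes a context and queue plus one of the kernel/enqueue pairs above, uses OpenCL.jl's cl.read to copy the destination buffer back to the host, and compares against Julia's transpose (s must be a multiple of BLOCK_SIZE to satisfy the asserts in the enqueue helpers):

function check_transpose(ctx, queue, kern, method, s)
    src     = rand(Float32, (s, s))
    a_buf   = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=src)
    a_t_buf = cl.Buffer(Float32, ctx, :w, length(src))
    method(queue, kern, a_t_buf, a_buf, size(src))
    result = reshape(cl.read(queue, a_t_buf), size(src))
    @assert result == src'   # a pure copy, so the match should be exact
end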
In [7]:
function benchmark_transpose()
    gpu_device = first(cl.devices(:gpu))
    ctx = cl.Context(gpu_device)
    for dev in cl.devices(ctx)
        @assert dev[:local_mem_size] > 0
    end

    prg = cl.Program(ctx, source=naive_transpose) |> cl.build!
    naive_kern = cl.Kernel(prg, "transpose")

    prg = cl.Program(ctx, source=block_transpose) |> cl.build!
    block_kern = cl.Kernel(prg, "transpose")

    queue = cl.CmdQueue(ctx, :profile)

    array_sizes = [int((2^i) / 32) * 32 for i in 6:12]
    mem_bandwidths = Float32[]

    for (name, method, kern) in (("naive", enqueue_naive_kernel, naive_kern),
                                 ("block", enqueue_block_kernel, block_kern))
        for s in array_sizes
            src = rand(Float32, (s, s))
            a_buf   = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=src)
            a_t_buf = cl.Buffer(Float32, ctx, :w, length(src))

            # warm up...
            for i in 1:4
                method(queue, kern, a_t_buf, a_buf, size(src))
            end

            # profile...
            count = 20
            events = cl.Event[]
            for i in 1:count
                push!(events, method(queue, kern, a_t_buf, a_buf, size(src)))
            end
            # the queue is in-order, so waiting on the last event is enough
            cl.wait(events[end])

            time = sum([evt[:profile_duration] for evt in events])
            mem_bw = 2 * sizeof(src) * count / (time * 1e-9)
            push!(mem_bandwidths, mem_bw)
            @printf("benchmarking %s, array size: %d^2, %.3f GB/s\n", name, s, mem_bw / 1e9)
        end
    end

    # plot results, sleep a little to prevent overlap with show..
    sleep(1)
    clf()
    n_sizes = length(array_sizes)
    naive_bandwidths = mem_bandwidths[1:n_sizes]
    block_bandwidths = mem_bandwidths[n_sizes+1:end]
    plot(array_sizes, naive_bandwidths/1e9, "o-", label="naive")
    plot(array_sizes, block_bandwidths/1e9, "o-", label="block")
    title("Transpose Tests for $(gpu_device[:name])")
    xlabel("Matrix width/height N")
    ylabel("Memory Bandwidth [GB/s]")
    legend(loc="best")
    grid(true)
    show()
end
Out[7]:
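The figure of merit is effective memory bandwidth: each transpose reads the matrix once and writes it once (the factor of 2), and the OpenCL profiling counters behind :profile_duration are reported in nanoseconds (the 1e-9 when converting the summed durations to seconds). As a rough sanity check of the scale, for the largest size in array_sizes:

# bytes moved by one 4096 x 4096 Float32 transpose (one read + one write)
bytes_per_transpose = 2 * 4096^2 * sizeof(Float32)    # 134_217_728 bytes, ~134 MB
# mem_bw = bytes_per_transpose * count / (total_ns * 1e-9)   in bytes/s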
In [8]:
benchmark_transpose()
In [ ]: