In [1]:
versioninfo()
In [2]:
using CUDAdrv, CUDAnative
using Test
In [3]:
# Element-wise vector addition kernel: writes c[i] = a[i] + b[i] for the one
# element owned by this thread. The arguments arrive on the device as
# (implicit) CuDeviceArrays, and the thread/block intrinsics come from
# CUDAnative.
function kernel_vadd(a, b, c)
    # Global 1-based index of this thread across the whole grid.
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    c[idx] = a[idx] + b[idx]
    return nothing
end
Out[3]:
In [4]:
# Grab the first CUDA device and create a driver context on it; the context
# must stay alive while device memory and kernels are in use.
dev = CuDevice(0)
ctx = CuContext(dev)
Out[4]:
In [5]:
# Problem size and random host-side input vectors.
len = 512
a = rand(Int, len)
b = rand(Int, len)
Out[5]:
In [6]:
# Upload the inputs to the GPU and allocate an (uninitialized) device buffer
# of the same shape/eltype for the result.
d_a = CuArray(a)
d_b = CuArray(b)
d_c = similar(d_a)
Out[6]:
In [7]:
# Reference result computed on the CPU, used to check the GPU output below.
cpu_vec_add = a + b
Out[7]:
In [8]:
# Launch the kernel with a single block of `len` threads, so each thread
# handles exactly one element pair.
@cuda threads=(len) kernel_vadd(d_a, d_b, d_c) # from CUDAnative.jl
# Copy the result back to the host and compare against the CPU reference.
gpu_vec_add = Array(d_c)
@test gpu_vec_add == cpu_vec_add
Out[8]:
In [9]:
# Release the driver context now that we are done with the device.
destroy!(ctx)
CUDAnative.jl takes care of all things related to native GPU programming:
1) interfacing with Julia: repurpose the compiler to emit GPU-compatible LLVM IR (no calls to CPU libraries, simplified exceptions, …)
2) interfacing with LLVM (using LLVM.jl): optimize the IR, and compile to PTX
3) interfacing with CUDA (using CUDAdrv.jl): compile PTX to SASS, and upload it to the GPU
These functionalities are hidden behind the call to @cuda.
Intrinsics: special functions and macros that provide functionality hard or impossible to express using normal functions. For example, the {thread,block,grid}{Idx,Dim} functions provide access to the size and index of each level of work.
Creating local shared memory: @cuStaticSharedMem and @cuDynamicSharedMem macros
Display formatted string from within a kernel function: @cuprintf
In [10]:
using Test
using CUDAdrv, CUDAnative
In [11]:
#
# Main implementation
#
# Reduce a value across a warp
# Reduce a value across a warp using shuffle-down exchanges; `op` must be
# associative. After the loop the combined value is held by the lower lanes
# (in particular lane 1).
@inline function reduce_warp(op::F, val::T)::T where {F<:Function,T}
    # Halve the shuffle distance each step: warpsize/2, warpsize/4, ..., 1.
    # TODO: this can be unrolled if warpsize is known...
    delta = CUDAnative.warpsize() ÷ 2
    while delta > 0
        other = shfl_down(val, delta)
        val = op(val, other)
        delta ÷= 2
    end
    return val
end
Out[11]:
In [12]:
# Reduce a value across a block, using shared memory for communication
# between warps. Assumes `op` is associative and that `zero(T)` is a neutral
# element for it (true for `+`; see the TODO in `reduce_grid`).
@inline function reduce_block(op::F, val::T)::T where {F<:Function,T}
# shared mem for 32 partial sums
# (32 slots are enough for one partial result per warp on hardware with a
# warp size of 32 and up to 1024 threads per block — TODO confirm for other
# configurations)
shared = @cuStaticSharedMem(T, 32)
# wid = 1-based warp index within the block, lane = 1-based lane within warp
wid, lane = fldmod1(threadIdx().x, CUDAnative.warpsize())
# each warp performs partial reduction
val = reduce_warp(op, val)
# write reduced value to shared memory
# (only the first lane of each warp holds that warp's combined value)
if lane == 1
@inbounds shared[wid] = val
end
# wait for all partial reductions
sync_threads()
# read from shared memory only if that warp existed
# NOTE(review): `fld` drops a trailing partial warp when blockDim().x is not
# a multiple of the warp size; the launches in this file use 512/1024
# threads, so this is fine here — verify before reusing with other sizes.
@inbounds val = (threadIdx().x <= fld(blockDim().x, CUDAnative.warpsize())) ? shared[lane] : zero(T)
# final reduce within first warp
if wid == 1
val = reduce_warp(op, val)
end
return val
end
Out[12]:
In [13]:
# Reduce an array across a complete grid
# Reduce an array across a complete grid, writing one partial result per
# block into `output` (block b's result lands in output[b]).
function reduce_grid(op::F, input::CuDeviceVector{T}, output::CuDeviceVector{T},
                     len::Integer) where {F<:Function,T}
    # TODO: the neutral element depends on the operator (see Base's 2- and
    # 3-argument `reduce`); zero(T) is only correct for `+`-like operators.
    acc = zero(T)

    # Each thread accumulates several elements, striding by the total number
    # of threads in the grid (grid-stride loop).
    # TODO: step range (see JuliaGPU/CUDAnative.jl#12)
    stride = blockDim().x * gridDim().x
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    while idx <= len
        @inbounds acc = op(acc, input[idx])
        idx += stride
    end

    # Combine the per-thread values within this block, then have the first
    # thread publish the block's result.
    acc = reduce_block(op, acc)
    if threadIdx().x == 1
        @inbounds output[blockIdx().x] = acc
    end

    return
end
Out[13]:
In [14]:
"""
Reduce a large array.
Kepler-specific implementation, ie. you need sm_30 or higher to run this code.
"""
function gpu_reduce(op::Function, input::CuVector{T}, output::CuVector{T}) where {T}
len = length(input)
# TODO: these values are hardware-dependent, with recent GPUs supporting more threads
threads = 512
blocks = min((len + threads - 1) ÷ threads, 1024)
# the output array must have a size equal to or larger than the number of thread blocks
# in the grid because each block writes to a unique location within the array.
if length(output) < blocks
throw(ArgumentError("output array too small, should be at least $blocks elements"))
end
@cuda blocks=blocks threads=threads reduce_grid(op, input, output, len)
@cuda threads=1024 reduce_grid(op, output, output, blocks)
end
Out[14]:
In [15]:
# The reduction kernels rely on warp shuffle, which needs compute capability
# 3.0 (Kepler) or newer; bail out gracefully on older hardware.
if capability(device()) < v"3.0"
@warn("this example requires a newer GPU")
exit(0)
end
In [16]:
# Ten million Int32 ones as input, so the reduction result is easy to check.
len = 10^7
input = ones(Int32, len)
Out[16]:
In [17]:
# Reference result computed on the CPU.
cpu_val = reduce(+, input)
Out[17]:
In [18]:
# Re-acquire the first CUDA device and a fresh driver context (the earlier
# one was destroyed above).
dev = CuDevice(0)
ctx = CuContext(dev)
Out[18]:
In [19]:
# Upload the input and allocate a same-sized device buffer for per-block
# partials, then run the GPU reduction; the result ends up in gpu_output[1].
gpu_input = CuArray(input)
gpu_output = similar(gpu_input)
gpu_reduce(+, gpu_input, gpu_output)
In [20]:
# The reduction leaves its final value in the first element.
Array(gpu_output)[1]
Out[20]:
In [21]:
# Verify the GPU reduction against the CPU reference; the `let` block keeps
# the temporaries local.
let
    gpu_input = CuArray(input)
    gpu_output = similar(gpu_input)
    gpu_reduce(+, gpu_input, gpu_output)
    gpu_val = Array(gpu_output)[1]
    # A single Test.@test suffices: the previous duplicate `@assert` checked
    # the identical condition, and `@assert` may be disabled at higher
    # optimization levels, so it added no reliable coverage.
    Test.@test cpu_val == gpu_val
end
Out[21]:
In [22]:
# Tear down the CUDA context now that the example is finished.
destroy!(ctx)