# NOTE(review): This file is a Jupyter/IJulia notebook transcript — the "In [n]:"
# and "Out[n]:" lines are cell markers, not Julia code, so the file is not runnable
# as-is; the Julia statements between markers form the actual demo script.
#
# Purpose: end-to-end demo of launching a CUDA vector-add kernel from Julia:
# query the driver/device, compile kernels/vadd.cu to PTX with nvcc, load the PTX
# module, copy two host arrays to the GPU, launch the kernel, fetch the result,
# and tear everything down.
#
# NOTE(review): the API used here (list_devices, create_context, CuModule on a
# .ptx path, launch, to_host, free, unload, destroy) is the legacy pre-1.0
# CUDA.jl / CUDAdrv-era driver API, long removed from modern CUDA.jl — confirm
# the intended package (and Julia 0.x) version before attempting to run this.
In [1]:
using CUDA
In [2]:
# Print the version of the installed NVIDIA driver's CUDA support.
println("CUDA driver version: $(CUDA.DriverVersion)")
In [3]:
# Enumerate all CUDA-capable devices visible to the driver.
println("Devices")
list_devices()
println()
In [4]:
# Select device ordinal 0 (the first GPU).
dev = CuDevice(0)
# create context
# A driver-API context must be current before any memory/module/launch calls;
# it is explicitly destroyed at the end of the script.
ctx = create_context(dev)
Out[4]:
In [22]:
# device API functions
# Query total device memory (bytes -> GB) and the compute capability.
println("Device memory (GB): $(totalmem(dev)/1e9)")
println("Device capability: $(capability(dev))")
In [5]:
# compile kernel
# Show the CUDA C source for reference, then compile it to PTX with nvcc.
# NOTE(review): nvcc writes vadd.ptx into the current working directory, which
# is where it is loaded from below — the two paths must stay in sync.
run(`cat kernels/vadd.cu`)
run(`nvcc -ptx kernels/vadd.cu`)
In [6]:
# Load the freshly generated PTX module into the current context.
println("load module from vadd.ptx")
md = CuModule("vadd.ptx")
Out[6]:
In [7]:
# Look up the "vadd" kernel entry point inside the module.
println("get function vadd")
f = CuFunction(md, "vadd")
Out[7]:
In [8]:
# Problem size: a 3x4 matrix, addressed by the kernel as len = 12 flat elements.
siz = (3, 4)
len = prod(siz)
Out[8]:
In [9]:
# load array a to GPU
# NOTE(review): scalar-style round/* on whole arrays — valid on the Julia 0.x
# this transcript targets; modern Julia would require round.(rand(...) .* 100).
# Values are random integers in [0, 100] stored as Float32, so the vadd result
# is exactly representable and easy to check by eye.
a = round(rand(Float32, siz) * 100)
ga = CuArray(a)
# load array b to GPU
b = round(rand(Float32, siz) * 100)
gb = CuArray(b)
# create array c on GPU
# Uninitialized device buffer for the kernel's output (same shape/eltype).
gc = CuArray(Float32, siz)
Out[9]:
In [10]:
# launch kernel
# NOTE(review): per the legacy launch(fn, griddim, blockdim, args) signature
# this runs `len` blocks of 1 thread each, one element per block — confirm the
# argument order against the kernel's indexing before reuse.
launch(f, len, 1, (ga, gb, gc))
In [11]:
# fetch results from GPU
# Copy the device result back into a host Array.
c = to_host(gc)
Out[11]:
In [12]:
# free GPU memory
# Release the three device buffers; they must not be used after this point.
free(ga)
free(gb)
free(gc)
Out[12]:
In [13]:
# Display inputs and the computed sum; expect c == a + b elementwise.
println("Results:")
println("a = \n$a")
println("b = \n$b")
println("c = \n$c")
In [14]:
# unload module - this is sensitive, will crash kernel if unload twice
unload(md)
# destroy context
# Tear down in reverse order of creation: module first, then the context.
destroy(ctx)