GPU Matrix Addition

This notebook demonstrates the API functions exposed by CUDA.jl (a Julia interface to the CUDA driver API) by adding two matrices on the GPU.

Julia v0.3.11


In [1]:
using CUDA


CUDA Driver Initialized

In [2]:
# report the driver version constant exposed by the CUDA module
println("CUDA driver version: ", CUDA.DriverVersion)


CUDA driver version: 7000

In [3]:
# print a header, then let list_devices() report the CUDA devices it finds
println("Devices")
list_devices()
# blank line to separate the listing from whatever is printed next
println()


Devices
device[0]: GeForce GTX 970, capability 5.2, total mem = 4095 MB


In [4]:
# select the device with ordinal 0 (device[0] in the listing above)
dev = CuDevice(0)
# create a context on that device; it is destroyed again in the final cell
ctx = create_context(dev)


create context
Out[4]:
CuContext(Ptr{Void} @0x0000000008102ab0)

In [22]:
# device API functions: report total memory (totalmem scaled by 1e9)
# and the compute capability of the selected device
println("Device memory (GB): ", totalmem(dev) / 1e9)
println("Device capability: ", capability(dev))


Device memory (GB): 4.2942464
Device capability: CuCapability(5,2)

In [5]:
# compile kernel
# first show the kernel source so the reader can see what is being compiled
run(`cat kernels/vadd.cu`)
# compile the kernel to PTX; the module-loading cell expects "vadd.ptx" in the
# working directory, which is presumably nvcc's default output location here
# -- TODO confirm
run(`nvcc -ptx kernels/vadd.cu`)


// filename: vadd.cu
// simple CUDA kernel to add 2 vectors

// extern "C" keeps the symbol name unmangled so it can be looked up as "vadd"
extern "C"
{
  // Element-wise addition: each thread computes one element, c[i] = a[i] + b[i].
  // There is no length argument and no bounds check, so the launch must not
  // create more threads than there are elements in the arrays.
  __global__ void vadd(const float *a, const float *b, float *c)
  {
    // global index of this thread across the 1-D grid
    int i= threadIdx.x + blockIdx.x * blockDim.x;
    c[i]=a[i]+b[i];
  }
}

In [6]:
println("load module from vadd.ptx")
# load the PTX file produced by the nvcc step as a module
md = CuModule("vadd.ptx")


load module from vadd.ptx
Out[6]:
CuModule(Ptr{Void} @0x000000000a4c5970)

In [7]:
println("get function vadd")
# look up the kernel in the loaded module by the name it has in vadd.cu
f = CuFunction(md, "vadd")


get function vadd
Out[7]:
CuFunction(Ptr{Void} @0x000000000b0adc70)

In [8]:
# matrix dimensions: 3 rows by 4 columns
siz = (3, 4)
# total number of elements; passed to launch() as the launch size
len = prod(siz)


Out[8]:
12

In [9]:
# host-side inputs: random Float32 matrices rounded to whole numbers in 0..100
a = round(rand(Float32, siz) * 100)
b = round(rand(Float32, siz) * 100)
# copy both inputs to the GPU
ga = CuArray(a)
gb = CuArray(b)
# allocate the device array that will receive the result
gc = CuArray(Float32, siz)


Out[9]:
CuArray{Float32,2}(CuPtr(0x0000000503e40400),(3,4),12)

In [10]:
# launch kernel
# the two arguments after f are presumably the grid size (len blocks) and the
# block size (1 thread per block), i.e. one thread per matrix element
# -- TODO confirm against the CUDA.jl launch signature
launch(f, len, 1, (ga, gb, gc))

In [11]:
# fetch results from GPU
# copy the result into a host Array before the device arrays are freed
c = to_host(gc)


Out[11]:
3x4 Array{Float32,2}:
 107.0   91.0  57.0   31.0
  49.0  100.0  86.0  121.0
 120.0   88.0  98.0   68.0

In [12]:
# free GPU memory
# the host arrays a, b and c are unaffected and are still printed afterwards
free(ga)
free(gb)
free(gc)


Out[12]:
CuPtr(0x0000000000000000)

In [13]:
# show the two inputs and the sum fetched back from the GPU
println("Results:")
println("a = \n", a)
println("b = \n", b)
println("c = \n", c)


Results:
a = 
Float32[7.0 47.0 3.0 3.0
        29.0 59.0 80.0 66.0
        47.0 6.0 62.0 13.0]
b = 
Float32[100.0 44.0 54.0 28.0
        20.0 41.0 6.0 55.0
        73.0 82.0 36.0 55.0]
c = 
Float32[107.0 91.0 57.0 31.0
        49.0 100.0 86.0 121.0
        120.0 88.0 98.0 68.0]

In [14]:
# unload the module -- run this cell only once: unloading the same module
# twice will crash the Julia kernel
unload(md)
# destroy the context created near the start of the notebook
destroy(ctx)