# NOTE(review): This file is a Jupyter/IJulia notebook transcript — the "In [n]:"
# and "Out[n]:" lines are cell markers, not Julia code, so the file is not runnable
# as-is; the Julia statements between markers form the actual demo script.
#
# Purpose: end-to-end demo of launching a CUDA vector-add kernel from Julia:
# query the driver/device, compile kernels/vadd.cu to PTX with nvcc, load the PTX
# module, copy two host arrays to the GPU, launch the kernel, fetch the result,
# and tear everything down.
#
# NOTE(review): the API used here (list_devices, create_context, CuModule on a
# .ptx path, launch, to_host, free, unload, destroy) is the legacy pre-1.0
# CUDA.jl / CUDAdrv-era driver API, long removed from modern CUDA.jl — confirm
# the intended package (and Julia 0.x) version before attempting to run this.
In [1]:
using CUDA
In [2]:
# Print the version of the installed NVIDIA driver's CUDA support.
println("CUDA driver version: $(CUDA.DriverVersion)")
In [3]:
# Enumerate all CUDA-capable devices visible to the driver.
println("Devices")
list_devices()
println()
In [4]:
# Select device ordinal 0 (the first GPU).
dev = CuDevice(0)
# create context
# A driver-API context must be current before any memory/module/launch calls;
# it is explicitly destroyed at the end of the script.
ctx = create_context(dev)
Out[4]:
In [22]:
# device API functions
# Query total device memory (bytes -> GB) and the compute capability.
println("Device memory (GB): $(totalmem(dev)/1e9)")
println("Device capability: $(capability(dev))")
In [5]:
# compile kernel
# Show the CUDA C source for reference, then compile it to PTX with nvcc.
# NOTE(review): nvcc writes vadd.ptx into the current working directory, which
# is where it is loaded from below — the two paths must stay in sync.
run(`cat kernels/vadd.cu`)
run(`nvcc -ptx kernels/vadd.cu`)
In [6]:
# Load the freshly generated PTX module into the current context.
println("load module from vadd.ptx")
md = CuModule("vadd.ptx")
Out[6]:
In [7]:
# Look up the "vadd" kernel entry point inside the module.
println("get function vadd")
f = CuFunction(md, "vadd")
Out[7]:
In [8]:
# Problem size: a 3x4 matrix, addressed by the kernel as len = 12 flat elements.
siz = (3, 4)
len = prod(siz)
Out[8]:
In [9]:
# load array a to GPU
# NOTE(review): scalar-style round/* on whole arrays — valid on the Julia 0.x
# this transcript targets; modern Julia would require round.(rand(...) .* 100).
# Values are random integers in [0, 100] stored as Float32, so the vadd result
# is exactly representable and easy to check by eye.
a = round(rand(Float32, siz) * 100)
ga = CuArray(a)
# load array b to GPU
b = round(rand(Float32, siz) * 100)
gb = CuArray(b)
# create array c on GPU
# Uninitialized device buffer for the kernel's output (same shape/eltype).
gc = CuArray(Float32, siz)
Out[9]:
In [10]:
# launch kernel
# NOTE(review): per the legacy launch(fn, griddim, blockdim, args) signature
# this runs `len` blocks of 1 thread each, one element per block — confirm the
# argument order against the kernel's indexing before reuse.
launch(f, len, 1, (ga, gb, gc))
In [11]:
# fetch results from GPU
# Copy the device result back into a host Array.
c = to_host(gc)
Out[11]:
In [12]:
# free GPU memory
# Release the three device buffers; they must not be used after this point.
free(ga)
free(gb)
free(gc)
Out[12]:
In [13]:
# Display inputs and the computed sum; expect c == a + b elementwise.
println("Results:")
println("a = \n$a")
println("b = \n$b")
println("c = \n$c")
In [14]:
# unload module - this is sensitive, will crash kernel if unload twice
unload(md)
# destroy context
# Tear down in reverse order of creation: module first, then the context.
destroy(ctx)