In [1]:
from numba import cuda, vectorize
import numpy as np
In [2]:
@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add1(a, b, c):
    """This kernel function will be executed by a thread."""
    bx = cuda.blockIdx.x   # which block in the grid?
    bw = cuda.blockDim.x   # what is the size of a block?
    tx = cuda.threadIdx.x  # unique thread ID within a block
    i = tx + bx * bw
    if i >= c.size:
        return
    c[i] = a[i] + b[i]
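As a worked example of the flattening: with 32 threads per block, thread 5 of block 2 computes i = 5 + 2 * 32 = 69. A host-side illustration in plain Python (no GPU needed):

bw = 32        # blockDim.x: threads per block
bx, tx = 2, 5  # block index in the grid, thread index within the block
i = tx + bx * bw
print(i)       # 69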
In [3]:
device = cuda.get_current_device()
print(device)
n = 100
# Host memory
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)
# Assign equivalent storage on device
da = cuda.to_device(a)
db = cuda.to_device(b)
# Assign storage on device for output
dc = cuda.device_array_like(a)
# Set up enough threads for kernel
tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
print('Blocks per grid: {0}'.format(bpg))
print('Threads per block: {0}'.format(tpb))
# Launch kernel
cu_add1[bpg, tpb](da, db, dc)
# Transfer output from device to host
c = dc.copy_to_host()
print(c)
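As a quick sanity check (a minimal sketch, assuming the launch succeeded), the device result can be verified against a host-side NumPy computation:

np.testing.assert_allclose(c, a + b)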
Simpler version
In [4]:
@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add2(a, b, c):
    """This kernel function will be executed by a thread."""
    i = cuda.grid(1)  # global thread index, same as tx + bx * bw above
    if i >= c.shape[0]:
        return
    c[i] = a[i] + b[i]
In [5]:
device = cuda.get_current_device()
n = 100
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)
c = np.empty_like(a)
tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
print('Blocks per grid: {0}'.format(bpg))
print('Threads per block: {0}'.format(tpb))
cu_add2[bpg, tpb](a, b, c)
print(c)
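Note that cu_add2 was launched directly on host arrays: Numba implicitly copies them to the device and copies array arguments back after the kernel, which is why c is filled here, but it incurs transfers on every launch. A sketch of the explicit alternative, reusing the pattern from In [3]:

da = cuda.to_device(a)
db = cuda.to_device(b)
dc = cuda.device_array_like(a)
cu_add2[bpg, tpb](da, db, dc)
c = dc.copy_to_host()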
Using vectorize
In [6]:
@vectorize(['int64(int64, int64)',
            'float32(float32, float32)',
            'float64(float64, float64)'],
           target='cuda')
def cu_add(a, b):
    return a + b
In [7]:
n = 100
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)
c = cu_add(a, b)
print(c)
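Because @vectorize produces a NumPy ufunc, the compiled function should also broadcast like one (a small sketch; the scalar operand is an assumption for illustration):

d = cu_add(a, np.float32(1.0))  # broadcasts the scalar across a
print(d)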