Matrix addition using cuda


In [1]:
from numba import cuda, vectorize
import numpy as np

In [2]:
@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add1(a, b, c):
    """This kernel function will be executed by a thread."""
    bx = cuda.blockIdx.x # which block in the grid?
    bw = cuda.blockDim.x # what is the size of a block?
    tx = cuda.threadIdx.x # unique thread ID within a blcok
    i = tx + bx * bw

    if i > c.size:
        return

    c[i] = a[i] + b[i]

In [3]:
device = cuda.get_current_device()
print(device)

n = 100

# Host memory
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)

# Assign equivalent storage on device
da = cuda.to_device(a)
db = cuda.to_device(b)

# Assign storage on device for output
dc = cuda.device_array_like(a)

# Set up enough threads for kernel
tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
print('Blocks per grid: {0}'.format(bpg))
print('Threads per block: {0}'.format(tpb))

# Launch kernel
cu_add1[bpg, tpb](da, db, dc)

# Transfer output from device to host
c = dc.copy_to_host()

print(c)


<CUDA device 0 'b'GeForce GTX 1060''>
Blocks per grid: 4
Threads per block: 32
[   0.    2.    4.    6.    8.   10.   12.   14.   16.   18.   20.   22.
   24.   26.   28.   30.   32.   34.   36.   38.   40.   42.   44.   46.
   48.   50.   52.   54.   56.   58.   60.   62.   64.   66.   68.   70.
   72.   74.   76.   78.   80.   82.   84.   86.   88.   90.   92.   94.
   96.   98.  100.  102.  104.  106.  108.  110.  112.  114.  116.  118.
  120.  122.  124.  126.  128.  130.  132.  134.  136.  138.  140.  142.
  144.  146.  148.  150.  152.  154.  156.  158.  160.  162.  164.  166.
  168.  170.  172.  174.  176.  178.  180.  182.  184.  186.  188.  190.
  192.  194.  196.  198.]

Simpler version


In [4]:
@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add2(a, b, c):
    """This kernel function will be executed by a thread."""
    i  = cuda.grid(1)

    if i > c.shape[0]:
        return

    c[i] = a[i] + b[i]

In [5]:
device = cuda.get_current_device()

n = 100
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)
c = np.empty_like(a)

tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
print('Blocks per grid: {0}'.format(bpg))
print('Threads per block: {0}'.format(tpb))

cu_add2[bpg, tpb](a, b, c)
print(c)


Blocks per grid: 4
Threads per block: 32
[   0.    2.    4.    6.    8.   10.   12.   14.   16.   18.   20.   22.
   24.   26.   28.   30.   32.   34.   36.   38.   40.   42.   44.   46.
   48.   50.   52.   54.   56.   58.   60.   62.   64.   66.   68.   70.
   72.   74.   76.   78.   80.   82.   84.   86.   88.   90.   92.   94.
   96.   98.  100.  102.  104.  106.  108.  110.  112.  114.  116.  118.
  120.  122.  124.  126.  128.  130.  132.  134.  136.  138.  140.  142.
  144.  146.  148.  150.  152.  154.  156.  158.  160.  162.  164.  166.
  168.  170.  172.  174.  176.  178.  180.  182.  184.  186.  188.  190.
  192.  194.  196.  198.]

Using vectorize


In [6]:
@vectorize(['int64(int64, int64)',
            'float32(float32, float32)',
            'float64(float64, float64)'],
           target='cuda')
def cu_add(a, b):
    return a + b

In [7]:
n = 100
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)
c = cu_add(a, b)
print(c)


[   0.    2.    4.    6.    8.   10.   12.   14.   16.   18.   20.   22.
   24.   26.   28.   30.   32.   34.   36.   38.   40.   42.   44.   46.
   48.   50.   52.   54.   56.   58.   60.   62.   64.   66.   68.   70.
   72.   74.   76.   78.   80.   82.   84.   86.   88.   90.   92.   94.
   96.   98.  100.  102.  104.  106.  108.  110.  112.  114.  116.  118.
  120.  122.  124.  126.  128.  130.  132.  134.  136.  138.  140.  142.
  144.  146.  148.  150.  152.  154.  156.  158.  160.  162.  164.  166.
  168.  170.  172.  174.  176.  178.  180.  182.  184.  186.  188.  190.
  192.  194.  196.  198.]

In [ ]: