Deep Learning Bootcamp November 2017, GPU Computing for Data Scientists

Using CUDA, Jupyter, PyCUDA and PyTorch

02 PyCUDA add

Web: https://www.meetup.com/Tel-Aviv-Deep-Learning-Bootcamp/events/241762893/

Notebooks: On GitHub

Shlomo Kashani


In [1]:
# Notebook-wide setup: silence warnings and set matplotlib defaults.
# Ignore numpy warnings
import warnings
warnings.filterwarnings('ignore')  # NOTE: this suppresses ALL warnings, not only numpy's
import matplotlib.pyplot as plt
%matplotlib inline
# Some defaults:
plt.rcParams['figure.figsize'] = (12, 6)  # Default plot size

PyCUDA Imports


In [2]:
import pycuda
from pycuda import compiler
import pycuda.driver as drv

Make sure we have CUDA


In [3]:
# Initialise the CUDA driver API and enumerate the visible devices.
# (drv.init() must be called before any other driver-API function.)
drv.init()

# Consistent print-function calls (the original mixed the Python 2 print
# statement with print-function syntax, which breaks under Python 3).
print("%d device(s) found." % drv.Device.count())
for ordinal in range(drv.Device.count()):
    dev = drv.Device(ordinal)
    print("Device #%d: %s" % (ordinal, dev.name()))

# Bare expression: the module repr is the cell's displayed Out[] value.
drv


1 device(s) found.
Device #0: GeForce GTX 1080
Out[3]:
<module 'pycuda.driver' from '/usr/local/lib/python2.7/dist-packages/pycuda/driver.pyc'>

Simple addition on the GPU: CUDA Kernel definition


In [6]:
# pycuda.autoinit creates a context on the first device as an import
# side effect; required before SourceModule can JIT-compile anything.
import pycuda.autoinit
import numpy

from pycuda.compiler import SourceModule

# CUDA C source for an element-wise vector add. Each thread handles one
# element, indexed by threadIdx.x only — so the launch must use a single
# block whose x-dimension covers the whole array.
srcGPU = """
    #include <stdio.h>
   __global__ void addGPU(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;  
  dest[i] = a[i] + b[i];
  //dest[i] = threadIdx.x + threadIdx.y + blockDim.x;
  //dest[i] = blockDim.x;
  //printf("I am %d.%d\\n", threadIdx.x, threadIdx.y);
  
}
"""

# Compile the source with nvcc via PyCUDA; returns a module handle.
srcGPUModule = SourceModule(srcGPU)

# print-function syntax for Python 2/3 compatibility (the original used
# the Python 2 print statement, inconsistent with other cells).
print(srcGPUModule)


<pycuda.compiler.SourceModule object at 0x7ffafa85f850>

Host memory allocation


In [12]:
# Number of elements per vector; must not exceed the max threads per
# block (1024 on modern GPUs) since the kernel uses a single block.
ARR_SIZE = 16

# Host-side inputs, float32 to match the kernel's float* parameters.
# numpy.full builds the constant arrays directly — the original drew
# random values and immediately discarded them via ones_like(...)*k.
a = numpy.full(ARR_SIZE, 3, dtype=numpy.float32)
print(a)
b = numpy.full(ARR_SIZE, 2, dtype=numpy.float32)
print(b)

# Output buffer: same shape and dtype as the inputs, zero-initialised.
dest = numpy.zeros_like(a)


[ 3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.]
[ 2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.]

Execution on the GPU


In [10]:
# Look up the compiled kernel by its __global__ name.
addGPUFunc = srcGPUModule.get_function("addGPU")

print(addGPUFunc)

# One thread per element: the kernel indexes by threadIdx.x only, so a
# (ARR_SIZE, 1, 1) block is exactly enough. The original (ARR_SIZE, 32, 1)
# launched 32 redundant threads per element, all writing the same slot.
# drv.In/drv.Out perform the host<->device copies around the launch.
addGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),
           block=(ARR_SIZE, 1, 1))
print(dest)


<pycuda._driver.Function object at 0x7ffafa89ddd0>
[ 5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.]

Timing Numpy vs. PyCUDA ...


In [14]:
import timeit

rounds =3
print 'pycuda', timeit.timeit(lambda: 
                              addGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),
                                          grid=(ARR_SIZE,1,1), 
                                          block=(1,1,1)), 
                              number=rounds)
# print dest

# print 'pycuda', timeit.timeit(lambda: 
#                               multGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),                                          
#                                           block=(ARR_SIZE,1,1)), 
#                               number=rounds)

# print dest


print 'npy', timeit.timeit(lambda:a*b , 
                              number=rounds)


pycuda 0.00541090965271
npy 1.00135803223e-05

In [ ]: