(This notebook contains code from a webinar showing how to use Python and Numba to write code that takes advantage of the GPU-capabilities on AMD APUs. For a recording of this webinar, see: )
This example requires:
The easiest way to test out the code in this notebook is to download Miniconda and run the following commands to create and activate a new environment:
conda create -n hsa_webinar python=3.4 numba libhlc pandas bokeh matplotlib basemap jupyter
source activate hsa_webinar
export LD_LIBRARY_PATH=/opt/hsa/lib
jupyter notebook
In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import numba.hsa
from bokeh.plotting import output_notebook, figure, show
from bokeh.models import NumeralTickFormatter
output_notebook()
For these examples, we use geographic points information recorded by a NASA satellite observing lightning strikes from orbit. These points are excerpted from the LIS/OTD Climatology dataset.
The data have been saved in compressed CSV form in the Git repository.
In [2]:
lstrikes = pd.read_csv('points.csv.gz')
print('Number of points in file:', len(lstrikes))
lstrikes = pd.concat([lstrikes]*10) # Expand the test data by duplicating it ten times
print('Number of points in memory:', len(lstrikes))
We can draw a map showing a subset of these values:
In [3]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 12, 6
m = Basemap(projection='merc',llcrnrlat=26,urcrnrlat=37,\
llcrnrlon=-107,urcrnrlon=-90,lat_ts=1, resolution='l')
# draw coastlines.
m.drawcoastlines(color='#253746')
m.drawcountries(color='#253746')
m.drawstates(color='#253746')
m.drawmapboundary(fill_color='#089bcc')
m.fillcontinents(color='#A4BCC2',lake_color='#089bcc')
sample = lstrikes.sample(n=1000)
m.scatter(y=sample.latitude.values, x=sample.longitude.values, latlon=True, marker='.', color='yellow')
m.plot(y=30.25, x=-97.75, latlon=True, marker="*", color="#253746")
m.tissot(-97.75, 30.25, 3.59, npts=30, alpha=0.20, facecolor='#089bcc', edgecolor='#253746', linewidth='3')
plt.show()
Universal functions are an incredibly useful concept from NumPy for describing operations on arrays of data. A simple example using two ufuncs is:
In [4]:
a = np.array([1, 4, 9])
result = np.sqrt(a) + 1
print(result)
Let's first create a function that computes the distance between two points on the surface of the earth:
In [5]:
import math
R_EARTH = 6371.0 # km
@numba.hsa.jit('float64(float64)', device=True)
def deg2rad(deg):
return math.pi * deg / 180.0
@numba.vectorize('float64(float64, float64, float64, float64)', target='hsa')
def gpu_great_circle_distance(lat1, lng1, lat2, lng2):
'''Return the great-circle distance in km between (lat1, lng1) and (lat2, lng2)
on the surface of the Earth.'''
lat1, lng1 = deg2rad(lat1), deg2rad(lng1)
lat2, lng2 = deg2rad(lat2), deg2rad(lng2)
sin_lat1, cos_lat1 = math.sin(lat1), math.cos(lat1)
sin_lat2, cos_lat2 = math.sin(lat2), math.cos(lat2)
delta = lng1 - lng2
sin_delta, cos_delta = math.sin(delta), math.cos(delta)
numerator = math.sqrt((cos_lat1 * sin_delta)**2 +
(cos_lat1 * sin_lat2 - sin_lat1 * cos_lat2 * cos_delta)**2)
denominator = sin_lat1 * sin_lat2 + cos_lat1 * cos_lat2 * cos_delta
return R_EARTH * math.atan2(numerator, denominator)
Next we can verify that it works, and see how fast it is for a particular data set size:
In [6]:
print('Distance between Austin and Chicago:', gpu_great_circle_distance(30.25, -97.75, 41.8369, -87.6847)[0], 'km')
In [7]:
%timeit distances = gpu_great_circle_distance(30.25, -97.75, lstrikes.latitude.values, lstrikes.longitude.values)
To benchmark the HSA version of the algorithm, we need a NumPy version to compare with:
In [8]:
def numpy_great_circle_distance(lat1, lng1, lat2, lng2):
'''Return the great-circle distance in km between (lat1, lng1) and (lat2, lng2)
on the surface of the Earth.'''
lat1, lng1 = np.deg2rad(lat1), np.deg2rad(lng1)
lat2, lng2 = np.deg2rad(lat2), np.deg2rad(lng2)
# See last formula in http://en.wikipedia.org/wiki/Great-circle_distance#Computational_formulas
sin_lat1 = np.sin(lat1)
cos_lat1 = np.cos(lat1)
sin_lat2 = np.sin(lat2)
cos_lat2 = np.cos(lat2)
delta = lng1 - lng2
cos_delta = np.cos(delta)
sin_delta = np.sin(delta)
numerator = np.sqrt((cos_lat1 * sin_delta)**2 +
(cos_lat1 * sin_lat2 - sin_lat1 * cos_lat2 * cos_delta)**2)
denominator = sin_lat1 * sin_lat2 + cos_lat1 * cos_lat2 * cos_delta
return R_EARTH * np.arctan2(numerator, denominator)
Finally, we can benchmark for a range of data sizes:
In [9]:
target_lat, target_lng = 30.25, -97.75
cpu = []
hsa = []
ex1_n = []
for lg2 in range(8, 21):
npoints = 2**lg2
ex1_n.append(npoints)
sample = lstrikes.iloc[:npoints]
lat = sample.latitude.values
lng = sample.longitude.values
result = %timeit -o numpy_great_circle_distance(target_lat, target_lng, lat, lng)
cpu.append(result.best)
result = %timeit -o gpu_great_circle_distance(target_lat, target_lng, lat, lng)
hsa.append(result.best)
# Compute speedup ratio
ex1_speedup = np.array(cpu) / np.array(hsa)
In [10]:
p = figure(plot_width=800, plot_height=500, title='Compute Distance from Target')
p.xaxis.axis_label = 'Number of points'
p.xaxis[0].formatter = NumeralTickFormatter(format="0,0")
p.yaxis.axis_label = 'GPU speedup ratio'
p.line(ex1_n, ex1_speedup, color='navy')
p.line(ex1_n, 1.0, color='red', line_dash=[10,10])
show(p)
In [11]:
@numba.hsa.jit('float64(float64, float64, float64, float64)', device=True)
def device_great_circle_distance(lat1, lng1, lat2, lng2):
lat1, lng1 = deg2rad(lat1), deg2rad(lng1)
lat2, lng2 = deg2rad(lat2), deg2rad(lng2)
sin_lat1 = math.sin(lat1)
cos_lat1 = math.cos(lat1)
sin_lat2 = math.sin(lat2)
cos_lat2 = math.cos(lat2)
delta = lng1 - lng2
cos_delta = math.cos(delta)
sin_delta = math.sin(delta)
numerator = math.sqrt((cos_lat1 * sin_delta)**2 +
(cos_lat1 * sin_lat2 - sin_lat1 * cos_lat2 * cos_delta)**2)
denominator = sin_lat1 * sin_lat2 + cos_lat1 * cos_lat2 * cos_delta
return R_EARTH * math.atan2(numerator, denominator)
This is the kernel that will map HSA work-items to point pairs in the distance calculation:
In [12]:
@numba.hsa.jit('(float64[:], float64[:], float64[:,:])')
def distance_matrix(lat, lng, distance):
this_idx = numba.hsa.get_global_id(0)
npoints = lat.size
if this_idx >= npoints:
return
this_lat = lat[this_idx]
this_lng = lng[this_idx]
for other_idx in range(npoints):
other_lat = lat[other_idx]
other_lng = lng[other_idx]
distance[this_idx, other_idx] = device_great_circle_distance(this_lat, this_lng,
other_lat, other_lng)
And here is how to launch a computation of the distance matrix for 4 points:
In [13]:
# Create output array
sample = lstrikes.iloc[:4]
distance = np.empty(shape=(len(sample), len(sample)), dtype=np.float64)
# Compute dimensions of grid
items_per_group = 64 # Use multiple of 64
groups = int(math.ceil(len(sample) / items_per_group))
# Call GPU kernel
distance_matrix[groups, items_per_group](sample.latitude.values,
sample.longitude.values,
distance)
print(distance)
For comparison, we can compute the same matrix with NumPy:
In [14]:
numpy_great_circle_distance(sample.latitude.values[:, np.newaxis],
sample.longitude.values[:, np.newaxis],
sample.latitude.values[np.newaxis, :],
sample.longitude.values[np.newaxis, :])
Out[14]:
Again, we can benchmark the distance matrix calculation for a range of input sizes.
In [15]:
cpu = []
hsa = []
ex2_n = []
# Adding a few points at the end to verify the asymptotic behavior.
npoints_list = (2 ** np.array(range(8,13))).astype(int).tolist() + [1024 * 5, 1024 * 6, 1024 * 7]
for npoints in npoints_list:
ex2_n.append(npoints)
sample = lstrikes.iloc[:npoints]
lat = sample.latitude.values
lng = sample.longitude.values
distance = np.empty(shape=(len(sample), len(sample)), dtype=np.float64)
result = %timeit -o numpy_great_circle_distance(lat[:, np.newaxis], lng[:, np.newaxis], lat[np.newaxis, :], lng[np.newaxis, :])
cpu.append(result.best)
items_per_group = 64 # Use multiple of 64
groups = int(math.ceil(len(sample) / items_per_group))
result = %timeit -o distance_matrix[groups, items_per_group](lat, lng, distance)
hsa.append(result.best)
# Compute speedup ratio
ex2_speedup = np.array(cpu) / np.array(hsa)
In [16]:
p = figure(plot_width=800, plot_height=500, title='Compute Distance Matrix')
p.xaxis.axis_label = 'Number of points'
p.xaxis[0].formatter = NumeralTickFormatter(format="0,0")
p.yaxis.axis_label = 'GPU speedup ratio'
p.line(ex2_n, ex2_speedup, color='navy')
p.line(ex2_n, 1.0, color='red', line_dash=[10,10])
show(p)
In [17]:
def reldiff(a, b):
return 2 * (a - b) / (a + b)
cpu_reldiff = numba.vectorize('float32(float32, float32)')(reldiff)
gpu_reldiff = numba.vectorize('float32(float32, float32)', target='hsa')(reldiff)
In [18]:
x = np.random.uniform(size=10000000).astype(np.float32)
y = np.random.uniform(size=10000000).astype(np.float32)
cpu_result = cpu_reldiff(x, y)
gpu_result = gpu_reldiff(x, y)
print('CPU:', cpu_result)
print('GPU:', gpu_result)
For extremely simple functions, like reldiff, the GPU does not offer much benefit:
In [19]:
%timeit cpu_reldiff(x, y)
%timeit gpu_reldiff(x, y)
In [ ]: