In [1]:
#https://github.com/quinnj/Rosetta-Julia/blob/master/src/Haversine.jl
haversine(lat1::Float32,lon1::Float32,lat2::Float32,lon2::Float32) = 2 * 6372.8 * asin(sqrt(sind((lat2-lat1)/2)^2 + cosd(lat1) * cosd(lat2) * sind((lon2 - lon1)/2)^2))
Out[1]:
In [2]:
function pairwise_dist(lat::Vector{Float32}, lon::Vector{Float32})
#Pre-allocate, since size is known
n = length(lat)
result = Array{Float32}(n, n)
#Brute force fill in each cell, ignore that distance [i,j] = distance [j,i]
for i in 1:n
for j in 1:n
@inbounds result[i, j] = haversine(lat[i], lon[i], lat[j], lon[j])
end
end
return result
end
Out[2]:
In [3]:
#Generate reasonable data
lat1 = rand(Float32, 1) .* 45
lon1 = rand(Float32, 1) .* -120
@time pairwise_dist(lat1, lon1);
In [4]:
#Generate reasonable data
lat10 = rand(Float32, 10) .* 45
lon10 = rand(Float32, 10) .* -120
@time pairwise_dist(lat10, lon10);
In [5]:
#Generate reasonable data
lat100 = rand(Float32, 100) .* 45
lon100 = rand(Float32, 100) .* -120
@time pairwise_dist(lat100, lon100);
In [6]:
lat1000 = rand(Float32, 1000) .* 45
lon1000 = rand(Float32, 1000) .* -120
@time pairwise_dist(lat1000, lon1000);
In [7]:
lat10000 = rand(Float32, 10000) .* 45
lon10000 = rand(Float32, 10000) .* -120
@time native_julia_cellwise = pairwise_dist(lat10000, lon10000);
In [ ]:
lat100000 = rand(Float32, 100000) .* 45
lon100000 = rand(Float32, 100000) .* -120
@time pairwise_dist(lat100000, lon100000);
In [8]:
using CUDAnative, CUDAdrv
In [9]:
function kernel_haversine(latpoint::Float32, lonpoint::Float32, lat::AbstractVector{Float32}, lon::AbstractVector{Float32}, out::AbstractVector{Float32})
#Thread index
#Need to do the n-1 dance, since CUDA expects 0 and Julia does 1-indexing
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
out[i] = 2 * 6372.8 * CUDAnative.asin(CUDAnative.sqrt(CUDAnative.sind((latpoint-lat[i])/2)^2 + CUDAnative.cosd(lat[i]) * CUDAnative.cosd(latpoint) * CUDAnative.sind((lonpoint - lon[i])/2)^2))
#Return nothing, since we're writing directly to the out array allocated on GPU
return nothing
end
Out[9]:
In [10]:
#validated kernel_haversine/distmat returns same answer as CPU haversine method (not shown)
function distmat(lat::Vector{Float32}, lon::Vector{Float32}; dev::CuDevice=CuDevice(0))
#Create a context
ctx = CuContext(dev)
#Change to objects with CUDA context
n = length(lat)
d_lat = CuArray(lat)
d_lon = CuArray(lon)
d_out = CuArray(Vector{Float32}(n))
#Calculate number of calculations, threads, blocks
len = n
threads = min(len, 1024)
blocks = Int(ceil(len/threads))
#Julia side accumulation of results to relieve GPU memory pressure
accum = Array{Float32}(n, n)
# run and time the test
secs = CUDAdrv.@elapsed begin
for i in 1:n
@cuda (blocks, threads) kernel_haversine(lat[i], lon[i], d_lat, d_lon, d_out)
accum[:, i] = Vector{Float32}(d_out)
end
end
#Clean up context
destroy!(ctx)
#Return timing and bring results back to Julia
return (secs, accum)
end
Out[10]:
In [11]:
#Run kernel!!!
timing, result = distmat(lat1, lon1)
timing
Out[11]:
In [12]:
#Run kernel!!!
timing, result = distmat(lat10, lon10)
timing
Out[12]:
In [13]:
#Run kernel!!!
timing, result = distmat(lat100, lon100)
timing
Out[13]:
In [14]:
#Run kernel!!!
timing, result = distmat(lat1000, lon1000)
timing
Out[14]:
In [15]:
#Run kernel!!!
timing, result = distmat(lat10000, lon10000)
timing
Out[15]:
In [16]:
result ≈ native_julia_cellwise
Out[16]:
In [ ]:
#Run kernel!!!
timing, result = distmat(lat100000, lon100000)
timing
In [2]:
using ECharts
vsize = [1, 10, 100, 1_000, 10_000, 100_000]
native = [0.000006, 0.000017, 0.001091, 0.090409, 5.620437, 565.008425]
gpu = [0.14232168, 0.15084915, 0.15897949, 0.16998644, 0.6376571, 24.32015]
l = scatter(vsize, hcat(native, gpu), stack = false)
smooth!(l)
legend!(l, right = "right", top = "middle")
[x._type = "line" for x in l.series]
seriesnames!(l, ["CPU", "GPU"])
yaxis!(l, name = "Time in seconds", _type = "log", scale = true, axisLine = AxisLine(show = false))
xaxis!(l, name = "Matrix dimensions (square)", _type = "log", scale = true, axisLine = AxisLine(show = false))
title!(l, text = "Haversine distance: CPU vs. GPU", left = "center", textStyle = TextStyle(fontSize = 14))
xgridlines!(l, show = false)
Out[2]: