Native Julia implementation - cellwise


In [1]:
#https://github.com/quinnj/Rosetta-Julia/blob/master/src/Haversine.jl
haversine(lat1::Float32,lon1::Float32,lat2::Float32,lon2::Float32) = 2 * 6372.8 * asin(sqrt(sind((lat2-lat1)/2)^2 + cosd(lat1) * cosd(lat2) * sind((lon2 - lon1)/2)^2))


Out[1]:
haversine (generic function with 1 method)
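
As a quick sanity check (not in the original notebook), the standard Rosetta Code test coordinates give the expected distance from the Float32 method above:

haversine(36.12f0, -86.67f0, 33.94f0, -118.40f0)   #≈ 2887.26 km, the Rosetta Code reference value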

In [2]:
function pairwise_dist(lat::Vector{Float32}, lon::Vector{Float32})
    
    #Pre-allocate, since size is known
    n = length(lat)
    result = Array{Float32}(n, n)
    
    #Brute-force fill of every cell, ignoring that distance[i,j] == distance[j,i] (a symmetric variant is sketched after this cell's output)
    for i in 1:n
        for j in 1:n
            @inbounds result[i, j] = haversine(lat[i], lon[i], lat[j], lon[j])
        end
    end
    
    return result
    
end


Out[2]:
pairwise_dist (generic function with 1 method)
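
Since the comment in pairwise_dist notes that the matrix is symmetric, here is a sketch (not benchmarked in this notebook) of a variant that computes each unordered pair once and mirrors it, roughly halving the CPU work:

function pairwise_dist_sym(lat::Vector{Float32}, lon::Vector{Float32})

    n = length(lat)
    result = Array{Float32}(n, n)

    #Compute each unordered pair once, then mirror across the diagonal
    for i in 1:n
        for j in i:n
            @inbounds d = haversine(lat[i], lon[i], lat[j], lon[j])
            @inbounds result[i, j] = d
            @inbounds result[j, i] = d
        end
    end

    return result

end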

In [3]:
#Generate reasonable data
lat1 = rand(Float32, 1) .* 45
lon1 = rand(Float32, 1) .* -120

#First call includes JIT compilation of pairwise_dist/haversine, hence the extra time and allocations
@time pairwise_dist(lat1, lon1);


  0.037690 seconds (28.83 k allocations: 1.627 MiB)

In [4]:
#Generate reasonable data
lat10 = rand(Float32, 10) .* 45
lon10 = rand(Float32, 10) .* -120

@time pairwise_dist(lat10, lon10);


  0.000024 seconds (5 allocations: 704 bytes)

In [5]:
#Generate reasonable data
lat100 = rand(Float32, 100) .* 45
lon100 = rand(Float32, 100) .* -120

@time pairwise_dist(lat100, lon100);


  0.000573 seconds (6 allocations: 39.297 KiB)

In [6]:
lat1000 = rand(Float32, 1000) .* 45
lon1000 = rand(Float32, 1000) .* -120

@time pairwise_dist(lat1000, lon1000);


  0.054927 seconds (6 allocations: 3.815 MiB)

In [7]:
lat10000 = rand(Float32, 10000) .* 45
lon10000 = rand(Float32, 10000) .* -120

@time native_julia_cellwise = pairwise_dist(lat10000, lon10000);


  6.011676 seconds (7 allocations: 381.470 MiB, 0.22% gc time)

In [ ]:
lat100000 = rand(Float32, 100000) .* 45
lon100000 = rand(Float32, 100000) .* -120

@time pairwise_dist(lat100000, lon100000);

Julia implementation targeting CUDA


In [8]:
using CUDAnative, CUDAdrv
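
Before writing the kernel, it can be worth confirming that a CUDA device is actually visible. A minimal check, assuming CUDAdrv's name and capability helpers (exact API details vary by version):

dev = CuDevice(0)
CUDAdrv.name(dev), CUDAdrv.capability(dev)   #GPU name and compute capability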

In [9]:
function kernel_haversine(latpoint::Float32, lonpoint::Float32, lat::AbstractVector{Float32}, lon::AbstractVector{Float32}, out::AbstractVector{Float32})
    
    #Thread index
    #blockIdx()/threadIdx() are 1-based in CUDAnative (CUDA itself is 0-based), hence the -1 on blockIdx
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x

    #Guard: when n is not a multiple of the block size, the surplus threads in the last block
    #would otherwise index past the end of the arrays
    if i <= length(out)
        out[i] = 2 * 6372.8 * CUDAnative.asin(CUDAnative.sqrt(CUDAnative.sind((latpoint-lat[i])/2)^2 + CUDAnative.cosd(lat[i]) * CUDAnative.cosd(latpoint) * CUDAnative.sind((lonpoint - lon[i])/2)^2))
    end

    #Return nothing, since we're writing directly to the out array allocated on GPU
    return nothing
end


Out[9]:
kernel_haversine (generic function with 1 method)
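
To make the thread-index arithmetic concrete, a worked example (not from the original notebook): with blockDim().x = 1024 threads per block, the 3rd thread of the 2nd block computes i = (2 - 1) * 1024 + 3 = 1027, so it reads lat[1027]/lon[1027] and writes out[1027].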

In [10]:
#Validated that kernel_haversine/distmat returns the same answer as the CPU haversine method (validation not shown)
function distmat(lat::Vector{Float32}, lon::Vector{Float32}; dev::CuDevice=CuDevice(0))

    #Create a context
    ctx = CuContext(dev)

    #Copy the inputs to GPU arrays tied to the CUDA context and allocate the output buffer
    n = length(lat)
    d_lat = CuArray(lat)
    d_lon = CuArray(lon)
    d_out = CuArray(Vector{Float32}(n))

    #Calculate the launch configuration: number of elements, threads per block, and number of blocks
    len = n
    threads = min(len, 1024)
    blocks = Int(ceil(len/threads))
    
    #Julia side accumulation of results to relieve GPU memory pressure
    accum = Array{Float32}(n, n)
    
    #Run and time the kernel launches (one launch per point; each fills a column of the result)
    secs = CUDAdrv.@elapsed begin
        for i in 1:n
            @cuda (blocks, threads) kernel_haversine(lat[i], lon[i], d_lat, d_lon, d_out)
            accum[:, i] = Vector{Float32}(d_out)
        end
    end
    
    #Clean up context
    destroy!(ctx)
    
    #Return timing and bring results back to Julia
    return (secs, accum)
    
end


Out[10]:
distmat (generic function with 1 method)
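
For the largest timed problem, the launch configuration above works out as follows (arithmetic only; the 240 surplus threads in the last block are why the kernel guards on i <= length(out)):

n = 10_000
threads = min(n, 1024)           #1024 threads per block
blocks  = Int(ceil(n/threads))   #10 blocks, i.e. 10_240 threads launched for 10_000 points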

In [11]:
#Run kernel!!!
timing, result = distmat(lat1, lon1)
timing


Out[11]:
6.4018493f0

In [12]:
#Run kernel!!!
timing, result = distmat(lat10, lon10)
timing


Out[12]:
0.18106991f0

In [13]:
#Run kernel!!!
timing, result = distmat(lat100, lon100)
timing


Out[13]:
0.11326f0

In [14]:
#Run kernel!!!
timing, result = distmat(lat1000, lon1000)
timing


Out[14]:
0.1480775f0

In [15]:
#Run kernel!!!
timing, result = distmat(lat10000, lon10000)
timing


Out[15]:
0.76675004f0

In [16]:
#Check the GPU result against the CPU result (≈ is isapprox, tolerating Float32 rounding differences)
result ≈ native_julia_cellwise


Out[16]:
true

In [ ]:
#Run kernel!!!
timing, result = distmat(lat100000, lon100000)
timing

Plot results


In [2]:
using ECharts

vsize = [1, 10, 100, 1_000, 10_000, 100_000]
native = [0.000006, 0.000017, 0.001091, 0.090409, 5.620437, 565.008425]
gpu = [0.14232168, 0.15084915, 0.15897949, 0.16998644, 0.6376571, 24.32015]

l = scatter(vsize, hcat(native, gpu), stack = false)
smooth!(l)
legend!(l, right = "right", top = "middle")
[x._type = "line" for x in l.series]
seriesnames!(l, ["CPU", "GPU"])
yaxis!(l, name = "Time in seconds", _type = "log", scale = true, axisLine = AxisLine(show = false))
xaxis!(l, name = "Matrix dimensions (square)", _type = "log", scale = true,  axisLine = AxisLine(show = false))
title!(l, text = "Haversine distance: CPU vs. GPU", left = "center", textStyle = TextStyle(fontSize = 14))
xgridlines!(l, show = false)


Out[2]:
(ECharts output: log-log line chart "Haversine distance: CPU vs. GPU", time in seconds vs. matrix dimensions, with CPU and GPU series)
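
Reading the two timing series off the chart data above, a rough speedup calculation (not part of the original notebook) is:

speedup = native ./ gpu   #CPU time divided by GPU time at each size

The GPU only breaks even somewhere between 1,000 and 10,000 points, is roughly 9x faster at 10,000 points, and roughly 23x faster at 100,000 points.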