Native Julia implementation - cellwise


In [1]:
#https://github.com/quinnj/Rosetta-Julia/blob/master/src/Haversine.jl
haversine(lat1::Float32,lon1::Float32,lat2::Float32,lon2::Float32) = 2 * 6372.8 * asin(sqrt(sind((lat2-lat1)/2)^2 + cosd(lat1) * cosd(lat2) * sind((lon2 - lon1)/2)^2))


Out[1]:
haversine (generic function with 1 method)
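
As a quick sanity check (not in the original notebook), the standard Rosetta Code test coordinates give the expected distance from the Float32 method above:

haversine(36.12f0, -86.67f0, 33.94f0, -118.40f0)   #≈ 2887.26 km, the Rosetta Code reference value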

In [2]:
function pairwise_dist(lat::Vector{Float32}, lon::Vector{Float32})
    
    #Pre-allocate, since size is known
    n = length(lat)
    result = Array{Float32}(n, n)
    
    #Brute-force fill of every cell, ignoring that distance[i,j] == distance[j,i] (a symmetric variant is sketched after this cell's output)
    for i in 1:n
        for j in 1:n
            @inbounds result[i, j] = haversine(lat[i], lon[i], lat[j], lon[j])
        end
    end
    
    return result
    
end


Out[2]:
pairwise_dist (generic function with 1 method)
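
Since the comment in pairwise_dist notes that the matrix is symmetric, here is a sketch (not benchmarked in this notebook) of a variant that computes each unordered pair once and mirrors it, roughly halving the CPU work:

function pairwise_dist_sym(lat::Vector{Float32}, lon::Vector{Float32})

    n = length(lat)
    result = Array{Float32}(n, n)

    #Compute each unordered pair once, then mirror across the diagonal
    for i in 1:n
        for j in i:n
            @inbounds d = haversine(lat[i], lon[i], lat[j], lon[j])
            @inbounds result[i, j] = d
            @inbounds result[j, i] = d
        end
    end

    return result

end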

In [3]:
#Generate reasonable data
lat1 = rand(Float32, 1) .* 45
lon1 = rand(Float32, 1) .* -120

#First call includes JIT compilation of pairwise_dist/haversine, hence the extra time and allocations
@time pairwise_dist(lat1, lon1);


  0.037690 seconds (28.83 k allocations: 1.627 MiB)

In [4]:
#Generate reasonable data
lat10 = rand(Float32, 10) .* 45
lon10 = rand(Float32, 10) .* -120

@time pairwise_dist(lat10, lon10);


  0.000024 seconds (5 allocations: 704 bytes)

In [5]:
#Generate reasonable data
lat100 = rand(Float32, 100) .* 45
lon100 = rand(Float32, 100) .* -120

@time pairwise_dist(lat100, lon100);


  0.000573 seconds (6 allocations: 39.297 KiB)

In [6]:
lat1000 = rand(Float32, 1000) .* 45
lon1000 = rand(Float32, 1000) .* -120

@time pairwise_dist(lat1000, lon1000);


  0.054927 seconds (6 allocations: 3.815 MiB)

In [7]:
lat10000 = rand(Float32, 10000) .* 45
lon10000 = rand(Float32, 10000) .* -120

@time native_julia_cellwise = pairwise_dist(lat10000, lon10000);


  6.011676 seconds (7 allocations: 381.470 MiB, 0.22% gc time)

In [ ]:
lat100000 = rand(Float32, 100000) .* 45
lon100000 = rand(Float32, 100000) .* -120

@time pairwise_dist(lat100000, lon100000);

Julia implementation targeting CUDA


In [8]:
using CUDAnative, CUDAdrv
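
Before writing the kernel, it can be worth confirming that a CUDA device is actually visible. A minimal check, assuming CUDAdrv's name and capability helpers (exact API details vary by version):

dev = CuDevice(0)
CUDAdrv.name(dev), CUDAdrv.capability(dev)   #GPU name and compute capability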

In [9]:
function kernel_haversine(latpoint::Float32, lonpoint::Float32, lat::AbstractVector{Float32}, lon::AbstractVector{Float32}, out::AbstractVector{Float32})
    
    #Thread index
    #blockIdx()/threadIdx() are 1-based in CUDAnative (CUDA itself is 0-based), hence the -1 on blockIdx
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x

    #Guard: when n is not a multiple of the block size, the surplus threads in the last block
    #would otherwise index past the end of the arrays
    if i <= length(out)
        out[i] = 2 * 6372.8 * CUDAnative.asin(CUDAnative.sqrt(CUDAnative.sind((latpoint-lat[i])/2)^2 + CUDAnative.cosd(lat[i]) * CUDAnative.cosd(latpoint) * CUDAnative.sind((lonpoint - lon[i])/2)^2))
    end

    #Return nothing, since we're writing directly to the out array allocated on GPU
    return nothing
end


Out[9]:
kernel_haversine (generic function with 1 method)
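
To make the thread-index arithmetic concrete, a worked example (not from the original notebook): with blockDim().x = 1024 threads per block, the 3rd thread of the 2nd block computes i = (2 - 1) * 1024 + 3 = 1027, so it reads lat[1027]/lon[1027] and writes out[1027].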

In [10]:
#Validated that kernel_haversine/distmat returns the same answer as the CPU haversine method (validation not shown)
function distmat(lat::Vector{Float32}, lon::Vector{Float32}; dev::CuDevice=CuDevice(0))

    #Create a context
    ctx = CuContext(dev)

    #Copy the inputs to GPU arrays tied to the CUDA context and allocate the output buffer
    n = length(lat)
    d_lat = CuArray(lat)
    d_lon = CuArray(lon)
    d_out = CuArray(Vector{Float32}(n))

    #Calculate the launch configuration: number of elements, threads per block, and number of blocks
    len = n
    threads = min(len, 1024)
    blocks = Int(ceil(len/threads))
    
    #Julia side accumulation of results to relieve GPU memory pressure
    accum = Array{Float32}(n, n)
    
    #Run and time the kernel launches (one launch per point; each fills a column of the result)
    secs = CUDAdrv.@elapsed begin
        for i in 1:n
            @cuda (blocks, threads) kernel_haversine(lat[i], lon[i], d_lat, d_lon, d_out)
            accum[:, i] = Vector{Float32}(d_out)
        end
    end
    
    #Clean up context
    destroy!(ctx)
    
    #Return timing and bring results back to Julia
    return (secs, accum)
    
end


Out[10]:
distmat (generic function with 1 method)
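
For the largest timed problem, the launch configuration above works out as follows (arithmetic only; the 240 surplus threads in the last block are why the kernel guards on i <= length(out)):

n = 10_000
threads = min(n, 1024)           #1024 threads per block
blocks  = Int(ceil(n/threads))   #10 blocks, i.e. 10_240 threads launched for 10_000 points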

In [11]:
#Run kernel!!!
timing, result = distmat(lat1, lon1)
timing


Out[11]:
6.4018493f0

In [12]:
#Run kernel!!!
timing, result = distmat(lat10, lon10)
timing


Out[12]:
0.18106991f0

In [13]:
#Run kernel!!!
timing, result = distmat(lat100, lon100)
timing


Out[13]:
0.11326f0

In [14]:
#Run kernel!!!
timing, result = distmat(lat1000, lon1000)
timing


Out[14]:
0.1480775f0

In [15]:
#Run kernel!!!
timing, result = distmat(lat10000, lon10000)
timing


Out[15]:
0.76675004f0

In [16]:
#Check the GPU result against the CPU result (≈ is isapprox, tolerating Float32 rounding differences)
result ≈ native_julia_cellwise


Out[16]:
true

In [ ]:
#Run kernel!!!
timing, result = distmat(lat100000, lon100000)
timing

Plot results


In [2]:
using ECharts

vsize = [1, 10, 100, 1_000, 10_000, 100_000]
native = [0.000006, 0.000017, 0.001091, 0.090409, 5.620437, 565.008425]
gpu = [0.14232168, 0.15084915, 0.15897949, 0.16998644, 0.6376571, 24.32015]

l = scatter(vsize, hcat(native, gpu), stack = false)
smooth!(l)
legend!(l, right = "right", top = "middle")
[x._type = "line" for x in l.series]
seriesnames!(l, ["CPU", "GPU"])
yaxis!(l, name = "Time in seconds", _type = "log", scale = true, axisLine = AxisLine(show = false))
xaxis!(l, name = "Matrix dimensions (square)", _type = "log", scale = true,  axisLine = AxisLine(show = false))
title!(l, text = "Haversine distance: CPU vs. GPU", left = "center", textStyle = TextStyle(fontSize = 14))
xgridlines!(l, show = false)


Out[2]:
(ECharts output: log-log line chart "Haversine distance: CPU vs. GPU", time in seconds vs. matrix dimensions, with CPU and GPU series)
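
Reading the two timing series off the chart data above, a rough speedup calculation (not part of the original notebook) is:

speedup = native ./ gpu   #CPU time divided by GPU time at each size

The GPU only breaks even somewhere between 1,000 and 10,000 points, is roughly 9x faster at 10,000 points, and roughly 23x faster at 100,000 points.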