In this Jupyter notebook we show how to use the GPU seamlessly in a scripting mode with Alea GPU V3. We calculate the pairwise distance of a large collection of points in different ways on the GPU and analyze the speedup of the different implementations.
Many machine learning algorithms require calculation of certain type of distances, because distance is a good measure for differences between data points. Distance matrices are used in Support Vector Machines, K-Nearest Neighbors, K-Means or Radial Basis Function Approximation. For instance the K-Nearest Neighbors problem can be solved by simply sorting the distance matrix.
Let us consider an $n$ by $m$ matrix $S$ where $n$ is the number of points and $m$ is the size of the feature space. We want to calculate all the pairwise distances between points $s_i = s_{i k}$, $s_j = s_{j k}$
$$ d_{ij} = d(s_i, s_j) \quad \forall i,j=1,\ldots,n $$A standard distance measures is the weighted Euclidian distance
$$ d_{L_2}(s_i, s_j) = \sqrt{\left( w_k \sum_k (s_{i k} - s_{j k})^2 \right)}. $$Other distance measures are the cosine similarity
$$ d_{\rho}(s_i, s_j) = \frac{\sum_k s_{i k} s_{j k}}{\sqrt{\sum_k s_{i k}^2}\sqrt{\sum_k s_{j k}^2}}, $$which is a useful measure in documents comparison and text mining, or the weighted Manhattan resp. $L_1$ norm
$$ d_{L_1}(s_i, s_j) = \sum_k w_k |s_{i k} - s_{j k}|. $$Also non-symmetric distance measures are useful, such as the Kullback-Leibler distance
$$ D(s_i \parallel s_j) = \sum_k s_{i k} \log \left( \frac{s_{i k}}{s_{j k}} \right). $$The complexity of the pairwise distance calculation is $O(m n^2)$ which becomes prohibitively lrage for large number of points and high dimensional feature spaces.
Before you continue you should install the F# kernel for Jupyter.
We use Paket to get version 3.0.3 beta2 NuGet packages of Alea GPU. You can run Alea GPU free on any CUDA capable GeForce or Quadro GPU. In case you want to run it on an enterprise GPU such as a Tesla K80 or the brand new NVIDIA P100 you need a license.
Unfortunately as of now (January 2017), it is not possible to run this notebook on Azure Notebooks with server side execution because the Azure Notebooks service does not yet provide GPU instances.
In [1]:
#load "Paket.fsx"
Paket.Version [ ("Alea", "3.0.3-beta2") ]
Paket.Package [ "NUnit" ]
Out[1]:
In [2]:
#load "packages/Alea/Alea.fsx"
#r "packages/Alea/lib/net45/Alea.Parallel.dll"
#r "packages/NUnit/lib/net45/nunit.framework.dll"
In [3]:
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly
In [4]:
open System
open System.Threading.Tasks;
open Alea
open Alea.FSharp
open Alea.Parallel
let gpu = Gpu.Default
let s = sprintf "GPU is %A, Number of Cores %A, GPU DRAM is %.3f GB, Process is %d bit"
gpu
gpu.Device.Cores
((float) gpu.Device.TotalMemory / 1024. / 1024. / 1024.)
(IntPtr.Size * 8)
{ Html = s }
Out[4]:
In [5]:
let toArray2D numPoints pointDim (flat:float[]) =
let matrix = Array2D.zeroCreate numPoints pointDim
for i = 0 to numPoints - 1 do
for j = 0 to pointDim - 1 do
matrix.[i, j] <- flat.[i * pointDim + j]
matrix
let symmetrizeUpperToLower (matrix:'T[,]) =
let n = matrix.GetLength 0
if n <> matrix.GetLength 1 then failwith "matrix has to be square"
for i = 0 to n - 1 do
for j = i + 1 to n - 1 do
matrix.[j, i] <- matrix.[i, j]
matrix
let inline diff (a:'T[]) (b:'T[]) =
Array.map2 (fun a b -> (a - b)) a b
let inline absError (a:'T[]) (b:'T[]) =
if a.Length <> b.Length then failwithf "arrays have different length %d, %d" a.Length b.Length
Array.map2 (fun a b -> abs (a - b)) a b |> Array.max
let inline absError2D (a:'T[,]) (b:'T[,]) : 'T =
if a.GetLength 0 <> b.GetLength 0 || a.GetLength 1 <> b.GetLength 1 then failwith "incompatible matrix size"
Seq.map2 (fun a b -> abs (a - b)) (Seq.cast<'T> a) (Seq.cast<'T> b) |> Seq.max
let flatten (a:'T[,]) =
a |> Seq.cast<'T> |> Seq.toArray
let random = new Random(42)
We start with a single data set and calculate the pairwise distances between all points. Because the Euclidian distance is symmetric, the resulting matrix is symmetric. For $n$ points we have to process $n(n-1)/2$ pairs.
The first implementation is a serial implementation:
In [12]:
let pairwiseDistanceCPU (points:float[,]) =
let numPoints = points.GetLength(0)
let pointDim = points.GetLength(1)
seq {
for i = 0 to numPoints - 1 do
for j = i + 1 to numPoints - 1 do
let mutable dist = 0.0
for k = 0 to pointDim - 1 do
let d = points.[i, k] - points.[j, k]
dist <- dist + d*d
yield Math.Sqrt dist
} |> Seq.toArray
To parallelize the calculation we map the index pair $(i, j)$ with $i < j$ to a linear index $k$ as follows:
In [13]:
let upperTriangularMatrixSize n =
n*(n - 1)/2
[<ReflectedDefinition>]
let toIndex n (k:int) =
let i = n - 2 - int(floor(0.5*sqrt(float(-8*k + 4*n*(n-1)-7)) - 0.5))
let j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2
i, j
let toLinear n i j =
(n*(n-1)/2) - (n-i)*((n-i)-1)/2 + j - i - 1
let toUpderTriangular n (array:'T[]) =
let matrix = Array2D.zeroCreate<'T> n n
array |> Array.iteri (fun l a -> let i, j = toIndex n l in matrix.[i, j] <- a)
matrix
In [14]:
let n = 5
type data = {Linear:int; iExpected:int; jExpected:int; iCalculated:int; jCalculated:int}
seq {
for i0 = 0 to n-1 do
for j0 = i0+1 to n-1 do
let k = toLinear n i0 j0
let i, j = toIndex n k
yield {Linear=k; iExpected=i0; jExpected=j0; iCalculated=i; jCalculated=j}
} |> Util.Table
Out[14]:
The following function handles the distance calculation for a pair of points $(i, j)$ correspoinding to a linear index $k$.
In [15]:
[<ReflectedDefinition>]
let pairwiseDistance (points:float[,]) numPoints linear =
let i, j = toIndex numPoints linear
let mutable dist = 0.0
for k = 0 to points.GetLength(1) - 1 do
let d = points.[i, k] - points.[j, k]
dist <- dist + d*d
sqrt dist
In [16]:
let pairwiseDistanceParallelForCPU (points:float[,]) =
let numPoints = points.GetLength(0);
let n = upperTriangularMatrixSize numPoints
let distances = Array.zeroCreate n;
Parallel.For(0, n, (fun linear -> distances.[linear] <- pairwiseDistance points numPoints linear)) |> ignore
distances
In [17]:
let randomMatrix numPoints pointDim =
Array2D.init numPoints pointDim (fun i j -> random.NextDouble())
let numPoints = 5
let pointDim = 3
let points = randomMatrix numPoints pointDim
In [18]:
let distances = pairwiseDistanceParallelForCPU points
distances |> toUpderTriangular (points.GetLength 0) |> symmetrizeUpperToLower
Out[18]:
In [19]:
absError distances (pairwiseDistanceCPU points)
Out[19]:
In [20]:
let pairwiseDistanceParallelForGPU (gpu:Gpu) (hostPoints:float[,]) =
let numPoints = hostPoints.GetLength(0);
let n = upperTriangularMatrixSize numPoints
let points = gpu.Allocate(hostPoints);
let distances = gpu.Allocate<double>(n);
// use IL to catch closure, then call static a function which is compiled to GPU code from a quotation
gpu.For(0, n, (fun linear -> distances.[linear] <- pairwiseDistance points numPoints linear))
let hostDistances = Gpu.CopyToHost(distances)
Gpu.Free(points)
Gpu.Free(distances)
hostDistances
In [21]:
let distancesGpu = pairwiseDistanceParallelForGPU Gpu.Default points
distancesGpu |> toUpderTriangular (points.GetLength 0) |> symmetrizeUpperToLower
Out[21]:
In [22]:
absError distances distancesGpu
Out[22]:
In [38]:
[<ReflectedDefinition>]
let toIndexForGpu n (k:int) =
let i = n - 2 - int(floor(0.5*sqrt(float(-8*k + 4*n*(n-1)-7)) - 0.5))
let j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2
int2(i, j)
[<ReflectedDefinition>]
let pairwiseDistanceGpu (points:float[,]) (index:int2) =
let mutable dist = 0.0
for k = 0 to points.GetLength(1) - 1 do
let d = points.[index.x, k] - points.[index.y, k]
dist <- dist + d*d
sqrt dist
let pairwiseDistanceParallelForGPU2(gpu:Gpu) (hostPoints:float[,]) =
let numPoints = hostPoints.GetLength(0);
let n = upperTriangularMatrixSize numPoints
let points = gpu.Allocate(hostPoints);
let distances = gpu.Allocate<float>(n);
gpu.For(0, n, (fun linear -> distances.[linear] <- pairwiseDistanceGpu points (toIndexForGpu numPoints linear)))
let hostDistances = Gpu.CopyToHost(distances)
Gpu.Free(points)
Gpu.Free(distances)
hostDistances
In [39]:
let distancesGpu = pairwiseDistanceParallelForGPU2 Gpu.Default points
distancesGpu |> toUpderTriangular (points.GetLength 0) |> symmetrizeUpperToLower
Out[39]:
In [40]:
absError distances distancesGpu
Out[40]:
We now write a specific GPU kernel that exploits shared memory to reduce the access to global GPU memory and reuse data more efficiently.
In [41]:
let blockSize = 16
[<ReflectedDefinition>]
let pairwiseDistanceGpuKernel =
<@ fun (points:float[]) (distances:float[]) numPoints pointDim ->
let Xs = __shared__.Array2D<double>(blockSize, blockSize)
let Ys = __shared__.Array2D<double>(blockSize, blockSize)
let bx = blockIdx.x
let by = blockIdx.y
let tx = threadIdx.x
let ty = threadIdx.y
let xBegin = bx*blockSize*pointDim
let yBegin = by*blockSize*pointDim
let yEnd = yBegin + pointDim - 1
let mutable y = yBegin
let mutable x = xBegin
let mutable dist = 0.0
while y <= yEnd do
Xs.[tx, ty] <- points.[x + ty*pointDim + tx]
Ys.[ty, tx] <- points.[y + ty*pointDim + tx]
DeviceFunction.SyncThreads()
for k = 0 to blockSize - 1 do
let d = Ys.[ty, k] - Xs.[k, tx]
dist <- dist + d*d
DeviceFunction.SyncThreads()
x <- x + blockSize
y <- y + blockSize
let i = (by*blockSize + ty)*numPoints + bx*blockSize + tx
distances.[i] <- sqrt dist @>
|> Compiler.makeKernel
let divUp num den = (num + den - 1) / den
let pairwiseDistanceGpuBlocked (gpu:Gpu) (hostPoints:float[]) (numPoints:int) (pointDim:int) =
let points = gpu.Allocate(hostPoints);
let distances = gpu.Allocate<float>(numPoints*numPoints)
let block = new dim3(blockSize, blockSize)
let gridSize = divUp numPoints blockSize
let grid = new dim3(gridSize, gridSize)
let lp = LaunchParam(grid, block)
gpu.Launch pairwiseDistanceGpuKernel lp points distances numPoints pointDim
let hostDistances = Gpu.CopyToHost(distances)
Gpu.Free(points)
Gpu.Free(distances)
hostDistances
In [42]:
let numPoints = 10*blockSize
let pointDim = blockSize
let points = randomMatrix numPoints pointDim
In [43]:
let distances = pairwiseDistanceGpuBlocked Gpu.Default (points |> flatten) numPoints pointDim
distances |> Array.take 5
Out[43]:
In [44]:
let distances2 =
pairwiseDistanceParallelForGPU Gpu.Default points
|> toUpderTriangular (points.GetLength 0)
|> symmetrizeUpperToLower
|> flatten
absError distances distances2
Out[44]:
In [45]:
type data =
{
NumPoints: int; PointDim: int; Implementation: string; Timing: string; Speedup: string
}
static member Create numPoints pointDim t0 impl t =
{
NumPoints = numPoints
PointDim = pointDim
Implementation = impl
Timing = sprintf "%.4f s" (t / 1000.0)
Speedup = sprintf "%.2f" (t0 / t)
}
let gpu = Gpu.Default
let timeIt (f : unit -> 'T) =
gpu.Synchronize()
GC.Collect()
GC.WaitForPendingFinalizers()
let t = System.Diagnostics.Stopwatch.StartNew()
f() |> ignore
t.Elapsed.TotalMilliseconds
let performance numPoints pointDim =
let points = randomMatrix numPoints pointDim
let pointsFlat = points |> flatten
// jit warmup
pairwiseDistanceParallelForGPU gpu points |> ignore
pairwiseDistanceParallelForGPU2 gpu points |> ignore
pairwiseDistanceGpuBlocked gpu pointsFlat numPoints pointDim |> ignore
// reference
let t0 = timeIt (fun () -> pairwiseDistanceCPU points)
let create = data.Create numPoints pointDim t0
[
create "cpu sequential" t0
create "cpu parallel for" (timeIt (fun () -> pairwiseDistanceParallelForCPU points))
create "gpu parallel for" (timeIt (fun () -> pairwiseDistanceParallelForGPU gpu points))
create "gpu parallel for with int2" (timeIt (fun () -> pairwiseDistanceParallelForGPU2 gpu points))
create "gpu custom kernel" (timeIt (fun () -> pairwiseDistanceGpuBlocked gpu pointsFlat numPoints pointDim))
]
In [46]:
let results =
[10; 50; 100; 150; 200; 250; 300]
|> List.map (fun s -> performance (s*blockSize) blockSize)
|> List.concat
results |> Util.Table
Out[46]:
In [47]:
let layout =
Layout(title = "CPU / GPU Performance",
xaxis=Xaxis(title = "Numer of Points", showgrid = false, zeroline = false),
yaxis=Yaxis(title = "Speedup", showline = false), showlegend = true)
let grouped = results |> List.groupBy (fun data -> data.Implementation)
let bars =
grouped |> List.map(fun (impl, s) ->
let x = s |> List.map (fun data -> sprintf "%d" data.NumPoints)
let y = s |> List.map (fun data -> data.Speedup)
Bar(x = x, y = y, name = impl))
bars
|> Seq.toList
|> Chart.Plot
|> Chart.WithLayout layout
Out[47]:
In [ ]: