In this Jupyter notebook we show how to use the GPU seamlessly in a scripting mode with Alea GPU V3. We develop an matrix expression GPU expression library and use it to perform Hamiltonian Monte Carlo simulation on the GPU. The highlights of the notebook are:
A light weight GPU matrix expression library which is capable to do elementwise kernel fusion to minimize creation of temporaries and reduce number of GPU kernels to be called
Using the power of F# computation expression and opeator overloading to get concise syntax close to mathematical notation
Showing how to develop Hamiltonian Monte Carlo from scratch and apply it to sample the posterior distribution of a multinomial regression model
Illustrating the concpets with the MNIST data set
Bayesian inference requires algorithms capable to fit models with thousands of parameters and complex nonlinearities. Hamiltonian Monte Carlo initially introduced by Duan, Kennedy, Pendleton and Roweth in 1987 has proven to be extremely successful. Hamiltonian Monte Carlo (HMC) is a variation of the Markov Chain Monte Carlo (MCMC) methods which use the derivative of the posterior distribution to select the step direction and size. Therefore, HMC preferentially choose directions of increasing probability.
There are several papers explaining HMC in detail.
Before you continue you should install the F# kernel for Jupyter.
We use Paket to get the newest version 3.0.3 beta2 NuGet packages of Alea GPU. You can run Alea GPU free on any CUDA capable GeForce or Quadro GPU. In case you want to run it on an enterprise GPU such as a Tesla K80 or the brand new NVIDIA P100 you need a license.
Unfortunately as of now (January 2017), it is not possible to run this notebook on Azure Notebooks with server side execution because the Azure Notebooks service does not yet provide GPU instances.
In [1]:
#load "Paket.fsx"
Paket.Version [ ("Alea", "3.0.3-beta2") ]
Paket.Package [ "NUnit" ]
Out[1]:
In [2]:
#load "packages/Alea/Alea.fsx"
#r "packages/Alea/lib/net45/Alea.Parallel.dll"
#r "packages/NUnit/lib/net45/nunit.framework.dll"
In [3]:
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly
In [4]:
open System
open System.IO
open System.IO.Compression
open System.Linq
open System.Threading.Tasks
open System.Net
open System.Diagnostics
open System.Collections.Generic
open Microsoft.FSharp.Quotations
open Alea
open Alea.CSharp
open Alea.Parallel
open Alea.cuBLAS
open Alea.cuRAND
let gpu = Gpu.Default
let s = sprintf "GPU is %A, Number of Cores %A, GPU DRAM is %.3f GB, Process is %d bit"
gpu
gpu.Device.Cores
((float) gpu.Device.TotalMemory / 1024. / 1024. / 1024.)
(IntPtr.Size * 8)
{ Html = s }
Out[4]:
In [5]:
let divUp num den = (num + den - 1) / den
let array2DAsLists (a:'T[,]) =
List.init (a.GetLength 0) (fun i -> List.init (a.GetLength 1) (fun j -> a.[i, j]))
// flatten a 2D array to an array in column major storage order
let flattenArray2D (a:'T[,]) =
let numRows = a.GetLength 0
let numCols = a.GetLength 1
let aflat = Array.zeroCreate (numRows*numCols)
let mutable k = 0
for j = 0 to numCols - 1 do
for i = 0 to numRows - 1 do
aflat.[k] <- a.[i, j]
k <- k + 1
aflat
// get a column
let columnVec (a:'T[,]) col =
let numRows = a.GetLength 0
let columnVec = Array.zeroCreate numRows
for i = 0 to numRows - 1 do
columnVec.[i] <- a.[i, col]
columnVec
// reconstruct 2D array from a flat array in column major storage order
let array2DColMajor numRows numCols (aflat:'T[]) =
let a = Array2D.zeroCreate numRows numCols
let mutable k = 0
for j = 0 to numCols - 1 do
for i = 0 to numRows - 1 do
a.[i, j] <- aflat.[k]
k <- k + 1
a
// reconstruct 2D array from a flat array in row major storage order
let array2DRowMajor numRows numCols (aflat:'T[]) =
let a = Array2D.zeroCreate numRows numCols
let mutable k = 0
for i = 0 to numRows - 1 do
for j = 0 to numCols - 1 do
a.[i, j] <- aflat.[k]
k <- k + 1
a
// matrix product a*b with a of dimension m x k, b of dimension k x n, c of dimension m x n, all in Fortran column major storage
let inline multiplyCpu m n k (a:'T[]) (b:'T[]) =
let lda = m
let ldb = k
let ldc = m
let c = Array.zeroCreate (m*n)
for row = 0 to m - 1 do
for col = 0 to n - 1 do
let mutable sum = LanguagePrimitives.GenericZero
for l = 0 to k - 1 do
sum <- sum + a.[row + l*lda] * b.[l + col*ldb]
c.[row + col*ldc] <- sum
c
// matrix product a^t*b with a of dimension k x m, b of dimension k x n, c of dimension m x n, all in Fortran column major storage
let inline multiplyTranspCpu m n k (a:'T[]) (b:'T[]) =
let lda = k
let ldb = k
let ldc = m
let c = Array.zeroCreate (m*n)
for row = 0 to m - 1 do
for col = 0 to n - 1 do
let mutable sum = LanguagePrimitives.GenericZero
for l = 0 to k - 1 do
sum <- sum + a.[l + row*lda] * b.[l + col*ldb]
c.[row + col*ldc] <- sum
c
// transpose a matrix in Fortran column major storage format
let inline transpose numRows numCols (aflat:'T[]) =
let atflat = Array.zeroCreate (numRows*numCols)
for i = 0 to numRows - 1 do
for j = 0 to numCols - 1 do
atflat.[i*numCols + j] <- aflat.[j*numRows + i]
atflat
let inline error (a:'T[]) (b:'T[]) = Array.map2 (fun a b -> abs (a - b)) a b |> Array.max
In [6]:
let [<Literal>] UseFortranOrder = true
type Session(gpu:Gpu) =
inherit DisposableObject()
let session = new Alea.Session(gpu)
let blas = new cuBLAS.Blas(gpu)
let rand = cuRAND.Generator.CreateGpu(gpu, cuRAND.RngType.PSEUDO_DEFAULT)
let memoryPoolAll = List<DeviceMemory<byte>>()
let memoryPoolBin = List<DeviceMemory<byte>>()
let mutable disposed = false
override this.Dispose(disposing) =
if disposing then
lock this <| fun _ ->
if not disposed then
memoryPoolAll |> Seq.iter (fun memory -> memory.Dispose())
memoryPoolAll.Clear()
memoryPoolBin.Clear()
blas.Dispose()
rand.Dispose()
session.Dispose()
disposed <- true
base.Dispose(disposing)
member this.Gpu = gpu
member this.Session = session
member this.BLAS = blas
member this.RAND = rand
member this.Allocate<'T>(size:int) =
lock this <| fun _ ->
if not disposed then
let size = (nativeint size) * (nativeint (Gpu.SizeOf<'T>()))
let idx = memoryPoolBin.FindIndex(fun memory -> memory.Size >= size)
if idx < 0 then // not found
let memory = gpu.AllocateDevice<byte>(size)
memoryPoolAll.Add(memory)
deviceptr<'T>(memory.Ptr.Handle)
else
let memory = memoryPoolBin.[idx]
memoryPoolBin.RemoveAt(idx)
deviceptr<'T>(memory.Ptr.Handle)
else failwith "Session is disposed!"
member this.Release(ptr:deviceptr<'T>) =
lock this <| fun _ ->
if not disposed then
let idx = memoryPoolAll.FindIndex(fun memory -> memory.Handle = ptr.Handle)
if idx >= 0 then // find
let memory = memoryPoolAll.[idx]
let compare = System.Comparison<DeviceMemory<byte>>
(fun memory1 memory2 ->
let x = memory1.Size
let y = memory2.Size
if x < y then -1 elif x > y then 1 else 0)
memoryPoolBin.Add(memory)
memoryPoolBin.Sort(compare)
type Calculation<'T>(f:Session -> 'T) =
member this.Invoke b = f b
type CalculationBuilder() =
member this.Bind(x:Calculation<'A>, res:'A -> Calculation<'B>) =
Calculation(fun b -> let r = x.Invoke(b) in res(r).Invoke(b))
member this.Return(x:'A) = Calculation(fun b -> x)
member this.ReturnFrom(x:'A) = x
member this.Zero() = Calculation(fun b -> ())
member this.For(elements:seq<'A>, forBody:'A -> Calculation<unit>) =
Calculation(fun b -> elements |> Seq.iter (fun e -> (forBody e).Invoke(b); GC.Collect(); GC.WaitForPendingFinalizers()))
member this.Combine(partOne:Calculation<unit>, partTwo:Calculation<'A>) =
Calculation(fun b -> partOne.Invoke(b); partTwo.Invoke(b))
member this.Delay(restOfComputation:unit -> Calculation<'A>) =
Calculation(fun b -> restOfComputation().Invoke(b))
member this.Using<'A, 'B when 'A :> IDisposable>(x:'A, res:'A -> Calculation<'B>) =
Calculation(fun b -> try res(x).Invoke(b) finally x.Dispose())
let gpuRun<'T> session (calculation:Calculation<'T>) =
calculation.Invoke session
let gpucalc = CalculationBuilder()
type GpuMatFill<'T> =
| Undefined
| PreAllocated of dmem:DeviceMemory<'T>
| FillFunc of fill:(deviceptr<'T> -> unit)
| Elementwise of emit:Func<int, 'T>
| Transpose of mat:GpuMat<'T>
member this.Fill (session:Alea.Session) (rows:int) (cols:int) (ptr:deviceptr<'T>) =
match this with
| Undefined -> ()
| PreAllocated _ -> ()
| FillFunc fill -> fill ptr
| Elementwise emit -> session.For(0, rows*cols, (fun i -> ptr.Set(i, emit.Invoke i)))
| Transpose mat -> failwith "Transpose is TODO."
and GpuMat<'T>(session:Session, rows:int, cols:int, initFill:GpuMatFill<'T>) =
inherit DisposableObject()
let size = rows * cols
let mutable _fill = initFill
let mutable ptrOpt =
match _fill with
| PreAllocated memory -> Some memory.Ptr
| _ -> None
override this.Dispose(disposing) =
match ptrOpt with
| Some ptr ->
session.Release(ptr)
ptrOpt <- None
| _ -> ()
GC.KeepAlive(initFill)
base.Dispose(disposing)
member inline this.HasMemory = ptrOpt.IsSome
member inline this.Fill (ptr:deviceptr<'T>) =
_fill.Fill session.Session rows cols ptr
member inline this.Ptr =
match ptrOpt with
| Some ptr -> ptr
| None ->
let ptr = session.Allocate<'T>(size)
this.Fill ptr
// this memory is owned by this mat, so we change the fill method
_fill <- Elementwise (Func<_,_> (fun i -> ptr.[i]))
ptrOpt <- Some ptr
ptr
member inline this.ElementwiseEmit (targetRows:int) (targetCols:int) =
if rows = targetRows && cols = targetCols then
match _fill with
| Elementwise emit -> emit
| _ ->
let ptr = this.Ptr
Func<_,_> (fun i -> ptr.[i])
elif rows = 1 && cols = 1 then
match _fill with
| Elementwise emit -> Func<_,_> (fun i -> emit.Invoke 0)
| _ ->
let ptr = this.Ptr
Func<_,_> (fun i -> ptr.[0])
else
failwithf "Broadcast from (%d,%d) to (%d,%d) isn't supported." rows cols targetRows targetCols
member inline this.MatrixForBlas =
match UseFortranOrder with
| true ->
match _fill with
| Transpose mat -> mat.Ptr, cuBLAS.Operation.T, this.Cols
| _ -> this.Ptr, cuBLAS.Operation.N, this.Rows
| false ->
failwith "TODO: Add a kernel to do transpose"
member this.Session = session
member this.Gpu = session.Gpu
member this.Rows = rows
member this.Cols = cols
member this.Size = size
member this.T =
let fill = Transpose this
new GpuMat<'T>(session, cols, rows, fill)
member this.ToHost() =
let ptr = this.Ptr
if not UseFortranOrder || rows = 1 || cols = 1 then
let array = Array2D.zeroCreate<'T> rows cols
Gpu.Copy(session.Gpu, ptr, array, 0L, int64 size)
array
else
// we need transpose the matrix to get C order for
let array = Array2D.zeroCreate<'T> cols rows
Gpu.Copy(session.Gpu, ptr, array, 0L, int64 size)
Array2D.init rows cols (fun row col -> array.[col, row])
static member inline (.<-) (lhs:GpuMat<float>, rhs:GpuMat<float>) =
rhs.Fill lhs.Ptr
static member inline ElementwiseTransform(a:GpuMat<'T>, op:Func<'T, 'T>) =
let session = a.Session
let rows = a.Rows
let cols = a.Cols
let emitA = a.ElementwiseEmit rows cols
let fill = Elementwise (Func<_,_> (fun i ->
let mutable a = emitA.Invoke i
let result = op.Invoke(a)
// workaround to avoid tail instruction
a <- __default_value<'T>()
result))
new GpuMat<'T>(session, rows, cols, fill)
static member inline ElementwiseTransform(a:GpuMat<'T>, b:GpuMat<'T>, op:Func<'T, 'T, 'T>) =
let session = a.Session
let rows = max a.Rows b.Rows
let cols = max a.Cols b.Cols
let emitA = a.ElementwiseEmit rows cols
let emitB = b.ElementwiseEmit rows cols
let fill = Elementwise (Func<_,_> (fun i ->
let mutable a = emitA.Invoke i
let mutable b = emitB.Invoke i
let result = op.Invoke(a, b)
// workaround to avoid tail instruction
a <- __default_value<'T>()
b <- __default_value<'T>()
result))
new GpuMat<'T>(session, rows, cols, fill)
static member inline ElementwiseTransform(a:GpuMat<'T>, b:'T, op:Func<'T, 'T, 'T>) =
let session = a.Session
let rows = a.Rows
let cols = a.Cols
let emitA = a.ElementwiseEmit rows cols
let fill = Elementwise (Func<_,_> (fun i ->
let mutable a = emitA.Invoke i
let mutable b = b
let result = op.Invoke(a, b)
// workaround to avoid tail instruction
a <- __default_value<'T>()
b <- __default_value<'T>()
result))
new GpuMat<'T>(session, rows, cols, fill)
static member inline ElementwiseTransform(a:'T, b:GpuMat<'T>, op:Func<'T, 'T, 'T>) =
let session = b.Session
let rows = b.Rows
let cols = b.Cols
let emitB = b.ElementwiseEmit rows cols
let fill = Elementwise (Func<_,_> (fun i ->
let mutable a = a
let mutable b = emitB.Invoke i
let result = op.Invoke(a, b)
// workaround to avoid tail instruction
a <- __default_value<'T>()
b <- __default_value<'T>()
result))
new GpuMat<'T>(session, rows, cols, fill)
static member (~-) (a:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(a, (fun a -> -a))
static member (.+) (a:GpuMat<float>, b:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))
static member (.+) (a:float, b:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))
static member (.+) (a:GpuMat<float>, b:float) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))
static member (.-) (a:GpuMat<float>, b:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a - b))
static member (.-) (a:float, b:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a - b))
static member (.*) (a:GpuMat<float>, b:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a * b))
static member (.*) (a:float, b:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a * b))
static member (./) (a:GpuMat<float>, b:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a / b))
static member (./) (a:GpuMat<float>, b:float) =
GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a / b))
static member ( * ) (a:GpuMat<float>, b:GpuMat<float>) =
let session = a.Session
let rowsA = a.Rows
let rowsB = b.Rows
let rowsC = rowsA
let colsA = a.Cols
let colsB = b.Cols
let colsC = colsB
let fill =
match UseFortranOrder with
| true ->
FillFunc <| fun (cptr:deviceptr<float>) ->
let ldc = rowsC
let aptr, opA, lda = a.MatrixForBlas
let bptr, opB, ldb = b.MatrixForBlas
session.BLAS.Gemm(opA, opB, rowsC, colsC, rowsB, 1.0, aptr, lda, bptr, ldb, 0.0, cptr, ldc)
GC.KeepAlive(a)
GC.KeepAlive(b)
| false ->
failwith "TODO"
new GpuMat<float>(session, rowsC, colsC, fill)
let undef (rows:int) (cols:int) =
Calculation<GpuMat<float>> <| fun session ->
new GpuMat<float>(session, rows, cols, Undefined)
let zeros (rows:int) (cols:int) =
Calculation<GpuMat<float>> <| fun session ->
let fill = Elementwise (Func<_,_> (fun i -> 0.0))
new GpuMat<float>(session, rows, cols, fill)
let ones (rows:int) (cols:int) =
Calculation<GpuMat<float>> <| fun session ->
let fill = Elementwise (Func<_,_> (fun i -> 1.0))
new GpuMat<float>(session, rows, cols, fill)
let map (gmat:GpuMat<'T>) (op:Func<'T, 'T>) =
GpuMat<'T>.ElementwiseTransform(gmat, op)
let gpuLog (gmat:GpuMat<float>) =
GpuMat<float>.ElementwiseTransform(gmat, Func<_,_> log)
let reduce (mat:GpuMat<'T>) (op:Func<'T, 'T, 'T>) =
let session = mat.Session
let fill = GpuMatFill<'T>.FillFunc <| fun (ptr:deviceptr<'T>) ->
let emit = mat.ElementwiseEmit mat.Rows mat.Cols
session.Session.Aggregate(mat.Size, emit, (fun v -> ptr.Set(0, v)), op)
GC.KeepAlive(mat)
new GpuMat<'T>(session, 1, 1, fill)
let sum (mat:GpuMat<float>) =
reduce mat (Func<_,_,_> (+))
let randn (rows:int) (cols:int) (mean:float) (stddev:float) =
Calculation<GpuMat<float>> <| fun session ->
let fill = FillFunc <| fun (ptr:deviceptr<float>) ->
session.RAND.GenerateNormal(ptr, (rows*cols) |> uint64, mean, stddev)
new GpuMat<float>(session, rows, cols, fill)
let randnInplace (mat:GpuMat<float>) (mean:float) (stddev:float) =
let session = mat.Session
session.RAND.GenerateNormal(mat.Ptr, mat.Size |> uint64, mean, stddev)
let session () =
Calculation<Session> <| fun session -> session
let toHost (mat:GpuMat<'T>) =
mat.ToHost()
let head (mat:GpuMat<'T>) =
mat.ToHost().[0,0]
let toDevice (host:'T[,]) =
let rows = host.GetLength 0
let cols = host.GetLength 1
Calculation<GpuMat<'T>> <| fun session ->
match UseFortranOrder with
| true ->
let flattenedArray2D =
let aflat = Array.zeroCreate (rows*cols)
let mutable k = 0
for j = 0 to cols - 1 do
for i = 0 to rows - 1 do
aflat.[k] <- host.[i, j]
k <- k + 1
aflat
let memory = session.Gpu.AllocateDevice(flattenedArray2D)
let fill = PreAllocated memory
new GpuMat<'T>(session, rows, cols, fill)
| false ->
failwith "TODO for C order"
In [7]:
let calc = gpucalc {
let! a = ones 4 5
let! b = ones 3 5
let c = a * b.T
return (toHost c)
}
let session = new Session(Gpu.Default)
let result = calc |> gpuRun session
result
Out[7]:
In [8]:
let n = 100
let m = 50
let k = 75
let calc = gpucalc {
let! A = randn m k 0.0 1.0
let! B = randn k n 0.0 1.0
let! C = undef m n
C .<- A * B
let hC = toHost C |> flattenArray2D
let hhC = multiplyCpu m n k (toHost A |> flattenArray2D) (toHost B |> flattenArray2D)
return (error hC hhC)
}
let session = new Session(Gpu.Default)
let err = calc |> gpuRun session
err
Out[8]:
In [9]:
let n = 100
let m = 50
let k = 75
let calc = gpucalc {
let! A = randn k m 0.0 1.0
let! B = randn k n 0.0 1.0
let! C = undef m n
C .<- A.T * B
let hC = toHost C |> flattenArray2D
let hhC = multiplyTranspCpu m n k (toHost A |> flattenArray2D) (toHost B |> flattenArray2D)
return (error hC hhC)
}
let session = new Session(Gpu.Default)
let err = calc |> gpuRun session
err
Out[9]:
In [10]:
let url = @"http://yann.lecun.com/exdb/mnist/"
let trainImagesFile = @"train-images-idx3-ubyte"
let trainLabelsFile = @"train-labels-idx1-ubyte"
let testImagesFile = @"t10k-images-idx3-ubyte"
let testLabelsFile = @"t10k-labels-idx1-ubyte"
let (@@) a b = System.IO.Path.Combine(a, b)
let decompress fileName =
let fileToDecompress = new FileInfo(fileName + ".gz")
use originalFileStream = fileToDecompress.OpenRead()
use decompressedFileStream = File.Create(fileName)
use decompressionStream = new GZipStream(originalFileStream, CompressionMode.Decompress)
decompressionStream.CopyTo(decompressedFileStream)
let get url file downloadTo =
use client = new WebClient()
client.DownloadFile(url + file + ".gz", (downloadTo @@ file) + ".gz")
decompress (downloadTo @@ file)
let downLoad url downloadTo =
let files = [trainImagesFile; trainLabelsFile; testImagesFile; testLabelsFile]
Directory.CreateDirectory downloadTo |> ignore
files |> List.iter (fun file -> if not <| File.Exists(downloadTo @@ file) then get url file downloadTo)
downloadTo
let readData (brImages:BinaryReader) (brLabels:BinaryReader) numSamples =
let npix, nclasses = 28, 10
let images = Array2D.zeroCreate<float32> numSamples (npix*npix)
let labels = Array2D.zeroCreate<float32> numSamples nclasses
for i = 0 to numSamples - 1 do
for x = 0 to npix - 1 do
for y = 0 to npix - 1 do
images.[i, x*npix + y] <- (brImages.ReadByte() |> float32)/255.0f
labels.[i, brLabels.ReadByte() |> int] <- 1.0f
images, labels
let skipImages (brImages:BinaryReader) =
brImages.ReadInt32() |> ignore // skip magic
brImages.ReadInt32() |> ignore // skip num images
brImages.ReadInt32() |> ignore // skip rows
brImages.ReadInt32() |> ignore // skip cols
let skipLabels (brLabels:BinaryReader) =
brLabels.ReadInt32() |> ignore // skip magic
brLabels.ReadInt32() |> ignore // skip num labels
let loadData path =
use ifsTestLabels = new FileStream(path @@ testLabelsFile, FileMode.Open)
use ifsTestImages = new FileStream(path @@ testImagesFile, FileMode.Open)
use ifsTrainLabels = new FileStream(path @@ trainLabelsFile, FileMode.Open)
use ifsTrainImages = new FileStream(path @@ trainImagesFile, FileMode.Open)
use brTestLabels = new BinaryReader(ifsTestLabels)
use brTestImages = new BinaryReader(ifsTestImages)
use brTrainLabels = new BinaryReader(ifsTrainLabels)
use brTrainImages = new BinaryReader(ifsTrainImages)
skipImages brTestImages
skipLabels brTestLabels
skipImages brTrainImages
skipLabels brTrainLabels
let numTrain = 55000
let numTest = 10000
let numValidation = 60000 - numTrain
let testImages, testLabels = readData brTestImages brTestLabels numTest
let trainImages, trainLabels = readData brTrainImages brTrainLabels numTrain
let validationImages, validationLabels = readData brTrainImages brTrainLabels numValidation
testImages, testLabels, trainImages, trainLabels, validationImages, validationLabels
let path = downLoad url @"data/MNIST/"
let testImages, testLabels, trainImages, trainLabels, validationImages, validationLabels = loadData path
trainImages.GetLength(0), trainImages.GetLength(1), trainLabels.GetLength(0), trainLabels.GetLength(1)
Out[10]:
In [11]:
let imageClass (imageLabels:float32[,]) i =
List.init 10 (fun j -> imageLabels.[i, j]) |> List.findIndex (fun j -> j > 0.0f)
let imageArray2D (images:float32[,]) i =
let npix = 28
let img = Array2D.zeroCreate<float> npix npix
for x = 0 to npix - 1 do
for y = 0 to npix - 1 do
img.[npix - 1 - x, y] <- images.[i, x*npix + y] |> float
img
let imageList (images:float32[,]) i =
imageArray2D images i |> array2DAsLists
let stack npix (images:float[,] list) nx ny =
if images.Length <> nx*ny then failwith "wrong dimensions"
let image = Array2D.zeroCreate (nx*npix) (ny*npix)
for i = 0 to nx - 1 do
for j = 0 to ny - 1 do
for x = 0 to npix - 1 do
for y = 0 to npix - 1 do
image.[i*npix + x, j*npix + y] <- images.[i*ny + j].[x, y]
image
let checkerBoard (images:float32[,]) from nx ny =
let npix = 28
let images = [from..(from+nx*ny-1)] |> List.map (fun i -> imageArray2D images i)
stack npix images nx ny |> array2DAsLists
let heatmap img labels w h =
let layout = Layout(title = sprintf "Image %A" labels)
Heatmap(z = img, colorscale = "YIGnBu")
|> Chart.Plot
|> Chart.WithLayout layout
|> Chart.WithWidth w
|> Chart.WithHeight h
In [12]:
let i = 133
let img = imageList trainImages i
let c = imageClass trainLabels i
heatmap img c 500 500
Out[12]:
In [13]:
let i, nx, ny = 133, 5, 5
let img = checkerBoard trainImages i nx ny
let c = [i..i+nx*nx-1] |> List.map (fun i -> imageClass trainLabels i)
heatmap img c 800 800
Out[13]:
Hamiltonian Monte Carlo (HMC) is a popular Markov Chain Monte Carlo method as it is highly efficent and widly applicable. The idea behind the algorithm is to propose new points by simulating the dynamics fo a particle using a potential engery landscape deduced from the target distribution. The simulation is done using the Hamiltonian dynamics formualtion, which results in several useful properties of the HMC method.
In the Hamiltonian dynamics the state of a physical system is described by a pair $(q,p)$, where $q$ is the position vector and $p$ is the momentum vector. The evolution of the system over time is described by Hamilton's equations
$$ \begin{align} \frac{d q_i}{dt} &= \frac{\partial H}{\partial p_i}, \\ \frac{d p_i}{dt} &= - \frac{\partial H}{\partial q_i}, \end{align} $$where $H(t,q,p)$ is the Hamiltonian of the system.
HMC is based on a frictionless particle governed by the potential energy $U(q)$ and kinetic energy $K(p)$. In this case the Hamiltionian is given by $H(q,p) = U(q) + K(p)$, which is constant in time. The kinetic energy is given by $$ K(p) = \frac{1}{2} p^{\top} M^{-1} p $$ with $M$ the mass matrix.
The key advantage of the Hamiltonian dynamics over other formulations is that solutions of Hamiltion's equations exhibit three important properties:
Reversibility: For every $s > 0$ he mapping $T_s$ from a state $q(t), p(t)$ to a state at time $t+s$ is a bijection. Hence by running time backward we can determine previous states. This is achieved by negating both time derivatives in Hamilton's equations.
Volume preservation: $T_s$ conserves volume in the $(q,p)$ space.
Conservation of the Hamiltonian: We have $\frac{dH(q,p)}{dt} = 0$ hence the Hamiltonian is invariant in time.
All these properties are useful in the application of HMC. However, not all of them can be preserved in numerical solutions. The leap frog method generates numerical solutions which maintain the reversibility and the volume preservation and also approximately converve the Hamiltonian.
Given a step size $\varepsilon$ the leap from method performs discrete updates $$ \begin{align} p_{i,t+1/2} &= p_{i,t} - \frac{\varepsilon}{2} \frac{\partial U}{\partial q}\left(q_{t}\right), \\ q_{i,t+1} &= q_{i,t} + \varepsilon \frac{\partial K}{\partial p}\left(p_{t+1/2}\right), \\ p_{i,t+1} &= p_{i,t+1/2} - \frac{\varepsilon}{2} \frac{\partial U}{\partial q}\left(q_{t+1}\right). \end{align} $$ First a half step for the momentum variables is computed, which is then used for a full position update, followed by a second momentum half step.
To apply the Hamiltonian dynamics we have to specify an appropriate energy function. In statistical mechanics the probability density $f_{target}(q)$ for observing a particle at position $q$ with engergy $E(q)$ is $f(q) \propto \exp(-E(q))$.
Given a target distribution $f_{target}(q)$ we reverse this relationship to find the appropriate potential energy $U(q)$ by $$ U(q) = - \log(f_{target}(q)) $$ where we can drop constants. In particular $f_{target}(q)$ does not need to be a normalized probability distribution. Note that $U(q)$ is just the negative log-likelihood, which directs particle movement towards low negative log-likelihood. With this potential energy the Hamiltonian becomes $$ H(q,p) = U(q) + K(p) $$ where commonly $K(p) = \frac{1}{2} p^{\top} M^{-1} p$. The additive structure of the Hamiltonian implies that the canonical distribution of $(q,p)$ factorizes $$ p(q,p) \propto \exp(H(q,p)) \propto f_{target}(q) \exp(-K(p)). $$
The HMC algorithm works as follows: given a current state $s_{t-1} = (q_{t-1}, p_{t-1})$ do
Note that the proposal distribution is symetric because of the momentum negation in the third step and the reversibiity of the integration method, which is needed in the Metropolis-Hastings acceptance probability calculation.
Let
$$
x_i = (x_{i1}, \ldots, x_{ip})^{\top}, \quad y_i = (y_{i1}, \ldots, y_{ic})^{\top}, y_{ik} \in \{0,1\}
$$
be predictors and class indicators. with
$$
X = (x_1, \ldots, x_n)^{\top} \in \mathbb{R}^{n \times p}, \quad Y = (y_1, \ldots, y_n)^{\top} \in \mathbb{R}^{n \times c}
$$
the matrix of predictors and class indicators. The goal is to classify new observations into one of the $c$ classes. Let
$$
B = (\beta_1, \ldots \beta_c) \in \mathbb{R}^{p \times c}
$$
be the matrix of regression coefficients, one column for each class. The relationship between $x_i$ and $y_i$
is modeled with the softmax function $\psi : \mathbb{R}^{n \times c} \rightarrow \mathbb{R}^{n \times c}$
$$
P(y_{ik} = 1 \mid x_i, B) =
\psi(X B)_{ik} = \frac{\exp\left(x_i^{\top} \beta_k \right)}{\sum_{j=1}^c \exp\left(x_i^{\top} \beta_j\right)},
$$
which transforms each row of $X B$ into a multinomial probability. The log-likelihood for $n$ observations is then
$$
l(B\mid X, Y) = \sum_{i=1}^n \sum_{k=1}^c y_{ik} \log \psi(x_i, B).
$$
For identifiability purpose we set $\beta_c = 0$, which is necessary
because the multinomial probabilities add up to one.
It is common to select a Cauchy distribution as prior for the regression parameters $B$. Then the log posterior kernel is $$ \log(\kappa(B\mid X, Y)) = \sum_{i=1}^n \sum_{k=1}^c y_{ik} \log(\psi(X B)_{ik}) + \sum_{i=1}^n \sum_{k=1}^p - \log (1 + \beta_{ik}^2). $$
This expression can be written entirly in terms of sum reduction, elementwise operation and matrix multiplication. If $\Gamma_{ik} = -\log(1 + \beta_{ik}^2)$. Then $$ \log(\kappa(B\mid X, Y)) = \mathrm{sum}(Y \odot \psi(X B)) + \mathrm{sum}(\Gamma) $$ wiht $\odot$ denoting elementwise multiplication. The same holds for the derivative with respect to the parameters $B$ $$ \frac{\partial \log(\kappa(B\mid X, Y))}{\partial B} = X^{\top} (Y - \psi(XB)) + \Gamma $$ Both expressions can be conveniently implemented on the GPU so that all the values remain on the GPU all the time.
In [14]:
let softmaxCpu numRows numCols (values:float32[]) =
let output = Array.zeroCreate (numRows*numCols)
for row = 0 to numRows-1 do
let mutable output = values
let mutable sum = 0.0f
let mutable maxVal = values.[row]
for col = 0 to numCols - 1 do
let v = values.[col*numRows + row]
if v > maxVal then maxVal <- v
for col = 0 to numCols - 1 do
sum <- sum + exp(values.[col*numRows + row] - maxVal)
for col = 0 to numCols - 1 do
if sum = 0.0f then
printfn "division by zero"
output.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum
output
[<ReflectedDefinition>]
let softmaxKernel32 numRows numCols (values:deviceptr<float32>) (results:deviceptr<float32>) =
let row = blockIdx.y*blockDim.y + threadIdx.y
let mutable results = results
let mutable sum = 0.0f
if row < numRows then
let mutable maxVal = values.[row]
for col = 0 to numCols - 1 do
let v = values.[col*numRows + row]
if v > maxVal then maxVal <- v
for col = 0 to numCols - 1 do
sum <- sum + exp(values.[col*numRows + row] - maxVal)
for col = 0 to numCols - 1 do
results.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum
[<ReflectedDefinition>]
let softmaxKernel numRows numCols (values:deviceptr<float>) (results:deviceptr<float>) =
let row = blockIdx.y*blockDim.y + threadIdx.y
let mutable results = results
let mutable sum = 0.0
if row < numRows then
let mutable maxVal = values.[row]
for col = 0 to numCols - 1 do
let v = values.[col*numRows + row]
if v > maxVal then maxVal <- v
for col = 0 to numCols - 1 do
sum <- sum + exp(values.[col*numRows + row] - maxVal)
for col = 0 to numCols - 1 do
results.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum
[<ReflectedDefinition>]
let argMaxRowKernel numRows numCols (values:deviceptr<float>) (results:deviceptr<float>) =
let row = blockIdx.y*blockDim.y + threadIdx.y
let mutable results = results
if row < numRows then
let mutable maxVal = values.[row]
let mutable argMax = 0
for col = 0 to numCols - 1 do
let v = values.[col*numRows + row]
if v > maxVal then
maxVal <- v
argMax <- col
results.[row] <- float argMax
[<ReflectedDefinition>]
let predictionErrorKernel numRows numCols (y:deviceptr<float>) (prediction:deviceptr<float>) (results:deviceptr<float>) =
let row = blockIdx.y*blockDim.y + threadIdx.y
let mutable results = results
if row < numRows then
let col = prediction.[row] |> int
results.[row] <- 1.0 - y.[col*numRows + row]
[<ReflectedDefinition>]
let setColumnKernel numRows numCols (values:deviceptr<'T>) (results:deviceptr<'T>) col v =
let row = blockIdx.y*blockDim.y + threadIdx.y
let mutable results = results
if row < numRows then
results.[col*numRows + row] <- v
In [15]:
let addIntercept (a:float32[,]) =
let aintercept = Array2D.zeroCreate (a.GetLength 0) (1 + a.GetLength 1)
for i = 0 to a.GetLength(0) - 1 do
for j = 0 to a.GetLength(1) do
aintercept.[i, j] <- if j = 0 then 1.0f else a.[i, j-1]
aintercept
let softmaxf (a:GpuMat<float32>) =
if UseFortranOrder then
let fill (ptr:deviceptr<float32>) =
let blockSize = 64
let block = new dim3(1, blockSize)
let gridSize = divUp a.Rows blockSize
let grid = new dim3(gridSize, gridSize)
let lp = LaunchParam(grid, block)
a.Gpu.Launch(Action<_,_,_,_>softmaxKernel32, lp, a.Rows, a.Cols, a.Ptr, ptr)
let fill = FillFunc fill
new GpuMat<float32>(a.Session, a.Rows, a.Cols, fill)
else
failwith "softmax kernel requires Fortran storage order"
let softmax (a:GpuMat<float>) =
if UseFortranOrder then
let fill (ptr:deviceptr<float>) =
let blockSize = 64
let block = new dim3(1, blockSize)
let gridSize = divUp a.Rows blockSize
let grid = new dim3(gridSize, gridSize)
let lp = LaunchParam(grid, block)
a.Gpu.Launch(Action<_,_,_,_> softmaxKernel, lp, a.Rows, a.Cols, a.Ptr, ptr)
let fill = FillFunc fill
new GpuMat<float>(a.Session, a.Rows, a.Cols, fill)
else
failwith "softmax kernel requires Fortran storage order"
let argMaxRows (a:GpuMat<float>) =
if UseFortranOrder then
let fill (ptr:deviceptr<float>) =
let blockSize = 64
let block = new dim3(1, blockSize)
let gridSize = divUp a.Rows blockSize
let grid = new dim3(gridSize, gridSize)
let lp = LaunchParam(grid, block)
a.Gpu.Launch(Action<_,_,_,_> argMaxRowKernel, lp, a.Rows, a.Cols, a.Ptr, ptr)
let fill = FillFunc fill
new GpuMat<float>(a.Session, a.Rows, a.Cols, fill)
else
failwith "softmax kernel requires Fortran storage order"
let predictionError (y:GpuMat<float>) (prediction:GpuMat<float>) =
if UseFortranOrder then
let fill (ptr:deviceptr<float>) =
let blockSize = 64
let block = new dim3(1, blockSize)
let gridSize = divUp y.Rows blockSize
let grid = new dim3(gridSize, gridSize)
let lp = LaunchParam(grid, block)
y.Gpu.Launch(Action<_,_,_,_,_> predictionErrorKernel, lp, y.Rows, y.Cols, y.Ptr, prediction.Ptr, ptr)
let fill = FillFunc fill
new GpuMat<float>(y.Session, y.Rows, 1, fill)
else
failwith "softmax kernel requires Fortran storage order"
let setColumn (a:GpuMat<'T>) (col:int) v =
if UseFortranOrder then
let fill (ptr:deviceptr<'T>) =
let blockSize = 64
let block = new dim3(1, blockSize)
let gridSize = divUp a.Rows blockSize
let grid = new dim3(gridSize, gridSize)
let lp = LaunchParam(grid, block)
a.Gpu.Launch(Action<_,_,_,_,_,_> setColumnKernel, lp, a.Rows, a.Cols, a.Ptr, ptr, col, v)
let fill = FillFunc fill
new GpuMat<'T>(a.Session, a.Rows, a.Cols, fill)
else
failwith "softmax kernel requires Fortran storage order"
let maskf (a:GpuMat<float32>) = setColumn a (a.Cols-1) 0.0f
let mask (a:GpuMat<float>) = setColumn a (a.Cols-1) 0.0
In [16]:
// beta (p, c)
let cauchyPriorLogDensity (beta:GpuMat<float>) =
sum (-gpuLog (1.0 .+ beta .* beta))
let multinomialLogLik (y:GpuMat<float>) (softmaxVal:GpuMat<float>) =
sum (y .* gpuLog (softmaxVal .+ 1e-12))
let priorGradBeta (beta:GpuMat<float>) =
-2.0 .* beta ./ (1.0 .+ beta .* beta)
let logLikGradBeta (x:GpuMat<float>) (y:GpuMat<float>) (softmaxVal:GpuMat<float>) =
x.T * (y .- softmaxVal)
// x (n, p), y (n, c)
let calc (random:Random) (hX:float[,]) (hY:float[,]) L nBurnIn nSamples epsBurnIn epsSim T annealRate verbose = gpucalc {
let n = hX.GetLength 0
let p = hX.GetLength 1
let c = hY.GetLength 1
let! x = toDevice hX
let! y = toDevice hY
let! beta = randn p c 0.0 0.01
let! momentum = undef p c
let! softmaxVal = undef n c
let! betaAv = zeros p c
let! betaGrad = undef p c
let! logLik = undef 1 1
let! priorLogLik = undef 1 1
let! momSquareSum = undef 1 1
let! betaOld = undef p c
let! softmaxAv = zeros n c
let! predictions = undef n 1
let! error = undef 1 1
let hamiltonianMonteCarloSample eps =
let t = System.Diagnostics.Stopwatch.StartNew()
randnInplace momentum 0.0 1.0
momentum .<- mask momentum
softmaxVal .<- softmax (x * beta)
logLik .<- multinomialLogLik y softmaxVal
priorLogLik .<- cauchyPriorLogDensity beta
momSquareSum .<- sum (momentum .* momentum)
let initLogLik = head logLik
let currentK = head momSquareSum
let currentU = initLogLik + head priorLogLik
betaOld .<- beta
betaGrad .<- (logLikGradBeta x y softmaxVal) .+ (priorGradBeta beta)
betaGrad .<- mask betaGrad
momentum .<- momentum .+ (eps / 2.0) .* betaGrad
for i = 0 to L-1 do
beta .<- beta .+ eps .* momentum
// update gradient
betaGrad .<- (logLikGradBeta x y (softmax (x * beta))) .+ (priorGradBeta beta)
betaGrad .<- mask betaGrad
if i <> L - 1 then
momentum .<- momentum .+ eps .* betaGrad
momentum .<- momentum .+ (eps / 2.0) .* betaGrad
softmaxVal .<- softmax (x * beta)
logLik .<- multinomialLogLik y softmaxVal
priorLogLik .<- cauchyPriorLogDensity beta
let finalLogLik = head logLik
let proposedU = finalLogLik + head priorLogLik
momSquareSum .<- sum (momentum .* momentum)
let proposedK = 0.5 * head momSquareSum
let diff = ((proposedU - proposedK) - (currentU - currentK)) / T
let alpha = max diff 0.0
let u = log(random.NextDouble())
let result, accept = if u < alpha then beta, 1 else betaOld, 0
t.Stop()
if verbose then
printfn "Current value of log-kernel %f" currentU
printfn "Proposed value of log-kernel %f" proposedU
printfn "Current momentum %f" currentK
printfn "Proposed momentum %f " proposedK
printfn "Total diff %A" diff
printfn "Current log likelihood %f" initLogLik
printfn "Proposed log-like %f" finalLogLik
printfn "Alpha %f, uniform random number %f" alpha u
printfn "----- %A -----" t.Elapsed
result, accept
let hamiltonianMonteCarloSimulation () =
let rec burnIn n eps t accepted i =
if i < n then
let beta, accept = hamiltonianMonteCarloSample epsBurnIn
burnIn n eps (1.0 + t*annealRate) (accepted + accept) (i+1)
else
accepted
let acceptedBurnIn = burnIn nBurnIn epsBurnIn T 0 0
let rec sample n eps t accepted (betas:float[,] list) i =
if i < n then
let beta, accept = hamiltonianMonteCarloSample epsSim
betaAv .<- betaAv .+ beta
softmaxVal .<- softmax (x * beta)
softmaxAv .<- softmaxAv .+ softmaxVal
sample n eps t (accepted + accept) ((toHost beta) :: betas) (i+1)
else
List.rev betas, accepted
let betas, accepted = sample nSamples epsSim 1.0 0 [] 0
betaAv .<- betaAv ./ (float nSamples)
softmaxAv .<- softmaxAv ./ (float nSamples)
predictions .<- argMaxRows softmaxAv
error .<- sum (predictionError y predictions)
let error = (head error) / (float y.Rows)
let accuracy = 1.0 - error
(toHost betaAv), betas, (toHost softmaxAv), accepted, error, accuracy
return hamiltonianMonteCarloSimulation()
}
In [33]:
let run () =
let random = Random 42
let verbose = true
let nBurnIn = 100 // burn-in iterations before sampling starts
let nSamples = 100 // number of posterior samples
let L = 100
let epsBurnIn = 1e-4 // step size during burning, typically larger than during sampling
let epsSim = 1e-5
let T = 1000.0
let annealRate = 0.9
let hX = addIntercept testImages
let hY = testLabels
let calc = calc random (hX |> Array2D.map float) (hY |> Array2D.map float) L nBurnIn nSamples epsBurnIn epsSim T annealRate verbose
use session = new Session(Gpu.Default)
let betaAv, betas, softmaxAv, accepted, error, accuracy = gpuRun session calc
betaAv, betas, softmaxAv, accepted, error, accuracy
In [34]:
let betaAv, betas, softmaxAv, accepted, error, accuracy = run()
In [39]:
accepted, accuracy
Out[39]:
In [19]:
let betaForClass (betaAv:float[,]) cls =
let beta = columnVec betaAv cls |> Array.take (betaAv.GetLength(0) - 1) // chop off the intercept
array2DColMajor 28 28 beta
In [37]:
let cls = 1
heatmap (betaForClass betaAv cls) (sprintf "Average Betas %d" cls) 500 500
Out[37]:
In [31]:
let images = List.init (betaAv.GetLength 1) (fun cls -> betaForClass betaAv cls)
let stacked = stack 28 images 5 2 |> array2DAsLists
heatmap stacked "Average Betas" 500 1000
Out[31]:
In [ ]: