In this Jupyter notebook we show how to use the GPU seamlessly in scripting mode with Alea GPU V3. We develop a GPU matrix expression library and use it to perform Hamiltonian Monte Carlo simulation on the GPU. The highlights of the notebook are:
A lightweight GPU matrix expression library capable of elementwise kernel fusion, which minimizes the creation of temporaries and reduces the number of GPU kernel launches (see the sketch after this list)
Using the power of F# computation expressions and operator overloading to get a concise syntax close to mathematical notation
Showing how to develop Hamiltonian Monte Carlo from scratch and apply it to sample the posterior distribution of a multinomial regression model
Illustrating the concepts with the MNIST data set
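To give a flavor of the library developed below, here is what a fused expression looks like (gpucalc, ones, sum and head are all defined later in this notebook):

let calc = gpucalc {
    let! a = ones 1000 1000
    let! b = ones 1000 1000
    // a .+ 2.0 .* b is evaluated lazily and fused into a single
    // elementwise kernel; no temporary is materialized for 2.0 .* b
    return head (sum (a .+ 2.0 .* b))
}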
Bayesian inference requires algorithms capable of fitting models with thousands of parameters and complex nonlinearities. Hamiltonian Monte Carlo, introduced by Duane, Kennedy, Pendleton and Roweth in 1987, has proven to be extremely successful. Hamiltonian Monte Carlo (HMC) is a variation of the Markov Chain Monte Carlo (MCMC) methods which uses the gradient of the log posterior density to select the step direction and size. Therefore, HMC preferentially chooses directions of increasing probability.
There are several papers explaining HMC in detail.
Before you continue you should install the F# kernel for Jupyter.
We use Paket to get the newest 3.0.3-beta2 NuGet packages of Alea GPU. You can run Alea GPU for free on any CUDA-capable GeForce or Quadro GPU. In case you want to run it on an enterprise GPU such as a Tesla K80 or the brand new NVIDIA P100, you need a license.
Unfortunately, as of now (January 2017), it is not possible to run this notebook on Azure Notebooks with server-side execution because the Azure Notebooks service does not yet provide GPU instances.
In [1]:
    
#load "Paket.fsx"
Paket.Version [ ("Alea", "3.0.3-beta2") ]
Paket.Package [ "NUnit" ]
    
    Out[1]:
In [2]:
    
#load "packages/Alea/Alea.fsx"
#r "packages/Alea/lib/net45/Alea.Parallel.dll"
#r "packages/NUnit/lib/net45/nunit.framework.dll"
    
In [3]:
    
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly
    
    
In [4]:
    
open System
open System.IO
open System.IO.Compression
open System.Linq
open System.Threading.Tasks
open System.Net
open System.Diagnostics
open System.Collections.Generic
open Microsoft.FSharp.Quotations
open Alea
open Alea.CSharp
open Alea.Parallel
open Alea.cuBLAS
open Alea.cuRAND
let gpu = Gpu.Default
let s = sprintf "GPU is %A, Number of Cores %A, GPU DRAM is %.3f GB, Process is %d bit"
           gpu
           gpu.Device.Cores
           ((float) gpu.Device.TotalMemory / 1024. / 1024. / 1024.)
           (IntPtr.Size * 8)
           
{ Html = s }
    
    Out[4]:
In [5]:
    
let divUp num den = (num + den - 1) / den
let array2DAsLists (a:'T[,]) =
    List.init (a.GetLength 0) (fun i -> List.init (a.GetLength 1) (fun j -> a.[i, j]))
// flatten a 2D array to an array in column major storage order
let flattenArray2D (a:'T[,]) =
    let numRows = a.GetLength 0
    let numCols = a.GetLength 1
    let aflat = Array.zeroCreate (numRows*numCols)
    let mutable k = 0
    for j = 0 to numCols - 1 do
        for i = 0 to numRows - 1 do
            aflat.[k] <- a.[i, j]
            k <- k + 1
    aflat
// get a column 
let columnVec (a:'T[,]) col =
    let numRows = a.GetLength 0
    let columnVec = Array.zeroCreate numRows
    for i = 0 to numRows - 1 do
        columnVec.[i] <- a.[i, col]
    columnVec
    
// reconstruct 2D array from a flat array in column major storage order 
let array2DColMajor numRows numCols (aflat:'T[]) =
    let a = Array2D.zeroCreate numRows numCols
    let mutable k = 0
    for j = 0 to numCols - 1 do
        for i = 0 to numRows - 1 do
            a.[i, j] <- aflat.[k]
            k <- k + 1
    a
// reconstruct 2D array from a flat array in row major storage order 
let array2DRowMajor numRows numCols (aflat:'T[]) =
    let a = Array2D.zeroCreate numRows numCols
    let mutable k = 0
    for i = 0 to numRows - 1 do
        for j = 0 to numCols - 1 do
            a.[i, j] <- aflat.[k]
            k <- k + 1
    a
    
// matrix product a*b with a of dimension m x k, b of dimension k x n, c of dimension m x n, all in Fortran column major storage
let inline multiplyCpu m n k (a:'T[]) (b:'T[]) =
    let lda = m
    let ldb = k
    let ldc = m
    let c = Array.zeroCreate (m*n)
    for row = 0 to m - 1 do
        for col = 0 to n - 1 do
            let mutable sum = LanguagePrimitives.GenericZero
            for l = 0 to k - 1 do 
                sum <- sum + a.[row + l*lda] * b.[l + col*ldb]
            c.[row + col*ldc] <- sum
    c
// matrix product a^t*b with a of dimension k x m, b of dimension k x n, c of dimension m x n, all in Fortran column major storage
let inline multiplyTranspCpu m n k (a:'T[]) (b:'T[]) =
    let lda = k
    let ldb = k
    let ldc = m
    let c = Array.zeroCreate (m*n)
    for row = 0 to m - 1 do
        for col = 0 to n - 1 do
            let mutable sum = LanguagePrimitives.GenericZero
            for l = 0 to k - 1 do 
                sum <- sum + a.[l + row*lda] * b.[l + col*ldb]
            c.[row + col*ldc] <- sum
    c
    
// transpose a matrix in Fortran column major storage format
let inline transpose numRows numCols (aflat:'T[]) =
    let atflat = Array.zeroCreate (numRows*numCols)
    for i = 0 to numRows - 1 do 
        for j = 0 to numCols - 1 do
            atflat.[i*numCols + j] <- aflat.[j*numRows + i] 
    atflat
let inline error (a:'T[]) (b:'T[]) = Array.map2 (fun a b -> abs (a - b)) a b |> Array.max
    
In [6]:
    
let [<Literal>] UseFortranOrder = true
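// Session bundles a cuBLAS handle, a cuRAND generator and a simple memory pool:
// released device buffers go into a bin sorted by size and are reused by later
// allocations, avoiding repeated device allocations and frees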
type Session(gpu:Gpu) =
    inherit DisposableObject()
    let session = new Alea.Session(gpu)
    let blas = new cuBLAS.Blas(gpu)
    let rand = cuRAND.Generator.CreateGpu(gpu, cuRAND.RngType.PSEUDO_DEFAULT)
    let memoryPoolAll = List<DeviceMemory<byte>>()
    let memoryPoolBin = List<DeviceMemory<byte>>()
    let mutable disposed = false
    override this.Dispose(disposing) =
        if disposing then
            lock this <| fun _ ->
                if not disposed then
                    memoryPoolAll |> Seq.iter (fun memory -> memory.Dispose())
                    memoryPoolAll.Clear()
                    memoryPoolBin.Clear()
                    blas.Dispose()
                    rand.Dispose()
                    session.Dispose()
                    disposed <- true
        base.Dispose(disposing)
    member this.Gpu = gpu
    member this.Session = session
    member this.BLAS = blas
    member this.RAND = rand
    member this.Allocate<'T>(size:int) =
        lock this <| fun _ ->
            if not disposed then
                let size = (nativeint size) * (nativeint (Gpu.SizeOf<'T>()))
                let idx = memoryPoolBin.FindIndex(fun memory -> memory.Size >= size)
                if idx < 0 then // not found
                    let memory = gpu.AllocateDevice<byte>(size)
                    memoryPoolAll.Add(memory)
                    deviceptr<'T>(memory.Ptr.Handle)
                else
                    let memory = memoryPoolBin.[idx]
                    memoryPoolBin.RemoveAt(idx)
                    deviceptr<'T>(memory.Ptr.Handle)
            else failwith "Session is disposed!"
    member this.Release(ptr:deviceptr<'T>) =
        lock this <| fun _ ->
            if not disposed then
                let idx = memoryPoolAll.FindIndex(fun memory -> memory.Handle = ptr.Handle)
                if idx >= 0 then // found
                    let memory = memoryPoolAll.[idx]
                    let compare = System.Comparison<DeviceMemory<byte>>
                                    (fun memory1 memory2 ->
                                        let x = memory1.Size
                                        let y = memory2.Size
                                        if x < y then -1 elif x > y then 1 else 0)
                    memoryPoolBin.Add(memory)
                    memoryPoolBin.Sort(compare)
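// Calculation<'T> wraps a function Session -> 'T; the CalculationBuilder below
// turns it into the gpucalc computation expression, so the GPU session is
// threaded implicitly through let! bindings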
type Calculation<'T>(f:Session -> 'T) =
    member this.Invoke b = f b
type CalculationBuilder() =
    member this.Bind(x:Calculation<'A>, res:'A -> Calculation<'B>) =
        Calculation(fun b -> let r = x.Invoke(b) in res(r).Invoke(b))
    member this.Return(x:'A) = Calculation(fun b -> x)
    member this.ReturnFrom(x:'A) = x
    member this.Zero() = Calculation(fun b -> ())
    member this.For(elements:seq<'A>, forBody:'A -> Calculation<unit>) =
        Calculation(fun b -> elements |> Seq.iter (fun e -> (forBody e).Invoke(b); GC.Collect(); GC.WaitForPendingFinalizers()))
    member this.Combine(partOne:Calculation<unit>, partTwo:Calculation<'A>) =
        Calculation(fun b -> partOne.Invoke(b); partTwo.Invoke(b))
    member this.Delay(restOfComputation:unit -> Calculation<'A>) =
        Calculation(fun b -> restOfComputation().Invoke(b))
    member this.Using<'A, 'B when 'A :> IDisposable>(x:'A, res:'A -> Calculation<'B>) =
        Calculation(fun b -> try res(x).Invoke(b) finally x.Dispose())
let gpuRun<'T> session (calculation:Calculation<'T>) =
    calculation.Invoke session 
let gpucalc = CalculationBuilder()
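// GpuMatFill describes how a matrix obtains its values: Elementwise carries an
// emit function (flat index -> value), so chains of elementwise operations are
// fused into a single kernel, and device memory is only allocated once .Ptr
// is actually demanded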
type GpuMatFill<'T> =
    | Undefined
    | PreAllocated of dmem:DeviceMemory<'T>
    | FillFunc of fill:(deviceptr<'T> -> unit)
    | Elementwise of emit:Func<int, 'T>
    | Transpose of mat:GpuMat<'T>
    member this.Fill (session:Alea.Session) (rows:int) (cols:int) (ptr:deviceptr<'T>) =
        match this with
        | Undefined -> ()
        | PreAllocated _ -> ()
        | FillFunc fill -> fill ptr
        | Elementwise emit -> session.For(0, rows*cols, (fun i -> ptr.Set(i, emit.Invoke i)))
        | Transpose mat -> failwith "Transpose is TODO."
and GpuMat<'T>(session:Session, rows:int, cols:int, initFill:GpuMatFill<'T>) =
    inherit DisposableObject()
    let size = rows * cols
    let mutable _fill = initFill
    let mutable ptrOpt =
        match _fill with
        | PreAllocated memory -> Some memory.Ptr
        | _ -> None
    override this.Dispose(disposing) =
        match ptrOpt with 
        | Some ptr -> 
            session.Release(ptr)
            ptrOpt <- None
        | _ -> ()
        GC.KeepAlive(initFill)
        base.Dispose(disposing)
    member inline this.HasMemory = ptrOpt.IsSome
    member inline this.Fill (ptr:deviceptr<'T>) =
        _fill.Fill session.Session rows cols ptr
    member inline this.Ptr =
        match ptrOpt with
        | Some ptr -> ptr
        | None ->
            let ptr = session.Allocate<'T>(size)
            this.Fill ptr
            // this memory is owned by this mat, so we change the fill method
            _fill <- Elementwise (Func<_,_> (fun i -> ptr.[i])) 
            ptrOpt <- Some ptr
            ptr
    member inline this.ElementwiseEmit (targetRows:int) (targetCols:int) =
        if rows = targetRows && cols = targetCols then            
            match _fill with
            | Elementwise emit -> emit
            | _ -> 
                let ptr = this.Ptr
                Func<_,_> (fun i -> ptr.[i])
        elif rows = 1 && cols = 1 then
            match _fill with
            | Elementwise emit -> Func<_,_> (fun i -> emit.Invoke 0)
            | _ ->
                let ptr = this.Ptr
                Func<_,_> (fun i -> ptr.[0])
        else
            failwithf "Broadcast from (%d,%d) to (%d,%d) isn't supported." rows cols targetRows targetCols
    member inline this.MatrixForBlas =
        match UseFortranOrder with
        | true ->
            match _fill with
            | Transpose mat -> mat.Ptr, cuBLAS.Operation.T, this.Cols
            | _ -> this.Ptr, cuBLAS.Operation.N, this.Rows
        | false ->
            failwith "TODO: Add a kernel to do transpose"
    member this.Session = session
    member this.Gpu = session.Gpu
    member this.Rows = rows
    member this.Cols = cols
    member this.Size = size
    member this.T =
        let fill = Transpose this
        new GpuMat<'T>(session, cols, rows, fill)
    member this.ToHost() =
        let ptr = this.Ptr
        if not UseFortranOrder || rows = 1 || cols = 1 then
            let array = Array2D.zeroCreate<'T> rows cols
            Gpu.Copy(session.Gpu, ptr, array, 0L, int64 size)
            array
        else
            // we need to transpose the matrix to get C storage order for the 2D host array
            let array = Array2D.zeroCreate<'T> cols rows
            Gpu.Copy(session.Gpu, ptr, array, 0L, int64 size)
            Array2D.init rows cols (fun row col -> array.[col, row])
    static member inline (.<-) (lhs:GpuMat<float>, rhs:GpuMat<float>) =
        rhs.Fill lhs.Ptr
    static member inline ElementwiseTransform(a:GpuMat<'T>, op:Func<'T, 'T>) =
        let session = a.Session
        let rows = a.Rows
        let cols = a.Cols
        let emitA = a.ElementwiseEmit rows cols
        let fill = Elementwise (Func<_,_> (fun i ->
            let mutable a = emitA.Invoke i
            let result = op.Invoke(a)
            // workaround to avoid tail instruction
            a <- __default_value<'T>()
            result))
        new GpuMat<'T>(session, rows, cols, fill)
    static member inline ElementwiseTransform(a:GpuMat<'T>, b:GpuMat<'T>, op:Func<'T, 'T, 'T>) =
        let session = a.Session
        let rows = max a.Rows b.Rows
        let cols = max a.Cols b.Cols
        let emitA = a.ElementwiseEmit rows cols
        let emitB = b.ElementwiseEmit rows cols
        let fill = Elementwise (Func<_,_> (fun i ->
            let mutable a = emitA.Invoke i
            let mutable b = emitB.Invoke i
            let result = op.Invoke(a, b)
            // workaround to avoid tail instruction
            a <- __default_value<'T>()
            b <- __default_value<'T>()
            result))
        new GpuMat<'T>(session, rows, cols, fill)
    static member inline ElementwiseTransform(a:GpuMat<'T>, b:'T, op:Func<'T, 'T, 'T>) =
        let session = a.Session
        let rows = a.Rows
        let cols = a.Cols
        let emitA = a.ElementwiseEmit rows cols
        let fill = Elementwise (Func<_,_> (fun i ->
            let mutable a = emitA.Invoke i
            let mutable b = b
            let result = op.Invoke(a, b)
            // workaround to avoid tail instruction
            a <- __default_value<'T>()
            b <- __default_value<'T>()
            result))
        new GpuMat<'T>(session, rows, cols, fill)
    static member inline ElementwiseTransform(a:'T, b:GpuMat<'T>, op:Func<'T, 'T, 'T>) =
        let session = b.Session
        let rows = b.Rows
        let cols = b.Cols
        let emitB = b.ElementwiseEmit rows cols
        let fill = Elementwise (Func<_,_> (fun i ->
            let mutable a = a
            let mutable b = emitB.Invoke i
            let result = op.Invoke(a, b)
            // workaround to avoid tail instruction
            a <- __default_value<'T>()
            b <- __default_value<'T>()
            result))
        new GpuMat<'T>(session, rows, cols, fill)
    static member (~-) (a:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, (fun a -> -a))
    static member (.+) (a:GpuMat<float>, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))
    static member (.+) (a:float, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))
    static member (.+) (a:GpuMat<float>, b:float) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a + b))
    static member (.-) (a:GpuMat<float>, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a - b))
    static member (.-) (a:float, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a - b))
    static member (.*) (a:GpuMat<float>, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a * b))
    static member (.*) (a:float, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a * b))
    static member (./) (a:GpuMat<float>, b:GpuMat<float>) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a / b))
    static member (./) (a:GpuMat<float>, b:float) =
        GpuMat<float>.ElementwiseTransform(a, b, (fun a b -> a / b))
    static member ( * ) (a:GpuMat<float>, b:GpuMat<float>) =
        let session = a.Session
        let rowsA = a.Rows
        let rowsB = b.Rows
        let rowsC = rowsA
        let colsA = a.Cols
        let colsB = b.Cols
        let colsC = colsB
        let fill = 
            match UseFortranOrder with
            | true ->
                FillFunc <| fun (cptr:deviceptr<float>) ->
                    let ldc = rowsC
                    let aptr, opA, lda = a.MatrixForBlas
                    let bptr, opB, ldb = b.MatrixForBlas
                    session.BLAS.Gemm(opA, opB, rowsC, colsC, rowsB, 1.0, aptr, lda, bptr, ldb, 0.0, cptr, ldc)
                    GC.KeepAlive(a)
                    GC.KeepAlive(b)
            | false ->
                failwith "TODO"
        new GpuMat<float>(session, rowsC, colsC, fill)
let undef (rows:int) (cols:int) =
    Calculation<GpuMat<float>> <| fun session ->
        new GpuMat<float>(session, rows, cols, Undefined)
let zeros (rows:int) (cols:int) =
    Calculation<GpuMat<float>> <| fun session ->
        let fill = Elementwise (Func<_,_> (fun i -> 0.0))
        new GpuMat<float>(session, rows, cols, fill)
let ones (rows:int) (cols:int) =
    Calculation<GpuMat<float>> <| fun session ->
        let fill = Elementwise (Func<_,_> (fun i -> 1.0))
        new GpuMat<float>(session, rows, cols, fill)
let map (gmat:GpuMat<'T>) (op:Func<'T, 'T>) =
    GpuMat<'T>.ElementwiseTransform(gmat, op)
let gpuLog (gmat:GpuMat<float>) =
    GpuMat<float>.ElementwiseTransform(gmat, Func<_,_> log)
let reduce (mat:GpuMat<'T>) (op:Func<'T, 'T, 'T>) =
    let session = mat.Session
    let fill = GpuMatFill<'T>.FillFunc <| fun (ptr:deviceptr<'T>) ->
        let emit = mat.ElementwiseEmit mat.Rows mat.Cols
        session.Session.Aggregate(mat.Size, emit, (fun v -> ptr.Set(0, v)), op)
        GC.KeepAlive(mat)
    new GpuMat<'T>(session, 1, 1, fill)
let sum (mat:GpuMat<float>) =
    reduce mat (Func<_,_,_> (+))
let randn (rows:int) (cols:int) (mean:float) (stddev:float) =
    Calculation<GpuMat<float>> <| fun session ->
        let fill = FillFunc <| fun (ptr:deviceptr<float>) ->
            session.RAND.GenerateNormal(ptr, (rows*cols) |> uint64, mean, stddev)
        new GpuMat<float>(session, rows, cols, fill)
let randnInplace (mat:GpuMat<float>) (mean:float) (stddev:float) =
    let session = mat.Session
    session.RAND.GenerateNormal(mat.Ptr, mat.Size |> uint64, mean, stddev)
let session () =
    Calculation<Session> <| fun session -> session
let toHost (mat:GpuMat<'T>) =
    mat.ToHost()
let head (mat:GpuMat<'T>) = 
    mat.ToHost().[0,0]
let toDevice (host:'T[,]) =
    let rows = host.GetLength 0
    let cols = host.GetLength 1
    Calculation<GpuMat<'T>> <| fun session ->
        match UseFortranOrder with
        | true ->
            let flattenedArray2D =
                let aflat = Array.zeroCreate (rows*cols)
                let mutable k = 0
                for j = 0 to cols - 1 do
                    for i = 0 to rows - 1 do
                        aflat.[k] <- host.[i, j]
                        k <- k + 1
                aflat
            let memory = session.Gpu.AllocateDevice(flattenedArray2D)
            let fill = PreAllocated memory
            new GpuMat<'T>(session, rows, cols, fill)
        | false -> 
            failwith "TODO for C order"
    
In [7]:
    
let calc = gpucalc {
    let! a = ones 4 5
    let! b = ones 3 5
    let c = a * b.T
    return (toHost c) 
}
let session = new Session(Gpu.Default)
let result = calc |> gpuRun session 
result
    
    Out[7]:
In [8]:
    
let n = 100
let m = 50
let k = 75
let calc = gpucalc {
    let! A = randn m k 0.0 1.0
    let! B = randn k n 0.0 1.0
    let! C = undef m n
    C .<- A * B
    let hC = toHost C |> flattenArray2D
    let hhC = multiplyCpu m n k (toHost A |> flattenArray2D) (toHost B |> flattenArray2D)
    return (error hC hhC)
}
let session = new Session(Gpu.Default)
let err = calc |> gpuRun session 
err
    
    Out[8]:
In [9]:
    
let n = 100
let m = 50
let k = 75
let calc = gpucalc {
    let! A = randn k m 0.0 1.0
    let! B = randn k n 0.0 1.0
    let! C = undef m n
    C .<- A.T * B
    let hC = toHost C |> flattenArray2D
    let hhC = multiplyTranspCpu m n k (toHost A |> flattenArray2D) (toHost B |> flattenArray2D)
    return (error hC hhC)
}
let session = new Session(Gpu.Default)
let err = calc |> gpuRun session 
err
    
    Out[9]:
In [10]:
    
let url = @"http://yann.lecun.com/exdb/mnist/"
let trainImagesFile = @"train-images-idx3-ubyte"
let trainLabelsFile = @"train-labels-idx1-ubyte"
let testImagesFile = @"t10k-images-idx3-ubyte"
let testLabelsFile = @"t10k-labels-idx1-ubyte"
let (@@) a b = System.IO.Path.Combine(a, b)
let decompress fileName =
    let fileToDecompress = new FileInfo(fileName + ".gz")  
    use originalFileStream = fileToDecompress.OpenRead()
    use decompressedFileStream = File.Create(fileName)
    use decompressionStream = new GZipStream(originalFileStream, CompressionMode.Decompress)
    decompressionStream.CopyTo(decompressedFileStream)
let get url file downloadTo =
    use client = new WebClient()
    client.DownloadFile(url + file + ".gz", (downloadTo @@ file) + ".gz")
    decompress (downloadTo @@ file)
let downLoad url downloadTo =
    let files = [trainImagesFile; trainLabelsFile; testImagesFile; testLabelsFile]
    Directory.CreateDirectory downloadTo |> ignore
    files |> List.iter (fun file -> if not <| File.Exists(downloadTo @@ file) then get url file downloadTo)
    downloadTo
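// each MNIST image is 28 x 28 = 784 grayscale bytes, normalized to [0,1];
// labels are one-hot encoded over the 10 digit classes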
let readData (brImages:BinaryReader) (brLabels:BinaryReader) numSamples =
    let npix, nclasses = 28, 10
    let images = Array2D.zeroCreate<float32> numSamples (npix*npix)
    let labels = Array2D.zeroCreate<float32> numSamples nclasses
    for i = 0 to numSamples - 1 do
        for x = 0 to npix - 1 do 
            for y = 0 to npix - 1 do 
                images.[i, x*npix + y] <- (brImages.ReadByte() |> float32)/255.0f
        labels.[i, brLabels.ReadByte() |> int] <- 1.0f
    images, labels
let skipImages (brImages:BinaryReader) =
    brImages.ReadInt32() |> ignore // skip magic
    brImages.ReadInt32() |> ignore // skip num images
    brImages.ReadInt32() |> ignore // skip rows
    brImages.ReadInt32() |> ignore // skip cols
let skipLabels (brLabels:BinaryReader) =
    brLabels.ReadInt32() |> ignore // skip magic
    brLabels.ReadInt32() |> ignore // skip num labels
let loadData path =
    use ifsTestLabels = new FileStream(path @@ testLabelsFile, FileMode.Open)
    use ifsTestImages = new FileStream(path @@ testImagesFile, FileMode.Open)
    use ifsTrainLabels = new FileStream(path @@ trainLabelsFile, FileMode.Open)
    use ifsTrainImages = new FileStream(path @@ trainImagesFile, FileMode.Open)
    use brTestLabels = new BinaryReader(ifsTestLabels)
    use brTestImages = new BinaryReader(ifsTestImages)
    use brTrainLabels = new BinaryReader(ifsTrainLabels)
    use brTrainImages = new BinaryReader(ifsTrainImages)          
    skipImages brTestImages
    skipLabels brTestLabels
    skipImages brTrainImages
    skipLabels brTrainLabels
    let numTrain = 55000
    let numTest = 10000
    let numValidation = 60000 - numTrain
    let testImages, testLabels = readData brTestImages brTestLabels numTest
    let trainImages, trainLabels = readData brTrainImages brTrainLabels numTrain
    let validationImages, validationLabels = readData brTrainImages brTrainLabels numValidation
    testImages, testLabels, trainImages, trainLabels, validationImages, validationLabels
    
let path = downLoad url @"data/MNIST/"
let testImages, testLabels, trainImages, trainLabels, validationImages, validationLabels = loadData path
trainImages.GetLength(0), trainImages.GetLength(1), trainLabels.GetLength(0), trainLabels.GetLength(1)
    
    Out[10]:
In [11]:
    
let imageClass (imageLabels:float32[,]) i =
    List.init 10 (fun j -> imageLabels.[i, j]) |> List.findIndex (fun j -> j > 0.0f)
let imageArray2D (images:float32[,]) i =
    let npix = 28
    let img = Array2D.zeroCreate<float> npix npix
    for x = 0 to npix - 1 do 
        for y = 0 to npix - 1 do 
            img.[npix - 1 - x, y] <- images.[i, x*npix + y] |> float
    img
let imageList (images:float32[,]) i = 
    imageArray2D images i |> array2DAsLists
let stack npix (images:float[,] list) nx ny =
    if images.Length <> nx*ny then failwith "wrong dimensions"
    let image = Array2D.zeroCreate (nx*npix) (ny*npix)
    for i = 0 to nx - 1 do
        for j = 0 to ny - 1 do 
            for x = 0 to npix - 1 do 
                for y = 0 to npix - 1 do
                    image.[i*npix + x, j*npix + y] <- images.[i*ny + j].[x, y]
    image
    
let checkerBoard (images:float32[,]) from nx ny =
    let npix = 28
    let images = [from..(from+nx*ny-1)] |> List.map (fun i -> imageArray2D images i)
    stack npix images nx ny |> array2DAsLists
    
let heatmap img labels w h =
    let layout = Layout(title = sprintf "Image %A" labels)
    Heatmap(z = img, colorscale = "YIGnBu")
    |> Chart.Plot
    |> Chart.WithLayout layout
    |> Chart.WithWidth w
    |> Chart.WithHeight h
    
In [12]:
    
let i = 133
let img = imageList trainImages i
let c = imageClass trainLabels i
heatmap img c 500 500
    
    Out[12]:
In [13]:
    
let i, nx, ny = 133, 5, 5
let img = checkerBoard trainImages i nx ny
let c = [i..i+nx*ny-1] |> List.map (fun i -> imageClass trainLabels i)
heatmap img c 800 800
    
    Out[13]:
Hamiltonian Monte Carlo (HMC) is a popular Markov Chain Monte Carlo method as it is highly efficient and widely applicable. The idea behind the algorithm is to propose new points by simulating the dynamics of a particle in a potential energy landscape deduced from the target distribution. The simulation is done using the Hamiltonian dynamics formulation, which results in several useful properties of the HMC method.
In the Hamiltonian dynamics the state of a physical system is described by a pair $(q,p)$, where $q$ is the position vector and $p$ is the momentum vector. The evolution of the system over time is described by Hamilton's equations
$$ \begin{align} \frac{d q_i}{dt} &= \frac{\partial H}{\partial p_i}, \\ \frac{d p_i}{dt} &= - \frac{\partial H}{\partial q_i}, \end{align} $$where $H(t,q,p)$ is the Hamiltonian of the system.
HMC is based on a frictionless particle governed by the potential energy $U(q)$ and kinetic energy $K(p)$. In this case the Hamiltonian is given by $H(q,p) = U(q) + K(p)$, which is constant in time. The kinetic energy is given by $$ K(p) = \frac{1}{2} p^{\top} M^{-1} p $$ with $M$ the mass matrix.
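As a concrete example, for a one-dimensional particle with unit mass, $U(q) = \frac{1}{2} q^2$ and $K(p) = \frac{1}{2} p^2$, Hamilton's equations read $$ \frac{dq}{dt} = p, \qquad \frac{dp}{dt} = -q, $$ and the solutions $q(t) = r \cos(t + \varphi)$, $p(t) = -r \sin(t + \varphi)$ trace circles in $(q,p)$ space along which $H = \frac{1}{2}(q^2 + p^2) = \frac{1}{2} r^2$ stays constant.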
The key advantage of the Hamiltonian dynamics over other formulations is that solutions of Hamilton's equations exhibit three important properties:
Reversibility: For every $s > 0$ the mapping $T_s$ from a state $(q(t), p(t))$ to the state at time $t+s$ is a bijection. Hence by running time backward we can determine previous states. This is achieved by negating both time derivatives in Hamilton's equations.
Volume preservation: $T_s$ conserves volume in the $(q,p)$ space.
Conservation of the Hamiltonian: We have $\frac{dH(q,p)}{dt} = 0$ hence the Hamiltonian is invariant in time.
All these properties are useful in the application of HMC. However, not all of them can be preserved in numerical solutions. The leapfrog method generates numerical solutions which maintain reversibility and volume preservation and also approximately conserve the Hamiltonian.
Given a step size $\varepsilon$ the leapfrog method performs the discrete updates $$ \begin{align} p_{i,t+1/2} &= p_{i,t} - \frac{\varepsilon}{2} \frac{\partial U}{\partial q_i}\left(q_{t}\right), \\ q_{i,t+1} &= q_{i,t} + \varepsilon \frac{\partial K}{\partial p_i}\left(p_{t+1/2}\right), \\ p_{i,t+1} &= p_{i,t+1/2} - \frac{\varepsilon}{2} \frac{\partial U}{\partial q_i}\left(q_{t+1}\right). \end{align} $$ First a half step for the momentum variables is computed, which is then used for a full position update, followed by a second momentum half step.
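A minimal CPU sketch of one leapfrog step for a scalar state, assuming unit mass so that $\partial K/\partial p = p$ (leapfrogStep and gradU are illustrative names, not part of the library):

// one leapfrog step for a scalar state, assuming unit mass so that dK/dp = p
let leapfrogStep (gradU:float -> float) (eps:float) (q:float) (p:float) =
    let pHalf = p - eps / 2.0 * gradU q        // momentum half step
    let q' = q + eps * pHalf                   // full position step
    let p' = pHalf - eps / 2.0 * gradU q'      // second momentum half step
    q', p'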
To apply Hamiltonian dynamics we have to specify an appropriate energy function. In statistical mechanics the probability density for observing a particle at position $q$ with energy $E(q)$ is $f(q) \propto \exp(-E(q))$.
Given a target distribution $f_{target}(q)$ we reverse this relationship to find the appropriate potential energy $U(q)$ by $$ U(q) = - \log(f_{target}(q)), $$ where we can drop constants. In particular $f_{target}(q)$ does not need to be a normalized probability distribution. Note that $U(q)$ is just the negative log-likelihood, so the particle is driven towards regions of low negative log-likelihood, i.e. high probability. With this potential energy the Hamiltonian becomes $$ H(q,p) = U(q) + K(p) $$ where commonly $K(p) = \frac{1}{2} p^{\top} M^{-1} p$. The additive structure of the Hamiltonian implies that the canonical distribution of $(q,p)$ factorizes: $$ p(q,p) \propto \exp(-H(q,p)) \propto f_{target}(q) \exp(-K(p)). $$
The HMC algorithm works as follows: given a current state $s_{t-1} = (q_{t-1}, p_{t-1})$ do
1. Draw a new momentum $p$ from the kinetic energy distribution $\propto \exp(-K(p))$, keeping the position $q = q_{t-1}$.
2. Starting from $(q, p)$, simulate the Hamiltonian dynamics for $L$ leapfrog steps of size $\varepsilon$ to obtain a proposal $(q^*, p^*)$.
3. Negate the momentum, $p^* \leftarrow -p^*$.
4. Accept $(q^*, p^*)$ as the new state $s_t$ with the Metropolis probability $\min\left(1, \exp\left(H(q, p) - H(q^*, p^*)\right)\right)$; otherwise keep $s_t = s_{t-1}$.
Note that the proposal distribution is symmetric because of the momentum negation in the third step and the reversibility of the integration method, which is needed in the Metropolis-Hastings acceptance probability calculation.
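Putting these steps together, here is a minimal CPU sketch of the sampler for a one-dimensional target (hmcStep, potU and gradU are illustrative names, not part of the library; the GPU version developed below follows the same pattern, and the explicit momentum negation is dropped because $K(p) = p^2/2$ is symmetric):

// minimal one-dimensional HMC step: potU is the potential energy -log f_target,
// gradU its derivative
let hmcStep (random:Random) (potU:float -> float) (gradU:float -> float) eps steps q =
    // step 1: draw a fresh Gaussian momentum (Box-Muller)
    let p0 = sqrt(-2.0 * log (random.NextDouble())) * cos (2.0 * Math.PI * random.NextDouble())
    // step 2: simulate the dynamics with leapfrog steps
    let mutable qNew = q
    let mutable p = p0 - eps / 2.0 * gradU qNew
    for i = 1 to steps do
        qNew <- qNew + eps * p
        if i < steps then p <- p - eps * gradU qNew
    p <- p - eps / 2.0 * gradU qNew
    // steps 3 and 4: the Metropolis test on the change in total energy H = U + K
    let diff = (potU q + 0.5 * p0 * p0) - (potU qNew + 0.5 * p * p)
    if log (random.NextDouble()) < min diff 0.0 then qNew else q

For a standard normal target one would pass potU = fun q -> 0.5 * q * q and gradU = fun q -> q.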
Let 
$$
x_i = (x_{i1}, \ldots, x_{ip})^{\top}, \quad y_i = (y_{i1}, \ldots, y_{ic})^{\top}, y_{ik} \in \{0,1\} 
$$
be predictors and class indicators, with
$$
X = (x_1, \ldots, x_n)^{\top} \in \mathbb{R}^{n \times p}, \quad Y = (y_1, \ldots, y_n)^{\top} \in \mathbb{R}^{n \times c}
$$
the matrix of predictors and class indicators. The goal is to classify new observations into one of the $c$ classes. Let
$$
B = (\beta_1, \ldots, \beta_c) \in \mathbb{R}^{p \times c}
$$ 
be the matrix of regression coefficients, one column for each class. The relationship between $x_i$ and $y_i$ 
is modeled with the softmax function $\psi : \mathbb{R}^{n \times c} \rightarrow \mathbb{R}^{n \times c}$ 
$$
P(y_{ik} = 1 \mid x_i, B) = 
    \psi(X B)_{ik} = \frac{\exp\left(x_i^{\top} \beta_k \right)}{\sum_{j=1}^c \exp\left(x_i^{\top} \beta_j\right)},
$$
which transforms each row of $X B$ into a multinomial probability. The log-likelihood for $n$ observations is then
$$
l(B\mid X, Y) = \sum_{i=1}^n \sum_{k=1}^c y_{ik} \log \psi(X B)_{ik}.
$$
For identifiability purposes we set $\beta_c = 0$, which is necessary
because the multinomial probabilities add up to one.
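For $c = 2$ classes this reduces to ordinary logistic regression: with $\beta_2 = 0$ we get $P(y_{i1} = 1 \mid x_i, B) = \exp(x_i^{\top} \beta_1) / (1 + \exp(x_i^{\top} \beta_1))$.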
It is common to select a Cauchy distribution as prior for the regression parameters $B$. Then the log posterior kernel is $$ \log(\kappa(B\mid X, Y)) = \sum_{i=1}^n \sum_{k=1}^c y_{ik} \log(\psi(X B)_{ik}) + \sum_{i=1}^p \sum_{k=1}^c - \log (1 + \beta_{ik}^2). $$
This expression can be written entirely in terms of sum reductions, elementwise operations and matrix multiplication. Let $\Gamma_{ik} = -\log(1 + \beta_{ik}^2)$. Then $$ \log(\kappa(B\mid X, Y)) = \mathrm{sum}(Y \odot \log \psi(X B)) + \mathrm{sum}(\Gamma), $$ with $\odot$ denoting elementwise multiplication. The same holds for the gradient with respect to the parameters $B$, whose prior term is the elementwise derivative $\partial \Gamma_{ik} / \partial \beta_{ik} = -2\beta_{ik} / (1 + \beta_{ik}^2)$: $$ \frac{\partial \log(\kappa(B\mid X, Y))}{\partial B} = X^{\top} (Y - \psi(XB)) + \frac{\partial \Gamma}{\partial B}. $$ Both expressions can be implemented conveniently on the GPU so that all values remain on the GPU the whole time.
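As a plausibility check, the log posterior kernel can also be evaluated on the CPU with the host helpers defined above; the following sketch (logPosteriorCpu is an illustrative name) reuses multiplyCpu and applies the same max-subtraction trick as the softmax kernels defined below:

// CPU reference for the log posterior kernel: X (n x p), Y (n x c), B (p x c),
// all flattened in column major order as used throughout this notebook
let logPosteriorCpu n p c (x:float[]) (y:float[]) (b:float[]) =
    let xb = multiplyCpu n c p x b  // X * B, an n x c matrix
    // row-wise softmax with max subtraction for numerical stability
    let psi = Array.zeroCreate (n*c)
    for row = 0 to n - 1 do
        let mutable maxV = xb.[row]
        for col = 1 to c - 1 do maxV <- max maxV xb.[col*n + row]
        let mutable s = 0.0
        for col = 0 to c - 1 do s <- s + exp(xb.[col*n + row] - maxV)
        for col = 0 to c - 1 do psi.[col*n + row] <- exp(xb.[col*n + row] - maxV) / s
    // sum(Y .* log psi) + sum(Gamma)
    let logLik = Array.fold2 (fun acc yik pik -> acc + yik * log (pik + 1e-12)) 0.0 y psi
    let prior = b |> Array.sumBy (fun bik -> -(log (1.0 + bik*bik)))
    logLik + prior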
In [14]:
    
let softmaxCpu numRows numCols (values:float32[]) =
    let output = Array.zeroCreate (numRows*numCols)
    for row = 0 to numRows - 1 do
        let mutable sum = 0.0f
        // subtract the row maximum for numerical stability
        let mutable maxVal = values.[row]
        for col = 0 to numCols - 1 do 
            let v = values.[col*numRows + row]
            if v > maxVal then maxVal <- v
        for col = 0 to numCols - 1 do
            sum <- sum + exp(values.[col*numRows + row] - maxVal)
        for col = 0 to numCols - 1 do 
            if sum = 0.0f then
                printfn "division by zero"
            output.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum 
    output
[<ReflectedDefinition>]
let softmaxKernel32 numRows numCols (values:deviceptr<float32>) (results:deviceptr<float32>) = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    let mutable sum = 0.0f    
    if row < numRows then 
        let mutable maxVal = values.[row]
        for col = 0 to numCols - 1 do 
            let v = values.[col*numRows + row]
            if v > maxVal then maxVal <- v
        for col = 0 to numCols - 1 do
            sum <- sum + exp(values.[col*numRows + row] - maxVal)
        for col = 0 to numCols - 1 do 
            results.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum 
            
[<ReflectedDefinition>]
let softmaxKernel numRows numCols (values:deviceptr<float>) (results:deviceptr<float>) = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    let mutable sum = 0.0
    if row < numRows then 
        let mutable maxVal = values.[row]
        for col = 0 to numCols - 1 do 
            let v = values.[col*numRows + row]
            if v > maxVal then maxVal <- v
        for col = 0 to numCols - 1 do
            sum <- sum + exp(values.[col*numRows + row] - maxVal)
        for col = 0 to numCols - 1 do 
            results.[col*numRows + row] <- exp(values.[col*numRows + row] - maxVal) / sum 
[<ReflectedDefinition>]
let argMaxRowKernel numRows numCols (values:deviceptr<float>) (results:deviceptr<float>) = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    if row < numRows then 
        let mutable maxVal = values.[row]
        let mutable argMax = 0
        for col = 0 to numCols - 1 do 
            let v = values.[col*numRows + row]
            if v > maxVal then 
                maxVal <- v
                argMax <- col
        results.[row] <- float argMax
[<ReflectedDefinition>]
let predictionErrorKernel numRows numCols (y:deviceptr<float>) (prediction:deviceptr<float>) (results:deviceptr<float>) = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    if row < numRows then 
        let col = prediction.[row] |> int
        results.[row] <- 1.0 - y.[col*numRows + row]
                     
[<ReflectedDefinition>]
// note: the kernel writes only the given column of results; in this notebook it
// is used as "m .<- setColumn m col v", where results aliases m's own buffer so
// the remaining entries stay intact
let setColumnKernel numRows numCols (values:deviceptr<'T>) (results:deviceptr<'T>) col v = 
    let row = blockIdx.y*blockDim.y + threadIdx.y
    let mutable results = results
    if row < numRows then
         results.[col*numRows + row] <- v
    
In [15]:
    
let addIntercept (a:float32[,]) =
    let aintercept = Array2D.zeroCreate (a.GetLength 0) (1 + a.GetLength 1)
    for i = 0 to a.GetLength(0) - 1 do
        for j = 0 to a.GetLength(1) do
            aintercept.[i, j] <- if j = 0 then 1.0f else a.[i, j-1]
    aintercept
let softmaxf (a:GpuMat<float32>) = 
    if UseFortranOrder then
        let fill (ptr:deviceptr<float32>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp a.Rows blockSize
            let grid = new dim3(1, gridSize) // rows are indexed along the y dimension only
            let lp = LaunchParam(grid, block)   
            a.Gpu.Launch(Action<_,_,_,_>softmaxKernel32, lp, a.Rows, a.Cols, a.Ptr, ptr)
        let fill = FillFunc fill
        new GpuMat<float32>(a.Session, a.Rows, a.Cols, fill)
    else
        failwith "softmax kernel requires Fortran storage order"
let softmax (a:GpuMat<float>) =
    if UseFortranOrder then
        let fill (ptr:deviceptr<float>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp a.Rows blockSize
            let grid = new dim3(1, gridSize) // rows are indexed along the y dimension only
            let lp = LaunchParam(grid, block)   
            a.Gpu.Launch(Action<_,_,_,_> softmaxKernel, lp, a.Rows, a.Cols, a.Ptr, ptr)
        let fill = FillFunc fill
        new GpuMat<float>(a.Session, a.Rows, a.Cols, fill)
    else
        failwith "softmax kernel requires Fortran storage order"
let argMaxRows (a:GpuMat<float>) =
    if UseFortranOrder then
        let fill (ptr:deviceptr<float>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp a.Rows blockSize
            let grid = new dim3(1, gridSize) // rows are indexed along the y dimension only
            let lp = LaunchParam(grid, block)   
            a.Gpu.Launch(Action<_,_,_,_> argMaxRowKernel, lp, a.Rows, a.Cols, a.Ptr, ptr)
        let fill = FillFunc fill
        new GpuMat<float>(a.Session, a.Rows, a.Cols, fill)
    else
        failwith "softmax kernel requires Fortran storage order"
let predictionError (y:GpuMat<float>) (prediction:GpuMat<float>) =
    if UseFortranOrder then
        let fill (ptr:deviceptr<float>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp y.Rows blockSize
            let grid = new dim3(1, gridSize) // rows are indexed along the y dimension only
            let lp = LaunchParam(grid, block)   
            y.Gpu.Launch(Action<_,_,_,_,_> predictionErrorKernel, lp, y.Rows, y.Cols, y.Ptr, prediction.Ptr, ptr)
        let fill = FillFunc fill
        new GpuMat<float>(y.Session, y.Rows, 1, fill)
    else
        failwith "softmax kernel requires Fortran storage order"
let setColumn (a:GpuMat<'T>) (col:int) v = 
    if UseFortranOrder then
        let fill (ptr:deviceptr<'T>) =
            let blockSize = 64
            let block = new dim3(1, blockSize)
            let gridSize = divUp a.Rows blockSize
            let grid = new dim3(1, gridSize) // rows are indexed along the y dimension only
            let lp = LaunchParam(grid, block)   
            a.Gpu.Launch(Action<_,_,_,_,_,_> setColumnKernel, lp, a.Rows, a.Cols, a.Ptr, ptr, col, v)   
        let fill = FillFunc fill
        new GpuMat<'T>(a.Session, a.Rows, a.Cols, fill)
    else
        failwith "softmax kernel requires Fortran storage order"
let maskf (a:GpuMat<float32>) = setColumn a (a.Cols-1) 0.0f
let mask (a:GpuMat<float>) = setColumn a (a.Cols-1) 0.0
    
In [16]:
    
// beta (p, c)
let cauchyPriorLogDensity (beta:GpuMat<float>) =
    sum (-gpuLog (1.0 .+ beta .* beta))
let multinomialLogLik (y:GpuMat<float>) (softmaxVal:GpuMat<float>) =
    sum (y .* gpuLog (softmaxVal .+ 1e-12))      
let priorGradBeta (beta:GpuMat<float>) = 
    -2.0 .* beta ./ (1.0 .+ beta .* beta)
let logLikGradBeta (x:GpuMat<float>) (y:GpuMat<float>) (softmaxVal:GpuMat<float>) =
    x.T * (y .- softmaxVal)  
// x (n, p), y (n, c)
let calc (random:Random) (hX:float[,]) (hY:float[,]) L nBurnIn nSamples epsBurnIn epsSim T annealRate verbose = gpucalc {
    let n = hX.GetLength 0
    let p = hX.GetLength 1
    let c = hY.GetLength 1
    let! x = toDevice hX 
    let! y = toDevice hY
    let! beta = randn p c 0.0 0.01
    let! momentum = undef p c
    let! softmaxVal = undef n c
    let! betaAv = zeros p c
    let! betaGrad = undef p c
    let! logLik = undef 1 1
    let! priorLogLik = undef 1 1
    let! momSquareSum = undef 1 1
    let! betaOld = undef p c
    let! softmaxAv = zeros n c
    let! predictions = undef n 1
    let! error = undef 1 1
    let hamiltonianMonteCarloSample eps =
        let t = System.Diagnostics.Stopwatch.StartNew()
        randnInplace momentum 0.0 1.0
        momentum .<- mask momentum
        softmaxVal .<- softmax (x * beta)
        logLik .<- multinomialLogLik y softmaxVal
        priorLogLik .<- cauchyPriorLogDensity beta
        momSquareSum .<- sum (momentum .* momentum)
        let initLogLik = head logLik 
        let currentK = 0.5 * head momSquareSum 
        let currentU = initLogLik + head priorLogLik 
        betaOld .<- beta
        betaGrad .<- (logLikGradBeta x y softmaxVal) .+ (priorGradBeta beta)
        betaGrad .<- mask betaGrad 
        momentum .<- momentum .+ (eps / 2.0) .* betaGrad
        for i = 0 to L-1 do
            beta .<- beta .+ eps .* momentum
            // update gradient
            betaGrad .<- (logLikGradBeta x y (softmax (x * beta))) .+ (priorGradBeta beta)
            betaGrad .<- mask betaGrad
            if i <> L - 1 then
                momentum .<- momentum .+ eps .* betaGrad
        momentum .<- momentum .+ (eps / 2.0) .* betaGrad
        softmaxVal .<- softmax (x * beta)
        logLik .<- multinomialLogLik y softmaxVal
        priorLogLik .<- cauchyPriorLogDensity beta
        let finalLogLik = head logLik 
        let proposedU = finalLogLik + head priorLogLik 
        momSquareSum .<- sum (momentum .* momentum)
        let proposedK = 0.5 * head momSquareSum 
        let diff = ((proposedU - proposedK) - (currentU - currentK)) / T
        let alpha = min diff 0.0 // log of the Metropolis acceptance probability min(1, exp(diff))
        let u = log(random.NextDouble()) 
        let result, accept = if u < alpha then beta, 1 else betaOld, 0
        t.Stop()
        if verbose then
            printfn "Current value of log-kernel %f" currentU
            printfn "Proposed value of log-kernel %f" proposedU     
            printfn "Current momentum %f" currentK
            printfn "Proposed momentum %f " proposedK      
            printfn "Total diff %A" diff
            printfn "Current log likelihood %f" initLogLik
            printfn "Proposed log-like %f" finalLogLik
            printfn "Alpha  %f, uniform random number %f" alpha u    
            printfn "----- %A -----" t.Elapsed
        result, accept  
    let hamiltonianMonteCarloSimulation () = 
        let rec burnIn n eps t accepted i =
            if i < n then 
                let beta, accept = hamiltonianMonteCarloSample eps
                burnIn n eps (1.0 + t*annealRate) (accepted + accept) (i+1) 
            else
                accepted
        let acceptedBurnIn = burnIn nBurnIn epsBurnIn T 0 0
        let rec sample n eps t accepted (betas:float[,] list) i =
            if i < n then 
                let beta, accept = hamiltonianMonteCarloSample eps
                betaAv .<- betaAv .+ beta
                softmaxVal .<- softmax (x * beta)
                softmaxAv .<- softmaxAv .+ softmaxVal
                sample n eps t (accepted + accept) ((toHost beta) :: betas) (i+1)
            else
                List.rev betas, accepted
        let betas, accepted = sample nSamples epsSim 1.0 0 [] 0  
        
        betaAv .<- betaAv ./ (float nSamples)
        softmaxAv .<- softmaxAv ./ (float nSamples)
        predictions .<- argMaxRows softmaxAv
        error .<- sum (predictionError y predictions)
        let error = (head error) / (float y.Rows)
        let accuracy = 1.0 - error
        (toHost betaAv), betas, (toHost softmaxAv), accepted, error, accuracy
    return hamiltonianMonteCarloSimulation()
}
    
In [33]:
    
let run () =
    let random = Random 42
    let verbose = true
    let nBurnIn = 100     // burn-in iterations before sampling starts
    let nSamples = 100    // number of posterior samples
    let L = 100           // leapfrog steps per proposal
    let epsBurnIn = 1e-4  // step size during burn-in, typically larger than during sampling
    let epsSim = 1e-5     // step size during sampling
    let T = 1000.0        // temperature in the acceptance test
    let annealRate = 0.9  // annealing rate passed to the burn-in recursion
    let hX = addIntercept testImages 
    let hY = testLabels
    let calc = calc random (hX |> Array2D.map float) (hY |> Array2D.map float) L nBurnIn nSamples epsBurnIn epsSim T annealRate verbose
    use session = new Session(Gpu.Default)
    let betaAv, betas, softmaxAv, accepted, error, accuracy = gpuRun session calc
    betaAv, betas, softmaxAv, accepted, error, accuracy
    
In [34]:
    
let betaAv, betas, softmaxAv, accepted, error, accuracy = run()
    
In [39]:
    
accepted, accuracy
    
    Out[39]:
In [19]:
    
let betaForClass (betaAv:float[,]) cls =
    let beta = columnVec betaAv cls |> Array.skip 1 // drop the intercept coefficient in the first row
    array2DColMajor 28 28 beta
    
In [37]:
    
let cls = 1
heatmap (betaForClass betaAv cls) (sprintf "Average Betas %d" cls) 500 500
    
    Out[37]:
In [31]:
    
let images = List.init (betaAv.GetLength 1) (fun cls -> betaForClass betaAv cls)
let stacked = stack 28 images 5 2 |> array2DAsLists 
heatmap stacked "Average Betas" 500 1000
    
    Out[31]:
In [ ]: