In [ ]:
%install-location $cwd/swift-install
%install '.package(path: "$cwd/FastaiNotebook_06_cuda")' FastaiNotebook_06_cuda


Installing packages:
	.package(path: "/home/sgugger/git/course-v3/nbs/swift/FastaiNotebook_06_cuda")
		FastaiNotebook_06_cuda
With SwiftPM flags: []
Working in: /tmp/tmp6kdf0qjm/swift-install
Updating https://github.com/mxcl/Path.swift
Updating https://github.com/saeta/Just
Updating https://github.com/latenitesoft/NotebookExport
Completed resolution in 2.03s
[1/11] Compiling FastaiNotebook_06_cuda 01_matmul.swift
[2/11] Compiling FastaiNotebook_06_cuda 03_minibatch_training.swift
[3/11] Compiling FastaiNotebook_06_cuda 02_fully_connected.swift
[4/11] Compiling FastaiNotebook_06_cuda 05b_early_stopping.swift
[5/11] Compiling FastaiNotebook_06_cuda 06_cuda.swift
[6/11] Compiling FastaiNotebook_06_cuda 05_anneal.swift
[7/11] Compiling FastaiNotebook_06_cuda 02a_why_sqrt5.swift
[8/11] Compiling FastaiNotebook_06_cuda 00_load_data.swift
[9/11] Compiling FastaiNotebook_06_cuda 04_callbacks.swift
[10/11] Compiling FastaiNotebook_06_cuda 01a_fastai_layers.swift
[11/12] Merging module FastaiNotebook_06_cuda
[12/13] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[13/14] Merging module jupyterInstalledPackages
[14/14] Linking libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!

In [ ]:
//export
import Path
import TensorFlow
import Python

In [ ]:
import FastaiNotebook_06_cuda

In [ ]:
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")


Out[ ]:
('inline', 'module://ipykernel.pylab.backend_inline')

Load data


In [ ]:
let data = mnistDataBunch(flat: false, bs: 512)

In [ ]:
func optFunc(_ model: CnnModel) -> SGD<CnnModel> { return SGD(for: model, learningRate: 0.4) }
func modelInit() -> CnnModel { return CnnModel(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32]) }
let learner = Learner(data: data, lossFunc: softmaxCrossEntropy, optFunc: optFunc, modelInit: modelInit)
let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
learner.addDelegates([learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
                      learner.makeAddChannel()])

In [ ]:
time { try! learner.fit(1) }


Epoch 0: [0.48433208, 0.8463]                                                  
average: 4766.350142 ms,   min: 4766.350142 ms,   max: 4766.350142 ms      

Batchnorm

Custom

Let's start by building our own BatchNorm layer from scratch. Eventually, we'd like code like this to just work:


In [ ]:
struct AlmostBatchNorm<Scalar: TensorFlowFloatingPoint>: Differentiable {
    // Configuration hyperparameters
    @noDerivative let momentum, epsilon: Scalar
    // Running statistics
    @noDerivative var runningMean, runningVariance: Tensor<Scalar>
    // Trainable parameters
    var scale, offset: Tensor<Scalar>
    
    init(featureCount: Int, momentum: Scalar = 0.9, epsilon: Scalar = 1e-5) {
        self.momentum = momentum
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
        self.runningMean = Tensor(0)
        self.runningVariance = Tensor(1)
    }

    mutating func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let mean: Tensor<Scalar>
        let variance: Tensor<Scalar>
        switch Context.local.learningPhase {
        case .training:
            mean = input.mean(alongAxes: [0, 1, 2])
            variance = input.variance(alongAxes: [0, 1, 2])
            runningMean += (mean - runningMean) * (1 - momentum)
            runningVariance += (variance - runningVariance) * (1 - momentum)
        case .inference:
            mean = runningMean
            variance = runningVariance
        }
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
}

But there are automatic differentiation limitations (no control flow support yet) and Layer protocol constraints (the call method can't be mutating) that make this impossible for now (note the lack of @differentiable or of a Layer conformance), so we'll need a few workarounds. A Reference class will let us update the running statistics without having to mark the call method as mutating:


In [ ]:
//export
public class Reference<T> {
    public var value: T
    public init(_ value: T) { self.value = value }
}
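
For instance, here is a minimal sketch (not part of the exported module) of why this helps: because Reference is a class, a struct can update the value it wraps from a non-mutating method, which is exactly what our norm layers need for their running statistics.


In [ ]:
struct Counter {
    // The struct itself is never mutated; only the referenced object is.
    let count = Reference(0)
    func increment() { count.value += 1 }
}
let counter = Counter()
counter.increment()
print(counter.count.value) // 1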

The following snippet will let us differentiate a layer's call method if it's composed of training and inference implementations that are each differentiable:


In [ ]:
//export
public protocol LearningPhaseDependent: FALayer {
    associatedtype Input
    associatedtype Output
    
    @differentiable func forwardTraining(_ input: Input) -> Output
    @differentiable func forwardInference(_ input: Input) -> Output
}

extension LearningPhaseDependent {
    // This `@differentiable` attribute is necessary to tell the compiler that this satisfies the FALayer
    // protocol requirement, even though there is a `@differentiating(forward)` method below.
    // TODO: It seems nondeterministically necessary. Some subsequent notebooks import this successfully without it,
    // some require it. Investigate.
    @differentiable
    public func forward(_ input: Input) -> Output {
        switch Context.local.learningPhase {
        case .training:  return forwardTraining(input)
        case .inference: return forwardInference(input)
        }
    }

    @differentiating(forward)
    func gradForward(_ input: Input) ->
        (value: Output, pullback: (Self.Output.TangentVector) ->
            (Self.TangentVector, Self.Input.TangentVector)) {
        switch Context.local.learningPhase {
        case .training:
            return valueWithPullback(at: input) { $0.forwardTraining ($1) }
        case .inference:
            return valueWithPullback(at: input) { $0.forwardInference($1) }
        }
    }
}
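
As a quick sketch of how this dispatch is driven: the learning phase lives in Context.local and is normally set for us by the Learner's delegates during fit, but it can also be toggled by hand.


In [ ]:
// Sketch: whichever phase is set here decides which branch forward(_:) takes.
Context.local.learningPhase = .training   // forward(_:) calls forwardTraining(_:)
// ... run a training step ...
Context.local.learningPhase = .inference  // forward(_:) calls forwardInference(_:)
// ... run validation ...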

Now we can implement a BatchNorm that we can use in our models:


In [ ]:
//export
public protocol Norm: Layer where Input == Tensor<Scalar>, Output == Tensor<Scalar> {
    associatedtype Scalar
    init(featureCount: Int, epsilon: Scalar)
}

public struct FABatchNorm<Scalar: TensorFlowFloatingPoint>: LearningPhaseDependent, Norm {
    // TF-603 workaround.
    public typealias Input = Tensor<Scalar>
    public typealias Output = Tensor<Scalar>
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    
    // Configuration hyperparameters
    @noDerivative var momentum, epsilon: Scalar
    // Running statistics
    @noDerivative let runningMean, runningVariance: Reference<Tensor<Scalar>>
    // Trainable parameters
    public var scale, offset: Tensor<Scalar>
    
    public init(featureCount: Int, momentum: Scalar, epsilon: Scalar = 1e-5) {
        self.momentum = momentum
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
        self.runningMean = Reference(Tensor(0))
        self.runningVariance = Reference(Tensor(1))
    }
    
    public init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.init(featureCount: featureCount, momentum: 0.9, epsilon: epsilon)
    }

    @differentiable
    public func forwardTraining(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let mean = input.mean(alongAxes: [0, 1, 2])
        let variance = input.variance(alongAxes: [0, 1, 2])
        runningMean.value += (mean - runningMean.value) * (1 - momentum)
        runningVariance.value += (variance - runningVariance.value) * (1 - momentum)
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
    
    @differentiable
    public func forwardInference(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let mean = runningMean.value
        let variance = runningVariance.value
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
}
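
As a quick sanity check (a sketch with made-up shapes, not part of the export), we can push a random batch through the layer in training mode and watch the running statistics move away from their initial values:


In [ ]:
let bn = FABatchNorm<Float>(featureCount: 16)
let xb = Tensor<Float>(randomNormal: [64, 28, 28, 16])  // NHWC batch
let yb = bn.forwardTraining(xb)
print(yb.shape)              // [64, 28, 28, 16]
print(bn.runningMean.value)  // no longer exactly 0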

TensorFlow provides a highly optimized batch norm implementation, so let's redefine our batch norm to invoke it directly.


In [ ]:
//export
struct BatchNormResult<Scalar: TensorFlowFloatingPoint>: Differentiable {
    var y, batchMean, batchVariance, reserveSpace1, reserveSpace2: Tensor<Scalar>
}

public struct TFBatchNorm<Scalar: TensorFlowFloatingPoint>: LearningPhaseDependent, Norm {
    // Configuration hyperparameters
    @noDerivative var momentum, epsilon: Scalar
    // Running statistics
    @noDerivative let runningMean, runningVariance: Reference<Tensor<Scalar>>
    // Trainable parameters
    public var scale, offset: Tensor<Scalar>
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    
    public init(featureCount: Int, momentum: Scalar, epsilon: Scalar = 1e-5) {
        self.momentum = momentum
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
        self.runningMean = Reference(Tensor(0))
        self.runningVariance = Reference(Tensor(1))
    }
    
    public init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.init(featureCount: featureCount, momentum: 0.9, epsilon: epsilon)
    }

    @differentiable
    public func forwardTraining(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let res = TFBatchNorm<Scalar>.fusedBatchNorm(
            input, scale: scale, offset: offset, epsilon: epsilon)
        let (output, mean, variance) = (res.y, res.batchMean, res.batchVariance)
        runningMean.value += (mean - runningMean.value) * (1 - momentum)
        runningVariance.value += (variance - runningVariance.value) * (1 - momentum)
        return output
     }
    
    @differentiable
    public func forwardInference(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let mean = runningMean.value
        let variance = runningVariance.value
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
    
    @differentiable(wrt: (x, scale, offset), vjp: _vjpFusedBatchNorm)
    static func fusedBatchNorm(
        _ x : Tensor<Scalar>, scale: Tensor<Scalar>, offset: Tensor<Scalar>, epsilon: Scalar
    ) -> BatchNormResult<Scalar> {
        let ret = Raw.fusedBatchNormV2(
            x, scale: scale, offset: offset, 
            mean: Tensor<Scalar>([] as [Scalar]), variance: Tensor<Scalar>([] as [Scalar]), 
            epsilon: Double(epsilon))
        return BatchNormResult(
            y: ret.y, batchMean: ret.batchMean, batchVariance: ret.batchVariance,
            reserveSpace1: ret.reserveSpace1, reserveSpace2: ret.reserveSpace2
        )
    }

    static func _vjpFusedBatchNorm(
        _ x : Tensor<Scalar>, scale: Tensor<Scalar>, offset: Tensor<Scalar>, epsilon: Scalar
    ) -> (BatchNormResult<Scalar>, 
          (BatchNormResult<Scalar>.TangentVector) -> (Tensor<Scalar>.TangentVector, 
                                                        Tensor<Scalar>.TangentVector, 
                                                        Tensor<Scalar>.TangentVector)) {
      let bnresult = fusedBatchNorm(x, scale: scale, offset: offset, epsilon: epsilon)
  
        return (
            bnresult, 
            {v in 
                let res = Raw.fusedBatchNormGradV2(
                    yBackprop: v.y, x, scale: Tensor<Float>(scale), 
                    reserveSpace1: bnresult.reserveSpace1, 
                    reserveSpace2: bnresult.reserveSpace2, 
                    epsilon: Double(epsilon))
                return (res.xBackprop, res.scaleBackprop, res.offsetBackprop)
            })
    }
}
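
As a small sketch (shapes assumed, not part of the export), the fused op can also be called directly to see what it returns: the normalized output plus the per-channel batch statistics that forwardTraining folds into the running averages.


In [ ]:
let xf = Tensor<Float>(randomNormal: [32, 14, 14, 8])
let res = TFBatchNorm<Float>.fusedBatchNorm(
    xf, scale: Tensor(ones: [8]), offset: Tensor(zeros: [8]), epsilon: 1e-5)
print(res.y.shape)          // [32, 14, 14, 8]
print(res.batchMean.shape)  // [8]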

In [ ]:
//export
public struct ConvBN<Scalar: TensorFlowFloatingPoint>: FALayer {
    // TF-603 workaround.
    public typealias Input = Tensor<Scalar>
    public typealias Output = Tensor<Scalar>
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    public var conv: FANoBiasConv2D<Scalar>
    public var norm: FABatchNorm<Scalar>
    
    public init(_ cIn: Int, _ cOut: Int, ks: Int = 3, stride: Int = 1){
        // TODO (when control flow AD works): use Conv2D without bias
        self.conv = FANoBiasConv2D(cIn, cOut, ks: ks, stride: stride, activation: relu)
        self.norm = FABatchNorm(featureCount: cOut, epsilon: 1e-5)
    }

    @differentiable
    public func forward(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        return norm.forward(conv.forward(input))
    }
}

In [ ]:
// Would be great if this generic could work
// struct ConvNorm<NormType: Norm, Scalar: TensorFlowFloatingPoint>: Layer
//     where NormType.Scalar == Scalar {
//     var conv: Conv2D<Scalar>
//     var norm: NormType
//     init(
//         filterShape: (Int, Int, Int, Int),
//         strides: (Int, Int) = (1, 1),
//         padding: Padding = .valid,
//         activation: @escaping Conv2D<Scalar>.Activation = identity
//     ) {
//         // TODO (when control flow AD works): use Conv2D without bias
//         self.conv = Conv2D(
//             filterShape: filterShape,
//             strides: strides,
//             padding: padding,
//             activation: activation)
//         self.norm = NormType.init(featureCount: filterShape.3, epsilon: 1e-5)
//     }

//     @differentiable
//     func applied(to input: Tensor<Scalar>) -> Tensor<Scalar> {
//         return norm.applied(to: conv.applied(to: input))
//     }
// }
//typealias ConvBN = ConvNorm<BatchNorm<Float>, Float>

In [ ]:
//export
public struct CnnModelBN: Layer {
    public var convs: [ConvBN<Float>]
    public var pool = FAGlobalAvgPool2D<Float>()
    public var linear: FADense<Float>
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    
    public init(channelIn: Int, nOut: Int, filters: [Int]){
        let allFilters = [channelIn] + filters
        convs = Array(0..<filters.count).map { i in
            return ConvBN(allFilters[i], allFilters[i+1], ks: 3, stride: 2)
        }
        linear = FADense<Float>(filters.last!, nOut)
    }
    
    @differentiable
    public func callAsFunction(_ input: TF) -> TF {
        // TODO: Work around https://bugs.swift.org/browse/TF-606
        return linear.forward(pool.forward(convs(input)))
    }
}

In [ ]:
func optFunc(_ model: CnnModelBN) -> SGD<CnnModelBN> { return SGD(for: model, learningRate: 0.4) }
func modelInit() -> CnnModelBN { return CnnModelBN(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32]) }
let learner = Learner(data: data, lossFunc: softmaxCrossEntropy, optFunc: optFunc, modelInit: modelInit)
let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
learner.addDelegates([learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
                      learner.makeAddChannel()])

In [ ]:
time { try! learner.fit(1) }


Epoch 0: [nan(0x1fffff), 0.098]                                                
average: 2481.692572 ms,   min: 2481.692572 ms,   max: 2481.692572 ms      

More norms

Layer norm

From the paper: "batch normalization cannot be applied to online learning tasks or to extremely large distributed models where the minibatches have to be small".

General equation for a norm layer with learnable affine:

$$y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta$$

The differences with BatchNorm are:

  1. we don't keep a moving average
  2. we don't average over the batch dimension but over the hidden dimensions, so it's independent of the batch size (see the sketch below)
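
Here is a small sketch (with assumed NHWC shapes) of what that axis difference means in practice:


In [ ]:
let t = Tensor<Float>(randomNormal: [64, 28, 28, 16])  // [batch, H, W, channels]
let bnMean = t.mean(alongAxes: [0, 1, 2])              // one mean per channel: shape [1, 1, 1, 16]
let lnMean = t.mean(alongAxes: [1, 2, 3])              // one mean per sample:  shape [64, 1, 1, 1]
print(bnMean.shape, lnMean.shape)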

In [ ]:
struct LayerNorm2D<Scalar: TensorFlowFloatingPoint>: Norm {
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    // Configuration hyperparameters
    @noDerivative let epsilon: Scalar
    // Trainable parameters
    var scale: Tensor<Scalar>
    var offset: Tensor<Scalar>
    
    init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
    }
    
    @differentiable
    func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let mean = input.mean(alongAxes: [1, 2, 3])
        let variance = input.variance(alongAxes: [1, 2, 3])
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
}

struct ConvLN<Scalar: TensorFlowFloatingPoint>: FALayer {
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    var conv: FANoBiasConv2D<Scalar>
    var norm: LayerNorm2D<Scalar>
    
    init(_ cIn: Int, _ cOut: Int, ks: Int = 3, stride: Int = 2){
        // TODO (when control flow AD works): use Conv2D without bias
        self.conv = FANoBiasConv2D(cIn, cOut, ks: ks, stride: stride, activation: relu)
        self.norm = LayerNorm2D(featureCount: cOut, epsilon: 1e-5)
    }

    @differentiable
    func forward(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        return norm.callAsFunction(conv.forward(input))
    }
}

In [ ]:
public struct CnnModelLN: Layer {
    public var convs: [ConvLN<Float>]
    public var pool = FAGlobalAvgPool2D<Float>()
    public var linear: FADense<Float>
    
    public init(channelIn: Int, nOut: Int, filters: [Int]){
        let allFilters = [channelIn] + filters
        convs = Array(0..<filters.count).map { i in
            return ConvLN(allFilters[i], allFilters[i+1], ks: 3, stride: 2)
        }
        linear = FADense<Float>(filters.last!, nOut)
    }
    
    @differentiable
    public func callAsFunction(_ input: TF) -> TF {
        // TODO: Work around https://bugs.swift.org/browse/TF-606
        return linear.forward(pool.forward(convs(input)))
    }
}

In [ ]:
struct InstanceNorm<Scalar: TensorFlowFloatingPoint>: Norm {
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    // Configuration hyperparameters
    @noDerivative let epsilon: Scalar
    // Trainable parameters
    var scale: Tensor<Scalar>
    var offset: Tensor<Scalar>
    
    init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
    }
    
    @differentiable
    func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        // Per-sample, per-channel statistics: average over the spatial axes
        // (1 and 2 for NHWC input), unlike BatchNorm ([0, 1, 2]) and LayerNorm ([1, 2, 3]).
        let mean = input.mean(alongAxes: [1, 2])
        let variance = input.variance(alongAxes: [1, 2])
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
}

struct ConvIN<Scalar: TensorFlowFloatingPoint>: FALayer {
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    var conv: FANoBiasConv2D<Scalar>
    var norm: InstanceNorm<Scalar>
    
    init(_ cIn: Int, _ cOut: Int, ks: Int = 3, stride: Int = 2){
        // TODO (when control flow AD works): use Conv2D without bias
        self.conv = FANoBiasConv2D(cIn, cOut, ks: ks, stride: stride, activation: relu)
        self.norm = InstanceNorm(featureCount: cOut, epsilon: 1e-5)
    }

    @differentiable
    func forward(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        return norm.callAsFunction(conv.forward(input))
    }
}

Lost in all those norms? The authors of the group norm paper have you covered:

TODO/skipping GroupNorm

Running Batch Norm


In [ ]:
struct RunningBatchNorm<Scalar: TensorFlowFloatingPoint>: LearningPhaseDependent, Norm {
    @noDerivative public var delegates: [(Self.Output) -> ()] = []
    // Configuration hyperparameters
    @noDerivative let momentum: Scalar
    @noDerivative let epsilon: Scalar
    // Running statistics
    @noDerivative let runningSum: Reference<Tensor<Scalar>>
    @noDerivative let runningSumOfSquares: Reference<Tensor<Scalar>>
    @noDerivative let runningCount: Reference<Scalar>
    @noDerivative let samplesSeen: Reference<Int>
    // Trainable parameters
    var scale: Tensor<Scalar>
    var offset: Tensor<Scalar>
    
    init(featureCount: Int, momentum: Scalar, epsilon: Scalar = 1e-5) {
        self.momentum = momentum
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
        self.runningSum = Reference(Tensor(0))
        self.runningSumOfSquares = Reference(Tensor(0))
        self.runningCount = Reference(Scalar(0))
        self.samplesSeen = Reference(0)
    }
    
    init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.init(featureCount: featureCount, momentum: 0.9, epsilon: epsilon)
    }

    @differentiable
    func forwardTraining(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let (batch, channels) = (input.shape[0], Scalar(input.shape[3]))
        let sum = input.sum(alongAxes: [0, 1, 2])
        let sumOfSquares = (input * input).sum(alongAxes: [0, 1, 2])
        // TODO: Work around https://bugs.swift.org/browse/TF-607
        let count = withoutDerivative(at: Scalar(input.scalarCount)) { tmp in tmp } / channels
        let mom = momentum / sqrt(Scalar(batch) - 1)
        let runningSum = mom * self.runningSum.value + (1 - mom) * sum
        let runningSumOfSquares = mom * self.runningSumOfSquares.value + (
            1 - mom) * sumOfSquares
        let runningCount = mom * self.runningCount.value + (1 - mom) * count
        
        self.runningSum.value = runningSum
        self.runningSumOfSquares.value = runningSumOfSquares
        self.runningCount.value = runningCount
        self.samplesSeen.value += batch
        
        let mean = runningSum / runningCount
        let variance = runningSumOfSquares / runningCount - mean * mean
        
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
    
    @differentiable
    func forwardInference(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let mean = runningSum.value / runningCount.value
        let variance = runningSumOfSquares.value / runningCount.value - mean * mean
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
}

TODO: XLA compilation + test RBN

Export


In [ ]:
import NotebookExport
let exporter = NotebookExport(Path.cwd/"07_batchnorm.ipynb")
print(exporter.export(usingPrefix: "FastaiNotebook_"))


success
