DGEMM for KNL

Reproduction of: Lim, R., Lee, Y., Kim, R. et al. Cluster Comput (2018) 21: 1785. https://doi.org/10.1007/s10586-018-2810-y


In [ ]:
require 'BOAST'
include BOAST
set_lang(C)
set_array_start(0)
type_map = { 4 => NArray::SFLOAT, 8 => NArray::FLOAT}

Micro-kernel

Definition

The first step is implementing an efficient micro-kernel. The micro-kernel updates a block of $C$ of size $[mr, nr]$, noted $\widehat{C}$, using a block of $A$ of size $[mr, kb]$, noted $\widehat{A}$, and a block of $B$ of size $[kb, nr]$, noted $\widehat{B}$. $\widehat{A}$ is stored in column-major order while $\widehat{C}$ and $\widehat{B}$ are stored in row-major order.
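
In other words, for each element the micro-kernel performs

$$\widehat{C}[m, n] \mathrel{+}= \sum_{i=0}^{kb-1} \widehat{A}[m, i]\,\widehat{B}[i, n],$$

keeping all of $\widehat{C}$ in $mr \times nr/vector\_length$ vector registers, plus $nr/vector\_length$ registers for the current row of $\widehat{B}$; this is the register count reported by the generator below.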


In [ ]:
def micro_kernel(vector_length: 4, nr: nil, mr: nil, kb: nil)
  raise "nr must be a multiple of vector_length!" unless nr % vector_length == 0
  nvec = nr / vector_length
  register_number = mr * nvec + nvec # mr*nvec accumulators for C-hat + nvec registers for B-hat
  puts "Using #{register_number} registers..."
  
  ah = Real :ah, dim: [Dim(mr), Dim(kb)], dir: :in, restrict: true
  bh = Real :bh, vector_length: vector_length, dim: [Dim(nvec), Dim(kb)], dir: :in, restrict: true
  ch = Real :ch, vector_length: vector_length, dim: [Dim(nvec), Dim(mr)], dir: :inout, restrict: true
  inp = Procedure("inp_#{vector_length}_#{nr}_#{mr}_#{kb}", [ah, bh, ch]) {
    regs = (0...nvec).collect { |n|
      (0...mr).collect { |m|
        Real :"reg_#{n}_#{m}", vector_length: vector_length, register: true
      }
    }
    regs_b = (0...nvec).collect { |n|
      Real :"regs_b_#{n}", vector_length: vector_length, register: true
    }
    decl *regs.flatten
    decl *regs_b
    # Load the C-hat block into the accumulator registers.
    (0...mr).each { |m|
      (0...nvec).each { |n|
        pr regs[n][m] === ch[n, m]
      }
    }
    i = Int :i
    decl i
    # Loop over the shared dimension kb.
    pr For(i, 0, kb - 1) {
      # Load one vectorized row of B-hat.
      (0...nvec).each { |n|
        pr regs_b[n] === bh[n, i]
      }
      # Broadcast each scalar of A-hat and accumulate with fused multiply-adds.
      (0...mr).each { |m|
        (0...nvec).each { |n|
          pr regs[n][m] === FMA(Set(ah[m, i], regs_b[0]), regs_b[n], regs[n][m])
        }
      }
    }
    # Store the accumulators back to C-hat.
    (0...mr).each { |m|
      (0...nvec).each { |n|
        pr ch[n, m] === regs[n][m]
      }
    }
  }
  inp.ckernel(includes: "immintrin.h")
end
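
The choice of $mr$ and $nr$ is constrained by the size of the vector register file (16 registers for AVX/AVX2, 32 for AVX-512). As a quick sanity check, independent of BOAST (the helper name below is ours, not part of the paper), one can enumerate candidate pairs:


In [ ]:
# Hypothetical helper: does the micro-kernel working set fit in `budget`
# vector registers? It needs mr * nvec accumulators for C-hat plus nvec
# registers for the current row of B-hat.
def fits_registers?(mr:, nr:, vector_length:, budget:)
  nvec = nr / vector_length
  mr * nvec + nvec <= budget
end

[[4, 12], [8, 12], [4, 28]].each { |m, n|
  puts "mr=#{m}, nr=#{n}: #{fits_registers?(mr: m, nr: n, vector_length: 4, budget: 16)}"
}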

Test


In [ ]:
vector_length = 4
mr = 4
nr = 12
kb = 480
nvec = nr / vector_length
type = type_map[get_default_real_size]
alignment = get_default_real_size*vector_length
a = NMatrix::new(type, kb, mr).random!
b = NMatrix::new(type, nr, kb).random!
c = NMatrix::new(type, nr, mr).random!

ah = ANArray::new(type, alignment, mr, kb)
bh = ANArray::new(type, alignment, vector_length, nvec, kb)
ch = ANArray::new(type, alignment, vector_length, nvec, mr)
c_ref = ANArray::new(type, alignment, vector_length, nvec, mr)
ah[true, true] = a.transpose(1,0)[true, true]
bh[true, true, true] = b.reshape(vector_length, nvec, kb)[true, true, true]
ch[true, true, true] = c.reshape(vector_length, nvec, mr)[true, true, true]
c_ref[true, true, true] = (a*b + c).reshape(vector_length, nvec, mr)[true, true, true]

p = micro_kernel(vector_length: vector_length, mr: mr, nr: nr, kb: kb)
p.run(ah, bh, ch)
max_error = (ch - c_ref).abs.max
raise "Computation error!" if max_error > 1e-8
puts "Success!"
nil

In [ ]:
p.run(ah, bh, ch)
repeat_inner = 100
res = 1000.times.collect {
  p.run(ah, bh, ch, repeat: repeat_inner)
}
best = res.min_by { |r| r[:duration] }
perf = mr * nr * kb * 2 / (best[:duration] * 1e9 / repeat_inner )
puts "time: #{best[:duration] / repeat_inner} s, GFlops: #{perf}"

Medium Kernel

Definition

The medium kernel works on blocks of intermediate size. It updates a block of $C$ of size $[mb, n]$ noted $\widetilde{C}$ using a block of $A$ of size $[mb, kb]$ noted $\widetilde{A}$ and a block of $B$ of size $[kb, n]$ noted $\widetilde{B}$. $\widetilde{A}$ is stored as $mb/mr$ consecutive blocks of size $[mr, kb]$ ($\widehat{A}$) in column-major order, $\widetilde{C}$ is stored as $(mb/mr) \times (n/nr)$ consecutive blocks of size $[mr, nr]$ ($\widehat{C}$) in row-major order, and $\widetilde{B}$ is stored as $n/nr$ blocks of size $[kb, nr]$ ($\widehat{B}$) in row-major order.
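
With the test parameters used below ($mr = 4$, $mb/mr = 18$, hence $mb = 72$, and $kb = 480$) the packed $\widetilde{A}$ occupies $mb \times kb \times 8 = 276{,}480$ bytes, i.e. about 270 KiB: the blocking is sized so that $\widetilde{A}$ can stay cache resident while the kernel streams over the $n/nr$ panels of $\widetilde{B}$.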


In [ ]:
def medium_kernel(vector_length: 4, mb: nil, nr: nil, mr: nil, kb: nil)
  raise "nr must be a multiple of vector_length!" unless nr % vector_length == 0
  raise "mr must be a multiple of mb!" unless mb % mr == 0
  nvec = nr / vector_length
  nblocka = mb / mr

  inp = micro_kernel(vector_length: vector_length, nr: nr, mr: mr, kb: kb)
  
  #n = Int :n, dir: :in
  nblockn = Int :nblockn, dir: :in #n / nr
  at = Real :at, dim: [Dim(mr), Dim(kb), Dim(nblocka)], dir: :in, restrict: true
  bt = Real :bt, vector_length: vector_length, dim: [Dim(nvec), Dim(kb),  Dim(nblockn)], dir: :in, restrict: true
  ct = Real :ct, vector_length: vector_length, dim: [Dim(nvec), Dim(mr),  Dim(nblocka), Dim(nblockn)], dir: :inout, restrict: true
  medp = Procedure( "medp_#{vector_length}_#{mb}_#{nr}_#{mr}_#{kb}", [nblockn, at, bt, ct] ) {
    jr = Int :jr
    ir = Int :ir
    decl jr, ir
    # jr iterates over the nr-wide column panels of B-tilde and C-tilde,
    # ir over the mr-high row blocks of A-tilde.
    pr For(jr, 0, nblockn - 1) {
      pr For(ir, 0, nblocka - 1) {
        pr inp.procedure.call(at[0, 0, ir].address, bt[0, 0, jr].address, ct[0, 0, ir, jr].address)
      }
    }
  }
  k = CKernel::new(includes: "immintrin.h") {
    pr inp.procedure
    pr medp
  }
  k.procedure = medp
  k
end
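
One medium-kernel call issues $nblockn \times nblocka$ micro-kernel calls. The loop order, with the panels of $\widetilde{B}$ outermost, lets each $\widehat{B}$ panel be reused by all $mb/mr$ row blocks of $\widetilde{A}$; with the test parameters below this amounts to $1024 \times 18 = 18{,}432$ invocations per call.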

Test


In [ ]:
vector_length = 4
mr = 4
nr = 12
nblocka = 18
mb = mr * nblocka
kb = 480
nblockn = 1024
n  = nr * nblockn
nvec = nr / vector_length

type = type_map[get_default_real_size]
alignment = get_default_real_size*vector_length
a = NMatrix::new(type, kb, mb).random!
b = NMatrix::new(type, n, kb).random!
c = NMatrix::new(type, n, mb).random!

at = ANArray::new(type, alignment, mr, kb, nblocka)
bt = ANArray::new(type, alignment, vector_length, nvec, kb, nblockn)
ct = ANArray::new(type, alignment, vector_length, nvec, mr, nblocka, nblockn)
c_ref = ANArray::new(type, alignment, vector_length, nvec, mr, nblocka, nblockn)
at[true, true, true] = a.reshape(kb, mr, nblocka).transpose(1, 0, 2)[true, true, true]
bt[true, true, true, true] = b.reshape(vector_length, nvec, nblockn, kb)
                              .transpose(0, 1, 3, 2)[true, true, true, true]
ct[true, true, true, true, true] = c.reshape(vector_length, nvec, nblockn, mr, nblocka)
                              .transpose(0, 1, 3, 4, 2)[true, true, true, true, true]
c_ref[true, true, true, true, true] = (a*b + c).reshape(vector_length, nvec, nblockn, mr, nblocka)
                                         .transpose(0, 1, 3, 4, 2)[true, true, true, true, true]

p = medium_kernel(vector_length: vector_length, mb: mb, mr: mr, nr: nr, kb: kb)
p.run(nblockn, at, bt, ct)
max_error = (ct - c_ref).abs.max
raise "Computation error!" if max_error > 1e-8
puts "Success!"
nil

In [ ]:
p.run(nblockn, at, bt, ct)
repeat_inner = 10
res = 10.times.collect {
  p.run(nblockn, at, bt, ct, repeat: repeat_inner)
}
best = res.min_by { |r| r[:duration] }
perf = mb * n * kb * 2 / (best[:duration] * 1e9 / repeat_inner )
puts "time: #{best[:duration] / repeat_inner} s, GFlops: #{perf}"

Large Kernel

Definition

The large kernel handles matrices of any size. In our implementation we consider the matrices to be already packed into the blocked layouts. The original matrices are $C$ of size $[m,n]$, $A$ of size $[m,k]$ and $B$ of size $[k,n]$. The layouts used here are: $C$ is stored as $m/mb$ blocks of $\widetilde{C}$, $A$ is stored as $(k/kb) \times (m/mb)$ blocks of $\widetilde{A}$ and $B$ is stored as $k/kb$ blocks of $\widetilde{B}$.
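
Writing $\widetilde{C}_i$ for the $i$-th block row of $C$, the large kernel computes

$$\widetilde{C}_i \mathrel{+}= \sum_{p=0}^{k/kb-1} \widetilde{A}_{i,p}\,\widetilde{B}_p, \qquad 0 \le i < m/mb,$$

which is exactly the doubly nested loop below: one OpenMP task per $(p, i)$ pair, each delegating to the medium kernel.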


In [ ]:
def large_kernel(vector_length: 4, mb: nil, nr: nil, mr: nil, kb: nil)
  raise "nr must be a multiple of vector_length!" unless nr % vector_length == 0
  raise "mr must be a multiple of mb!" unless mb % mr == 0
  
  inp = micro_kernel(vector_length: vector_length, nr: nr, mr: mr, kb: kb)
  medp = medium_kernel(vector_length: vector_length, mb: mb, nr: nr, mr: mr, kb: kb)
  
  #m = Int :m, dir: :in
  #n = Int :n, dir: :in
  #k = Int :k, dir: :in
  nvec = nr / vector_length
  nblockm = Int :nblockm, dir: :in #m / mb
  nblocka = mb / mr
  nblockn = Int :nblockn, dir: :in #n / nr
  nblockk = Int :nblockk, dir: :in #k / kb
  a = Real :a, dim: [Dim(mr), Dim(kb), Dim(nblocka), Dim(nblockm), Dim(nblockk)], dir: :in, restrict: true
  b = Real :b, vector_length: vector_length, dim: [Dim(nvec), Dim(kb), Dim(nblockn), Dim(nblockk)], dir: :in, restrict: true
  c = Real :c, vector_length: vector_length, dim: [Dim(nvec), Dim(mr), Dim(nblocka), Dim(nblockn), Dim(nblockm)], dir: :inout, restrict: true
  larp = Procedure( "larp_#{vector_length}_#{mb}_#{nr}_#{mr}_#{kb}", [nblockm, nblockn, nblockk, a, b, c] ) {
    p = Int :p
    i = Int :i
    decl p, i
    # One task per (p, i) pair; the inout dependence on the C block serializes
    # tasks that share i, while tasks with distinct i can run in parallel.
    pr OpenMP::Parallel(default: :shared) {
    pr OpenMP::Single() {
    pr For(p, 0, nblockk - 1) {
      pr For(i, 0, nblockm - 1) {
        pr OpenMP::Task(depend: {in: [a[:all, :all, :all, i, p],
                                      b[:all, :all, :all, p]],
                                 inout: [c[:all, :all, :all, :all, i]] } ) {
          pr medp.procedure.call(nblockn, a[0, 0, 0, i, p].address, b[0, 0, 0, p].address, c[0, 0, 0, 0, i].address)
        }
      }
    }
    }
    }
  }
  
  k = CKernel::new(includes: "immintrin.h") {
    pr inp.procedure
    pr medp.procedure
    pr larp
  }
  k.procedure = larp
  k
end
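
With the test parameters below, the task loop creates $nblockk \times nblockm = 3 \times 21 = 63$ tasks. Tasks sharing the same $i$ carry an inout dependence on the same $\widetilde{C}_i$ and therefore execute in order over $p$, while tasks with distinct $i$ are independent, so at most $nblockm = 21$ tasks can run concurrently.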

Test


In [ ]:
vector_length = 4
mr = 4
nr = 12
nblocka = 18
mb = mr * nblocka
kb = 480
nblockn = 128
n  = nr * nblockn
nvec = nr / vector_length
nblockm = 21
m = mb * nblockm
nblockk = 3
k = kb * nblockk

type = type_map[get_default_real_size]
alignment = get_default_real_size*vector_length
a = NMatrix::new(type, k, m).random!
b = NMatrix::new(type, n, k).random!
c = NMatrix::new(type, n, m).random!

puts "a: #{m}x#{k} (#{m*k*get_default_real_size/(1e9)} GiB)"
puts "b: #{k}x#{n} (#{k*n*get_default_real_size/(1e9)} GiB)"
puts "c: #{m}x#{n} (#{m*n*get_default_real_size/(1e9)} GiB)"

ap = ANArray::new(type, alignment, mr, kb, nblocka, nblockm, nblockk)
bp = ANArray::new(type, alignment, vector_length, nvec, kb, nblockn, nblockk)
cp = ANArray::new(type, alignment, vector_length, nvec, mr, nblocka, nblockn, nblockm)
c_ref = ANArray::new(type, alignment, vector_length, nvec, mr, nblocka, nblockn, nblockm)
ap[true, true, true, true, true] = a.reshape(kb, nblockk, mr, nblocka, nblockm)
                                    .transpose(2, 0, 3, 4, 1)[true, true, true, true, true]
bp[true, true, true, true, true] = b.reshape(vector_length, nvec, nblockn, kb, nblockk)
                                    .transpose(0, 1, 3, 2, 4)[true, true, true, true, true]
cp[true, true, true, true, true, true] = c.reshape(vector_length, nvec, nblockn, mr, nblocka, nblockm)
                                          .transpose(0, 1, 3, 4, 2, 5)[true, true, true, true, true, true]
c_ref[true, true, true, true, true, true] = (a*b + c).reshape(vector_length, nvec, nblockn, mr, nblocka, nblockm)
                                                     .transpose(0, 1, 3, 4, 2, 5)[true, true, true, true, true, true]
push_env(use_vla: true) {
  p = large_kernel(vector_length: vector_length, mb: mb, mr: mr, nr: nr, kb: kb)
  p.build(openmp: true)
}
p.run(nblockm, nblockn, nblockk, ap, bp, cp)
  
max_error = (cp - c_ref).abs.max
raise "Computation error!" if max_error > 1e-8
puts "Success!"
nil

In [ ]:
p.run(nblockm, nblockn, nblockk, ap, bp, cp)
res = 100.times.collect {
  p.run(nblockm, nblockn, nblockk, ap, bp, cp)
}
best = res.min_by { |r| r[:duration] }
perf = m * n * k * 2 / (best[:duration] * 1e9)
puts "time: #{best[:duration]} s, GFlops: #{perf}"
