BOAST Interactive Advanced Tutorial

Symmetric Result DGEMM inner kernel

Based on the work of Eric Bainville

Initialization

Load BOAST and bring the BOAST namespace into the global namespace:


In [ ]:
require 'BOAST'
include BOAST

We will first import the reference implementation of the kernel directly from the BigDFT sources (slightly modified to remove the accumulation in the result).


In [ ]:
set_lang(C)
set_array_start(0)

In [ ]:
type_map = { 4 => NArray::SFLOAT, 8 => NArray::FLOAT}

This reference kernel computes a 2x4 block (transposed) of a matrix multiply operation C = A * B. As accesses can be amortized for big matrices ($n^2$ accesses for $n^3$ computations), the lines of A and columns of B are first interleaved. This kernel expects 2 interleaved lines of A and 4 interleaved columns of B (the sketch after the list reproduces the resulting offset sequence):

  • packed lines: $A[0]$, $A[1]$, $A[LDA]$, $A[LDA+1]$, $A[2]$, $A[3]$, $A[LDA+2]$, $A[LDA+3]$
  • packed columns: $B[0]$, $B[1]$, $B[LDB]$, $B[LDB+1]$, $B[2*LDB]$, $B[2*LDB+1]$, $B[3*LDB]$, $B[3*LDB+1]$, $B[2]$, $B[3]$, $B[LDB+2]$, $B[LDB+3]$, etc...
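
To make the packed-line ordering concrete, here is a minimal sketch (plain Ruby, with a hypothetical LDA of 8 and only the first two vector groups) that reproduces the offset sequence of A listed above:


In [ ]:
# Reproduce the packed-A offset ordering, assuming vl = 2 vector lanes,
# tsy = 2 interleaved lines, and a hypothetical LDA of 8.
lda = 8
vl  = 2
tsy = 2
order = []
2.times { |i|                  # first two vector groups only
  tsy.times { |y|
    vl.times { |v| order << i * vl + v + y * lda }
  }
}
p order
# => [0, 1, 8, 9, 2, 3, 10, 11]
#    i.e. A[0], A[1], A[LDA], A[LDA+1], A[2], A[3], A[LDA+2], A[LDA+3]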

The kernel is unrolled by a factor of 4: the do-while loop runs n>>2 times, so the number of vector groups must be a multiple of 4 (the packing helpers below round it up accordingly). Maintaining and upgrading such a kernel is costly, and a lot of hand tuning took place to obtain this version.


In [ ]:
def reference
  tsy = 2
  tsx = 4
  vl = 2
  alignment = vl*get_default_real_size
  # We describe the parameters of the procedure so that BOAST can interface with it.
  nvec = Int :nvec, dir: :in
  a = Real :a, dim: [Dim(vl), Dim(tsy), Dim(nvec)], dir: :in
  b = Real :b, dim: [Dim(vl), Dim(tsx), Dim(nvec)], dir: :in
  c = Real :c, dim: [Dim(tsy), Dim(tsx)], dir: :inout
  ldc = Int :ldc, dir: :in
  p = Procedure( :gemm_block_2x4_c, [a, b, nvec, c, ldc])
  # The source of the kernel is just a string containing the original procedure.
  # The kernel is in C and needs a header in order to compile.
  k = CKernel::new(lang: C, includes: "immintrin.h") {
  get_output.puts <<EOF
void gemm_block_2x4_c(const double * a,const double * b,long long int n,double * y,long long int ldy)
{
  __m128d A0,A1,B0,B1,B2,B3;
  __m128d S00,S01,S02,S03,S10,S11,S12,S13;
  S00 = _mm_setzero_pd();
  S01 = _mm_setzero_pd();
  S02 = _mm_setzero_pd();
  S03 = _mm_setzero_pd();
  S10 = _mm_setzero_pd();
  S11 = _mm_setzero_pd();
  S12 = _mm_setzero_pd();
  S13 = _mm_setzero_pd();
  unsigned long long int k=n>>2;

  do {
    A0 = _mm_load_pd(a);
    a += 4;
    B0 = _mm_load_pd(b);
    S00 = _mm_add_pd(S00,_mm_mul_pd(A0,B0));
    B1 = _mm_load_pd(b+2);
    b += 8;
    S01 = _mm_add_pd(S01,_mm_mul_pd(A0,B1));
    B2 = _mm_load_pd(b-4);
    S02 = _mm_add_pd(S02,_mm_mul_pd(A0,B2));
    B3 = _mm_load_pd(b-2);
    S03 = _mm_add_pd(S03,_mm_mul_pd(A0,B3));
    A1 = _mm_load_pd(a-2);
    S10 = _mm_add_pd(S10,_mm_mul_pd(A1,B0));
    S11 = _mm_add_pd(S11,_mm_mul_pd(A1,B1));
    S12 = _mm_add_pd(S12,_mm_mul_pd(A1,B2));
    S13 = _mm_add_pd(S13,_mm_mul_pd(A1,B3));
    A0 = _mm_load_pd(a);
    a += 4;
    B0 = _mm_load_pd(b);
    S00 = _mm_add_pd(S00,_mm_mul_pd(A0,B0));
    B1 = _mm_load_pd(b+2);
    b += 8;
    S01 = _mm_add_pd(S01,_mm_mul_pd(A0,B1));
    B2 = _mm_load_pd(b-4);
    S02 = _mm_add_pd(S02,_mm_mul_pd(A0,B2));
    B3 = _mm_load_pd(b-2);
    S03 = _mm_add_pd(S03,_mm_mul_pd(A0,B3));
    A1 = _mm_load_pd(a-2);
    S10 = _mm_add_pd(S10,_mm_mul_pd(A1,B0));
    S11 = _mm_add_pd(S11,_mm_mul_pd(A1,B1));
    S12 = _mm_add_pd(S12,_mm_mul_pd(A1,B2));
    S13 = _mm_add_pd(S13,_mm_mul_pd(A1,B3));
    A0 = _mm_load_pd(a);
    a += 4;
    B0 = _mm_load_pd(b);
    S00 = _mm_add_pd(S00,_mm_mul_pd(A0,B0));
    B1 = _mm_load_pd(b+2);
    b += 8;
    S01 = _mm_add_pd(S01,_mm_mul_pd(A0,B1));
    B2 = _mm_load_pd(b-4);
    S02 = _mm_add_pd(S02,_mm_mul_pd(A0,B2));
    B3 = _mm_load_pd(b-2);
    S03 = _mm_add_pd(S03,_mm_mul_pd(A0,B3));
    A1 = _mm_load_pd(a-2);
    S10 = _mm_add_pd(S10,_mm_mul_pd(A1,B0));
    S11 = _mm_add_pd(S11,_mm_mul_pd(A1,B1));
    S12 = _mm_add_pd(S12,_mm_mul_pd(A1,B2));
    S13 = _mm_add_pd(S13,_mm_mul_pd(A1,B3));
    A0 = _mm_load_pd(a);
    a += 4;
    B0 = _mm_load_pd(b);
    S00 = _mm_add_pd(S00,_mm_mul_pd(A0,B0));
    B1 = _mm_load_pd(b+2);
    b += 8;
    S01 = _mm_add_pd(S01,_mm_mul_pd(A0,B1));
    B2 = _mm_load_pd(b-4);
    S02 = _mm_add_pd(S02,_mm_mul_pd(A0,B2));
    B3 = _mm_load_pd(b-2);
    S03 = _mm_add_pd(S03,_mm_mul_pd(A0,B3));
    A1 = _mm_load_pd(a-2);
    S10 = _mm_add_pd(S10,_mm_mul_pd(A1,B0));
    S11 = _mm_add_pd(S11,_mm_mul_pd(A1,B1));
    S12 = _mm_add_pd(S12,_mm_mul_pd(A1,B2));
    S13 = _mm_add_pd(S13,_mm_mul_pd(A1,B3));
  } while (--k>0);
  _mm_store_pd(y, _mm_hadd_pd(S00,S10));
  _mm_store_pd(y+ldy, _mm_hadd_pd(S01,S11));
  _mm_store_pd(y+2*ldy, _mm_hadd_pd(S02,S12));
  _mm_store_pd(y+3*ldy, _mm_hadd_pd(S03,S13));
}
EOF
  }
  # we set the entry point of the kernel
  k.procedure = p
  k
end

These helper functions pack lines and columns from the source matrices:


In [ ]:
def pack_lines(mat, line_start, tile_size_y:, vector_length:, unroll:, **ignored)
  # Number of vector groups needed to cover a line, rounded up
  # to a multiple of the unrolling factor.
  n_vec = (mat.shape[0].to_f / vector_length).ceil
  n_vec = (n_vec.to_f/unroll).ceil*unroll
  # Aligned buffer holding vector_length elements of tile_size_y
  # consecutive lines, interleaved group by group.
  package = ANArray::new(mat.typecode, vector_length*mat.element_size,
    vector_length, tile_size_y, n_vec)
  mat.shape[0].times { |i|
    (line_start...[line_start + tile_size_y, mat.shape[1]].min).each { |j|
      package[i%vector_length, j-line_start, i/vector_length] = mat[i, j]
    }
  }
  package
end

In [ ]:
def pack_columns(mat, column_start, tile_size_x:, vector_length:, unroll:, **ignored)
  # Same layout as pack_lines, but packing tile_size_x columns of mat
  # (note the transposed access mat[j, i]).
  n_vec = (mat.shape[1].to_f / vector_length).ceil
  n_vec = (n_vec.to_f/unroll).ceil*unroll
  package = ANArray::new(mat.typecode, vector_length*mat.element_size,
    vector_length, tile_size_x, n_vec)
  mat.shape[1].times { |i|
    (column_start...[column_start + tile_size_x, mat.shape[0]].min).each { |j|
      package[i%vector_length, j-column_start, i/vector_length] = mat[j, i]
    }
  }
  package
end
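
As a quick illustration (a toy example, not part of the benchmark flow, assuming NArray's indgen! method to fill the matrix with consecutive values), packing the first two lines of a small matrix makes the interleaving visible:


In [ ]:
# Toy example: pack the first 2 lines of a 4x4 matrix filled with 0..15.
m = NMatrix::new( type_map[get_default_real_size], 4, 4 ).indgen!
p pack_lines(m, 0, tile_size_y: 2, vector_length: 2, unroll: 1)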

Testing the reference

We will use NArray's matrix multiply implementation to generate a reference result block of the matrix:


In [ ]:
a = NMatrix::new( type_map[get_default_real_size], 5000, 100 ).randomn!
b = a.transpose
c = a * b
column_start = 8
line_start = 4
tsy = 2
tsx = 4
p c_ruby_block = c[line_start...(line_start+tsy), column_start...(column_start+tsx)]
nil;

We pack the corresponding lines and columns:


In [ ]:
pl = pack_lines(a, line_start, vector_length: 2, tile_size_x: 4, tile_size_y: 2, unroll: 4)
pc = pack_columns(b, column_start, vector_length: 2, tile_size_x: 4, tile_size_y: 2, unroll: 4)
nil;

We obtain the computing kernel and build it.


In [ ]:
k_ref = reference
k_ref.build
nil;

We allocate a result buffer and set some performance evaluation variables.


In [ ]:
c_ref_block = NMatrix::new( type_map[get_default_real_size], 2, 4 )
repeat = 100
repeat_inner = 10
epsilon = 1e-8
nil;

We execute the kernel $repeat$ times and gather the results. Inside BOAST the computation is done $repeat\_inner$ times to account for the very short computation time.


In [ ]:
res = repeat.times.collect {
  k_ref.run(pl, pc, pl.shape[2], c_ref_block, 2, repeat: repeat_inner)
}
nil;

We verify the precision of the computation:


In [ ]:
p c_ref_block
err = c_ruby_block - c_ref_block
raise "Computation error" if err.abs.max > epsilon

We pick the shortest runtime to compute the performance of the kernel. The arithmetic complexity is $2 \times ALineLength \times TileSize_x \times TileSize_y$, i.e. $2 \times 5000 \times 4 \times 2 = 80000$ flops per call here, and we account for the repetitions inside BOAST.


In [ ]:
best = res.min { |r1, r2|
  r1[:duration] <=> r2[:duration]
}
perf = repeat_inner * 2 * a.shape[0] * 4 * 2 / (best[:duration] * 1e9 )
puts "time: #{best[:duration]} s, GFlops: #{perf}"
nil;

Creating a new Meta-Programmed Implementation

We want to be able to choose the length of the vectors, the tiling, and the inner unrolling. So we define a function that accepts those parameters and returns the corresponding kernel. For default values we use the parameters of the reference kernel. We slightly modified the interface so that the kernel also compiles using FORTRAN.


In [ ]:
def inner_kernel(opts={})
  default = {vector_length: 2, tile_size_x: 4, tile_size_y:2, unroll: 4}
  opts = default.merge(opts)
  vl = opts[:vector_length]
  tsx = opts[:tile_size_x]
  tsy = opts[:tile_size_y]
  unroll = opts[:unroll]

  # We describe the interface of the procedure:
  nvec = Int :n, dir: :in
  a = Real :a, vector_length: vl, dim: [Dim(tsy), Dim(nvec)], dir: :in, restrict: true
  b = Real :b, vector_length: vl, dim: [Dim(tsx), Dim(nvec)], dir: :in, restrict: true
  c = Real :c, dim: [Dim(tsy),Dim(tsx)], dir: :inout
  # The procedure body directly follows the Procedure declaration:
  p = Procedure( :"inner_v#{vl}_x#{tsx}_y#{tsy}_u#{unroll}", [nvec, a, b, c]) {
    # We allocate one vector per resulting value:
    tmp_res = tsx.times.collect { |k|
      tsy.times.collect { |l|
        Real :"tmpres_#{k}_#{l}", vector_length: vl
      }
    }
    # And one temporary value per tile dimension:
    tmp_a = tsy.times.collect { |l|
      Real :"tmpa_#{l}", vector_length: vl
    }
    tmp_b = tsx.times.collect { |l|
      Real :"tmpb_#{l}", vector_length: vl
    }
    # An iterator variable
    i = Int :i

    # Declaration of all the variables
    decl *tmp_res.flatten
    decl *tmp_a
    decl *tmp_b
    decl i
    
    # Initialization of the temporary result values
    tmp_res.flatten.each { |tmp|
      pr tmp.set 0.0
    }
    # The inner block will multiply tsy vectors of A with tsx vectors of B
    # We use a flag map to keep track of the values that have already been loaded
    p_inn = lambda { |offset|
      loaded = {}
      tsy.times { |k|
        pr tmp_a[k] === a[k, offset]
        tsx.times { |l|
          unless loaded[l]
            pr tmp_b[l] === b[l, offset]
            loaded[l] = true
          end
          # The FMA(a, b, c) function will return a * b + c
          # if the architecture doesn't support FMA
          pr tmp_res[l][k] === FMA(tmp_a[k], tmp_b[l], tmp_res[l][k])
        }
      }
    }
    # The main loop, unrolled
    pr For(i, 0, nvec - 1, step: unroll) {
      unroll.times { |j|
        p_inn.call(i+j)
      }
    }
    # We compute the resulting values by reducing the vectors components
    tsy.times { |k|
      tsx.times { |j|
        pr c[k,j] === vl.times.collect { |l| tmp_res[j][k][l] }.reduce(:+)
      }
    }
  }
  
  # The ckernel method wraps the procedure into a CKernel whose body
  # is the BOAST print of the procedure.
  p.ckernel(:includes => "immintrin.h")
end

In [ ]:
k = inner_kernel
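
The generated source can be inspected by printing the kernel (the same idiom is used later to dump failing kernels during the optimization):


In [ ]:
puts k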

In [ ]:
#set_verbose(true)
#set_debug_source(true)
k.build
nil;

Checking the kernel result and first performance evaluation:


In [ ]:
c_block = NMatrix::new( type_map[get_default_real_size], 2, 4)
res = repeat.times.collect {
  k.run(pl.shape[2], pl, pc, c_block, repeat: repeat_inner)
}
p c_block
err = c_ruby_block - c_block
raise "Computation error" if err.abs.max > epsilon
best = res.min { |r1, r2|
  r1[:duration] <=> r2[:duration]
}
perf = repeat_inner * 2 * a.shape[0] * 4 * 2 / (best[:duration] * 1e9 )
puts "time: #{best[:duration]} s, GFlops: #{perf}"
nil;

Generating new versions, checking the correctness and finding the best version

First we define the exploration space:


In [ ]:
s = OptimizationSpace::new(vector_length: [2,4], tile_size_x: 1..4, tile_size_y: 1..4, unroll: 1..4)
s.to_h

The optimizer will create all possible configurations ($2 \times 4 \times 4 \times 4 = 128$ here) and pass them to the block. The return value of the block is used as the metric, and the optimizer will search for the configuration yielding the minimum metric.


In [ ]:
optimum = BruteForceOptimizer::new(s).optimize { |opts|
  p opts
  tsx = opts[:tile_size_x]
  tsy = opts[:tile_size_y]
  c_ruby_block = c[line_start...(line_start + tsy), column_start...(column_start + tsx)]
  # We create and compile a new kernel for each configuration
  k = inner_kernel(**opts)
  k.build
  # Instantiate a new result block and pack the corresponding lines and columns
  c_block = NMatrix::new( type_map[get_default_real_size], tsy, tsx)
  pl = pack_lines(a, line_start, **opts)
  pc = pack_columns(b, column_start, **opts)
  # We evaluate the kernel's performance and accuracy using the same method as before
  res = repeat.times.collect {
    k.run(pl.shape[2], pl, pc, c_block, repeat: repeat_inner)
  }
  err = c_ruby_block - c_block
  err.each { |v|
    if v.abs > epsilon
      puts k
      puts "c:"
      p c_ruby_block
      puts "c_block:"
      p c_block
      puts "error:"
      p err
      raise "Computation error!" if v.abs > epsilon
    end
  }
  best = res.min { |r1, r2|
    r1[:duration] <=> r2[:duration]
  }
  perf = repeat_inner * 2 * a.shape[0] * tsx * tsy / (best[:duration] * 1e9 )
  puts "time: #{best[:duration]} s, GFlops: #{perf}"
  # We try to maximize performance, so we return the inverse
  1.0/perf
}
nil;

The resulting configuration, using the default compilation flags and the experimenter's architecture:


In [ ]:
p optimum[0]
puts "GFlops: #{1/optimum[1]}"

In [ ]:
k = inner_kernel(**optimum[0])
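
As a final check, the winning kernel can be rebuilt and validated with its own packing parameters. This is only a sketch reusing the helpers and the timing loop defined above:


In [ ]:
# Rebuild and re-validate the optimal kernel (sketch reusing earlier helpers).
k.build
tsx = optimum[0][:tile_size_x]
tsy = optimum[0][:tile_size_y]
c_best_block = NMatrix::new( type_map[get_default_real_size], tsy, tsx )
pl = pack_lines(a, line_start, **optimum[0])
pc = pack_columns(b, column_start, **optimum[0])
res = repeat.times.collect {
  k.run(pl.shape[2], pl, pc, c_best_block, repeat: repeat_inner)
}
err = c[line_start...(line_start + tsy), column_start...(column_start + tsx)] - c_best_block
raise "Computation error" if err.abs.max > epsilon
best = res.min { |r1, r2| r1[:duration] <=> r2[:duration] }
puts "GFlops: #{repeat_inner * 2 * a.shape[0] * tsx * tsy / (best[:duration] * 1e9)}"
nil;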