In [ ]:
require 'BOAST'
include BOAST
We will first import the reference implementation of the kernel directly from the BigDFT sources (slightly modified to remove the accumulation in the result).
In [ ]:
# Generate C code and use 0-based array indexing (the C convention) for
# all kernels produced by BOAST in this notebook.
set_lang(C)
set_array_start(0)
In [ ]:
# Map the default real size in bytes (4 or 8) to the matching NArray element type.
type_map = { 4 => NArray::SFLOAT, 8 => NArray::FLOAT}
This reference kernel computes a 2x4 block (transposed) of a matrix multiply operation C = A * B. As accesses can be amortized for big matrices ($n^2$ accesses for $n^3$ computations), the lines of A and columns of B are first interleaved. This kernel expects 2 interleaved lines of A and 4 interleaved columns of B.
The kernel is unrolled by a factor of 4. Maintaining and upgrading such a kernel is costly and a lot of hand tuning took place to obtain this version.
In [ ]:
# Returns a CKernel wrapping the hand-tuned SSE3 reference implementation
# (taken from the BigDFT sources) that computes a transposed 2x4 block of
# C = A * B.
# a: 2 interleaved lines of A; b: 4 interleaved columns of B; both stored
# as nvec groups of vl = 2 doubles. c receives the tsy x tsx result tile
# with leading dimension ldc.
# NOTE(review): the C loop executes n>>2 iterations of a do/while, so it
# assumes nvec is a positive multiple of 4 — confirm callers guarantee this.
def reference
tsy = 2
tsx = 4
vl = 2
# The kernel uses aligned loads (_mm_load_pd), so packed buffers must be
# aligned on vl reals.
alignment = vl*get_default_real_size
# We describe the parameters of the procedure so that BOAST can interface with it.
nvec = Int :nvec, dir: :in
a = Real :a, dim: [Dim(vl), Dim(tsy), Dim(nvec)], dir: :in
b = Real :b, dim: [Dim(vl), Dim(tsx), Dim(nvec)], dir: :in
c = Real :c, dim: [Dim(tsy), Dim(tsx)], dir: :inout
ldc = Int :ldc, dir: :in
p = Procedure( :gemm_block_2x4_c, [a, b, nvec, c, ldc])
# The source of the kernel is just a string containing the original procedure.
# The kernel is in C and needs a header in order to compile.
# (The heredoc below is emitted verbatim as the kernel source.)
k = CKernel::new(lang: C, includes: "immintrin.h") {
get_output.puts <<EOF
void gemm_block_2x4_c(const double * a,const double * b,long long int n,double * y,long long int ldy)
{
__m128d A0,A1,B0,B1,B2,B3;
__m128d S00,S01,S02,S03,S10,S11,S12,S13;
S00 = _mm_setzero_pd();
S01 = _mm_setzero_pd();
S02 = _mm_setzero_pd();
S03 = _mm_setzero_pd();
S10 = _mm_setzero_pd();
S11 = _mm_setzero_pd();
S12 = _mm_setzero_pd();
S13 = _mm_setzero_pd();
unsigned long long int k=n>>2;
do {
A0 = _mm_load_pd(a);
a += 4;
B0 = _mm_load_pd(b);
S00 = _mm_add_pd(S00,_mm_mul_pd(A0,B0));
B1 = _mm_load_pd(b+2);
b += 8;
S01 = _mm_add_pd(S01,_mm_mul_pd(A0,B1));
B2 = _mm_load_pd(b-4);
S02 = _mm_add_pd(S02,_mm_mul_pd(A0,B2));
B3 = _mm_load_pd(b-2);
S03 = _mm_add_pd(S03,_mm_mul_pd(A0,B3));
A1 = _mm_load_pd(a-2);
S10 = _mm_add_pd(S10,_mm_mul_pd(A1,B0));
S11 = _mm_add_pd(S11,_mm_mul_pd(A1,B1));
S12 = _mm_add_pd(S12,_mm_mul_pd(A1,B2));
S13 = _mm_add_pd(S13,_mm_mul_pd(A1,B3));
A0 = _mm_load_pd(a);
a += 4;
B0 = _mm_load_pd(b);
S00 = _mm_add_pd(S00,_mm_mul_pd(A0,B0));
B1 = _mm_load_pd(b+2);
b += 8;
S01 = _mm_add_pd(S01,_mm_mul_pd(A0,B1));
B2 = _mm_load_pd(b-4);
S02 = _mm_add_pd(S02,_mm_mul_pd(A0,B2));
B3 = _mm_load_pd(b-2);
S03 = _mm_add_pd(S03,_mm_mul_pd(A0,B3));
A1 = _mm_load_pd(a-2);
S10 = _mm_add_pd(S10,_mm_mul_pd(A1,B0));
S11 = _mm_add_pd(S11,_mm_mul_pd(A1,B1));
S12 = _mm_add_pd(S12,_mm_mul_pd(A1,B2));
S13 = _mm_add_pd(S13,_mm_mul_pd(A1,B3));
A0 = _mm_load_pd(a);
a += 4;
B0 = _mm_load_pd(b);
S00 = _mm_add_pd(S00,_mm_mul_pd(A0,B0));
B1 = _mm_load_pd(b+2);
b += 8;
S01 = _mm_add_pd(S01,_mm_mul_pd(A0,B1));
B2 = _mm_load_pd(b-4);
S02 = _mm_add_pd(S02,_mm_mul_pd(A0,B2));
B3 = _mm_load_pd(b-2);
S03 = _mm_add_pd(S03,_mm_mul_pd(A0,B3));
A1 = _mm_load_pd(a-2);
S10 = _mm_add_pd(S10,_mm_mul_pd(A1,B0));
S11 = _mm_add_pd(S11,_mm_mul_pd(A1,B1));
S12 = _mm_add_pd(S12,_mm_mul_pd(A1,B2));
S13 = _mm_add_pd(S13,_mm_mul_pd(A1,B3));
A0 = _mm_load_pd(a);
a += 4;
B0 = _mm_load_pd(b);
S00 = _mm_add_pd(S00,_mm_mul_pd(A0,B0));
B1 = _mm_load_pd(b+2);
b += 8;
S01 = _mm_add_pd(S01,_mm_mul_pd(A0,B1));
B2 = _mm_load_pd(b-4);
S02 = _mm_add_pd(S02,_mm_mul_pd(A0,B2));
B3 = _mm_load_pd(b-2);
S03 = _mm_add_pd(S03,_mm_mul_pd(A0,B3));
A1 = _mm_load_pd(a-2);
S10 = _mm_add_pd(S10,_mm_mul_pd(A1,B0));
S11 = _mm_add_pd(S11,_mm_mul_pd(A1,B1));
S12 = _mm_add_pd(S12,_mm_mul_pd(A1,B2));
S13 = _mm_add_pd(S13,_mm_mul_pd(A1,B3));
} while (--k>0);
_mm_store_pd(y, _mm_hadd_pd(S00,S10));
_mm_store_pd(y+ldy, _mm_hadd_pd(S01,S11));
_mm_store_pd(y+2*ldy, _mm_hadd_pd(S02,S12));
_mm_store_pd(y+3*ldy, _mm_hadd_pd(S03,S13));
}
EOF
}
# we set the entry point of the kernel
k.procedure = p
k
end
These helper functions pack lines and columns from the source matrices.
In [ ]:
# Packs +tile_size_y+ consecutive lines of +mat+, starting at +line_start+,
# into an aligned buffer where line elements are interleaved by groups of
# +vector_length+. The vector count is rounded up to a multiple of +unroll+
# so the unrolled kernel can always consume whole iterations.
# Extra keyword options (e.g. a full tuning configuration) are ignored.
def pack_lines(mat, line_start, tile_size_y:, vector_length:, unroll:, **ignored)
  vec_count = mat.shape[0].fdiv(vector_length).ceil
  vec_count = vec_count.fdiv(unroll).ceil * unroll
  # Aligned storage: vector_length x tile_size_y x vec_count elements.
  buffer = ANArray::new(mat.typecode, vector_length * mat.element_size,
                        vector_length, tile_size_y, vec_count)
  last_line = [line_start + tile_size_y, mat.shape[1]].min
  (0...mat.shape[0]).each do |col|
    (line_start...last_line).each do |row|
      buffer[col % vector_length, row - line_start, col / vector_length] = mat[col, row]
    end
  end
  buffer
end
In [ ]:
# Packs +tile_size_x+ consecutive columns of +mat+, starting at
# +column_start+, into an aligned buffer: column elements are interleaved
# by groups of +vector_length+ and the vector count is rounded up to a
# multiple of +unroll+ for the unrolled kernel.
# Extra keyword options (e.g. a full tuning configuration) are ignored.
def pack_columns(mat, column_start, tile_size_x:, vector_length:, unroll:, **ignored)
  vec_count = mat.shape[1].fdiv(vector_length).ceil
  vec_count = vec_count.fdiv(unroll).ceil * unroll
  # Aligned storage: vector_length x tile_size_x x vec_count elements.
  buffer = ANArray::new(mat.typecode, vector_length * mat.element_size,
                        vector_length, tile_size_x, vec_count)
  last_column = [column_start + tile_size_x, mat.shape[0]].min
  (0...mat.shape[1]).each do |row|
    (column_start...last_column).each do |col|
      buffer[row % vector_length, col - column_start, row / vector_length] = mat[col, row]
    end
  end
  buffer
end
In [ ]:
# Build the test data: a random 5000x100 matrix, its transpose, and their
# full product; then extract the tsy x tsx reference block at
# (line_start, column_start) that the kernels will recompute.
a = NMatrix::new( type_map[get_default_real_size], 5000, 100 ).randomn!
b = a.transpose
c = a * b
column_start = 8
line_start = 4
tsy = 2
tsx = 4
p c_ruby_block = c[line_start...(line_start+tsy), column_start...(column_start+tsx)]
nil;
We pack the corresponding lines and columns:
In [ ]:
# Pack 2 lines of a (starting at line 4) and 4 columns of b (starting at
# column 8) into the interleaved layout the reference kernel expects.
pl = pack_lines(a, 4, vector_length: 2, tile_size_x: 4, tile_size_y:2, unroll: 4)
pc = pack_columns(b, 8, vector_length: 2, tile_size_x: 4, tile_size_y:2, unroll: 4)
nil;
We obtain the computing kernel and build it.
In [ ]:
# Instantiate and compile the reference kernel.
k_ref = reference
k_ref.build
nil;
We allocate a result buffer and set some performance evaluation variables.
In [ ]:
# Result buffer plus benchmarking parameters: `repeat` outer runs, each
# performing `repeat_inner` inner repetitions inside BOAST; `epsilon`
# bounds the accepted numerical error.
c_ref_block = NMatrix::new( type_map[get_default_real_size], 2, 4 )
repeat = 100
repeat_inner = 10
epsilon = 1e-8
nil;
We execute the kernel $repeat$ times and gather the results. Inside of BOAST the computation is done $repeat\_inner$ times to account for the very short computation time.
In [ ]:
# Run the reference kernel `repeat` times; arguments follow the Procedure
# signature (a, b, nvec, c, ldc) and each call returns timing information.
res = repeat.times.collect {
k_ref.run(pl, pc, pl.shape[2], c_ref_block, 2, repeat: repeat_inner)
}
nil;
We verify the precision of the computation:
In [ ]:
# Compare the kernel result against the block computed in Ruby; fail on
# any deviation larger than epsilon.
p c_ref_block
err = c_ruby_block - c_ref_block
raise "Computation error" if err.abs.max > epsilon
We pick the shortest runtime to compute the performance of the kernel. Arithmetic complexity is $2 \times ALineLength \times TileSize_x \times TileSize_y$, and we account for the repetition inside of BOAST.
In [ ]:
# Keep the fastest of the collected runs; min_by on :duration is the
# idiomatic form of the comparator block.
best = res.min_by { |r| r[:duration] }
# Flops = repeat_inner * 2 * (length of an A line) * tile_size_x * tile_size_y.
perf = repeat_inner * 2 * a.shape[0] * 4 * 2 / (best[:duration] * 1e9 )
puts "time: #{best[:duration]} s, GFlops: #{perf}"
nil;
We want to be able to choose the length of the vectors, the tiling and the inner unrolling. So we define a function that accepts those parameters and returns the corresponding kernel. For default values we use the parameters from the reference kernel. We slightly modified the interface so that the kernel also compiles using FORTRAN.
In [ ]:
# Generates a BOAST kernel computing a transposed tile_size_y x tile_size_x
# block of C = A * B from packed, interleaved inputs (see pack_lines /
# pack_columns). Recognised options: :vector_length, :tile_size_x,
# :tile_size_y, :unroll; defaults match the hand-written reference kernel.
# NOTE(review): the unrolled loop assumes n is a multiple of :unroll —
# the packing helpers round the vector count up accordingly; confirm all
# callers go through them.
def inner_kernel(opts={})
default = {vector_length: 2, tile_size_x: 4, tile_size_y:2, unroll: 4}
opts = default.merge(opts)
vl = opts[:vector_length]
tsx = opts[:tile_size_x]
tsy = opts[:tile_size_y]
unroll = opts[:unroll]
# We describe the interface of the procedure:
nvec = Int :n, dir: :in
a = Real :a, vector_length: vl, dim: [Dim(tsy), Dim(nvec)], dir: :in, restrict: true
b = Real :b, vector_length: vl, dim: [Dim(tsx), Dim(nvec)], dir: :in, restrict: true
c = Real :c, dim: [Dim(tsy),Dim(tsx)], dir: :inout
# The procedure body directly follows the Procedure declaration:
p = Procedure( :"inner_v#{vl}_x#{tsx}_y#{tsy}_u#{unroll}", [nvec, a, b, c]) {
# We allocate one vector per resulting value:
tmp_res = tsx.times.collect { |k|
tsy.times.collect { |l|
Real :"tmpres_#{k}_#{l}", vector_length: vl
}
}
# And one temporary value per tile dimension:
tmp_a = tsy.times.collect { |l|
Real :"tmpa_#{l}", vector_length: vl
}
tmp_b = tsx.times.collect { |l|
Real :"tmpb_#{l}", vector_length: vl
}
# An iterator variable
i = Int :i
# Declaration of all the variables
decl *tmp_res.flatten
decl *tmp_a
decl *tmp_b
decl i
# Initialization of the temporary result values
tmp_res.flatten.each { |tmp|
pr tmp.set 0.0
}
# The inner block will multiply tsy vectors of A with tsx vectors of B
# We use a flag map to keep track of the values that have already been loaded
p_inn = lambda { |offset|
loaded = {}
tsy.times { |k|
pr tmp_a[k] === a[k, offset]
tsx.times { |l|
# Each B vector is loaded only once per offset, on its first use.
unless loaded[l]
pr tmp_b[l] === b[l, offset]
loaded[l] = true
end
# The FMA(a, b, c) function will return a * b + c
# if the architecture doesn't support FMA
pr tmp_res[l][k] === FMA(tmp_a[k], tmp_b[l], tmp_res[l][k])
}
}
}
# The main loop, unrolled `unroll` times per iteration
pr For(i, 0, nvec - 1, step: unroll) {
unroll.times { |j|
p_inn.call(i+j)
}
}
# We compute the resulting values by reducing the vectors components
tsy.times { |k|
tsx.times { |j|
pr c[k,j] === vl.times.collect { |l| tmp_res[j][k][l] }.reduce(:+)
}
}
}
# The kernel body is the BOAST print of the procedure; ckernel builds a
# CKernel around it with the header needed for the intrinsics.
p.ckernel(:includes => "immintrin.h")
end
In [ ]:
# Build the generated kernel with the default (reference-like) parameters.
k = inner_kernel
In [ ]:
# Uncomment the two lines below to inspect the generated source and the
# compiler invocations while building.
#set_verbose(true)
#set_debug_source(true)
k.build
nil;
In [ ]:
# Allocate the result tile, run the generated kernel `repeat` times, and
# check the result against the reference block computed in Ruby.
c_block = NMatrix::new( type_map[get_default_real_size], 2, 4)
res = repeat.times.collect {
k.run(pl.shape[2], pl, pc, c_block, repeat: repeat_inner)
}
p c_block
err = c_ruby_block - c_block
raise "Computation error" if err.abs.max > epsilon
# Keep the fastest run; min_by on :duration is the idiomatic form of the
# comparator block.
best = res.min_by { |r| r[:duration] }
# Flops = repeat_inner * 2 * (length of an A line) * tile_size_x * tile_size_y.
perf = repeat_inner * 2 * a.shape[0] * 4 * 2 / (best[:duration] * 1e9 )
puts "time: #{best[:duration]} s, GFlops: #{perf}"
nil;
First we define the exploration space:
In [ ]:
# Exploration space: SSE-width (2) or AVX-width (4) vectors, tile sizes
# 1..4 in each direction, and unroll factors 1..4.
s = OptimizationSpace::new(vector_length: [2,4], tile_size_x: 1..4, tile_size_y: 1..4, unroll: 1..4)
s.to_h
The optimizer will create all possible configurations and pass them to the block. The result of the block is the metric used, and the optimizer will search for the configuration yielding the minimum metric.
In [ ]:
# Brute-force search over the space: each configuration gets its own
# kernel, its own packed inputs, and is benchmarked like the reference.
# The block returns 1/GFlops, which the optimizer minimizes.
optimum = BruteForceOptimizer::new(s).optimize { |opts|
p opts
tsx = opts[:tile_size_x]
tsy = opts[:tile_size_y]
# Reference block for this tile size, computed by Ruby's matrix product.
c_ruby_block = c[line_start...(line_start + tsy), column_start...(column_start + tsx)]
# We create and compile a new kernel for each configuration
k = inner_kernel(**opts)
k.build
# Instantiate a new result block and pack the corresponding lines and columns
c_block = NMatrix::new( type_map[get_default_real_size], tsy, tsx)
pl = pack_lines(a, line_start, **opts)
pc = pack_columns(b, column_start, **opts)
# We evaluate the kernel's performance and accuracy using the same method as before
res = repeat.times.collect {
k.run(pl.shape[2], pl, pc, c_block, repeat: repeat_inner)
}
err = c_ruby_block - c_block
err.each { |v|
if v.abs > epsilon
# Dump the offending kernel and data before failing.
puts k
puts "c:"
p c_ruby_block
puts "c_block:"
p c_block
puts "error:"
p err
# The guard above already established v.abs > epsilon, so raise unconditionally.
raise "Computation error!"
end
}
# Keep the fastest run (min_by is the idiomatic comparator here).
best = res.min_by { |r| r[:duration] }
perf = repeat_inner * 2 * a.shape[0] * tsx * tsy / (best[:duration] * 1e9 )
puts "time: #{best[:duration]} s, GFlops: #{perf}"
# we try to maximize perf so we return the inverse
1.0/perf
}
nil;
The resulting configuration, using the default compilation flags and the experimenters architecture:
In [ ]:
# Best configuration found and its performance (the metric was 1/GFlops).
p optimum[0]
puts "GFlops: #{1/optimum[1]}"
In [ ]:
# Rebuild the kernel with the optimal configuration.
k = inner_kernel(**optimum[0])