Julia developer Simon Danisch set out a "challenge" to other programming languages: implement fast array computing with broadcasting!
https://nextjournal.com/sdanisch/the-julia-challenge
This is quite elegant in Julia, but we thought: C++ can do this, too.
In [ ]:
#include <algorithm>
#include <array>
#include <cstddef>
#include <iostream>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>
In [ ]:
// closure_type decides how an expression should *store* an argument:
// lvalues are held by (possibly const) reference, rvalues are moved in
// and held by value. This is what makes lazy expressions safe to return.
template <class S>
struct closure_type
{
    // Preserve const-ness of the referred-to object on the stored type.
    using underlying_type = std::conditional_t<std::is_const<std::remove_reference_t<S>>::value,
                                               const std::decay_t<S>,
                                               std::decay_t<S>>;
    // Consistently use the _t alias form (the original mixed
    // std::conditional_t with typename std::conditional<...>::type).
    using type = std::conditional_t<std::is_lvalue_reference<S>::value,
                                    underlying_type&,
                                    underlying_type>;
};

template <class T>
using closure_type_t = typename closure_type<T>::type;
In [ ]:
// qview: an N-dimensional view over a linear block of memory.
// CT is the storage closure type (value or reference, see closure_type);
// N is the number of dimensions.
template <class CT, std::size_t N>
class qview
{
public:
// Element type of the underlying container.
using value_type = typename std::decay_t<CT>::value_type;
// Constructor to adapt a block of linear memory (data) to a n-dimensional shape
explicit qview(CT data, const ptrdiff_t (&i_shape)[N]);
// The indexing function
template <class... Args>
value_type& operator()(Args... args);
template <class... Args>
const value_type& operator()(Args... args) const;
// Note that shape and strides are stored in std::arrays to benefit from an allocation
// on the stack
std::array<ptrdiff_t, N> shape;
std::array<ptrdiff_t, N> strides;
// The adapted memory block, stored by value or by reference per closure_type.
closure_type_t<CT> memory;
private:
// This function computes the row-major strides for a n-dimensional view on the data
// For more information on how array strides work, this Wikipedia article dives
// further into the theory https://en.wikipedia.org/wiki/Stride_of_an_array
auto compute_strides();
// Special case: compute offset without arguments == 0
auto constexpr compute_offset() const;
// Maps an n-dimensional index to a linear offset (inner product with strides).
template <class Arg, class... Args>
auto constexpr compute_offset(Arg a1, Args... args) const;
};
In [ ]:
// Constructor: adapts a block of linear memory (data) to an n-dimensional shape.
template <class CT, std::size_t N>
qview<CT, N>::qview(CT data, const ptrdiff_t (&i_shape)[N])
    : memory(std::forward<CT>(data))
{
    // Record the requested extents...
    for (std::size_t dim = 0; dim < N; ++dim)
    {
        shape[dim] = i_shape[dim];
    }
    // ...and derive the matching row-major strides from them.
    compute_strides();
}
In [ ]:
// Computes row-major strides: the last dimension is contiguous and each
// earlier dimension's stride is the product of all later extents.
// Returns the total number of elements addressed by the view.
template <class CT, std::size_t N>
auto qview<CT, N>::compute_strides()
{
    ptrdiff_t running_size = 1;
    if constexpr (N > 0)
    {
        std::ptrdiff_t dim = N;
        while (dim-- > 0)
        {
            // Broadcasting trick: an extent-1 dimension gets stride 0, so
            // indexing along it never moves through memory.
            strides[dim] = (shape[dim] == 1) ? 0 : running_size;
            running_size *= shape[dim];
        }
    }
    return running_size;
}
In [ ]:
// Special case: computing the offset with no arguments yields 0.
template <class CT, std::size_t N>
auto constexpr qview<CT, N>::compute_offset() const
{
    return ptrdiff_t(0);
}

// And the "real" compute offset function: maps an n-dimensional index to a
// linear offset via the inner product with the strides.
template <class CT, std::size_t N>
template <class Arg, class... Args>
auto constexpr qview<CT, N>::compute_offset(Arg a1, Args... args) const
{
    // If we have more index arguments than dimensions, drop the leading index
    // (broadcasting aligns shapes at the end, so leading indices are spurious).
    if constexpr (sizeof...(Args) + 1 > N)
    {
        return compute_offset(args...);
    }
    else
    {
        // Unpack the arguments into an index array. Cast to ptrdiff_t (the
        // array's element type) rather than long, which is only 32 bits on
        // LLP64 platforms such as 64-bit Windows.
        std::array<ptrdiff_t, sizeof...(Args) + 1> idx{static_cast<ptrdiff_t>(a1),
                                                       static_cast<ptrdiff_t>(args)...};
        ptrdiff_t offset = 0;
        // Iterate only over the indices actually supplied: looping to N (as the
        // original did) reads past the end of idx when fewer than N indices are
        // given. For exactly N indices the behavior is unchanged.
        for (std::size_t i = 0; i < idx.size(); ++i)
        {
            offset += strides[i] * idx[i];
        }
        return offset;
    }
}
In [ ]:
// The indexing function (mutable access).
template <class CT, std::size_t N>
template <class... Args>
auto qview<CT, N>::operator()(Args... args) -> value_type&
{
    // Translate the multi-dimensional index into a flat buffer position.
    const std::size_t flat_index = compute_offset(args...);
    return memory[flat_index];
}
In [ ]:
// The indexing function (read-only access).
template <class CT, std::size_t N>
template <class... Args>
auto qview<CT, N>::operator()(Args... args) const -> const value_type&
{
    // Translate the multi-dimensional index into a flat buffer position.
    const std::size_t flat_index = compute_offset(args...);
    return memory[flat_index];
}
In [ ]:
// A flat buffer of 9 doubles; the views below reinterpret it as n-dimensional.
auto v = std::vector<double>({1,2,3,4,5,6,7,8,9});
In [ ]:
// CTAD deduces qview<std::vector<double>, 2>; the vector is copied into the view.
auto qv = qview{v, {3, 3}};
The contents of qv should look like:
1, 2, 3
4, 5, 6
7, 8, 9
In [ ]:
// Row-major lookup: row 2, column 1 -> 8.
qv(2, 1)
In [ ]:
// The same data viewed as 3x1x1x3; the size-1 dimensions get stride 0 (broadcasting).
auto qv2 = qview{v, {3, 1, 1, 3}};
In [ ]:
// All three expressions read the same element (6): extra leading indices on
// qv are dropped, and the size-1 dimensions of qv2 have stride 0, so their
// indices do not move through memory.
std::cout << qv(1, 2) << " == " <<
qv(5, 10, 1, 2) << " == " <<
qv2(1, 10, 4, 2);
In [ ]:
// Computes, at compile time, the largest static size (std::tuple_size) among
// all argument types. Used to size the broadcast shape of a qfunction.
// A plain loop replaces std::max_element here: the algorithm is constexpr
// only from C++20 on, while the rest of this code targets C++17.
template <class... Args>
constexpr auto max_dim() {
    constexpr std::size_t dims[] = {std::tuple_size<Args>::value...};
    std::size_t largest = 0;
    for (std::size_t d : dims)
    {
        if (d > largest) largest = d;
    }
    return largest;
}
In [ ]:
// A simple lazy, broadcasting function
// qfunction represents an unevaluated element-wise expression: calling it
// with indices evaluates the functor on the correspondingly indexed arguments.
template <class F, class... X>
class qfunction
{
public:
F f; // the functor
// with the max_dim constexpr function we compute the largest shape size on all arguments
std::array<std::ptrdiff_t, max_dim<std::decay_t<decltype(std::declval<X>().shape)>...>()> shape;
// Arguments stored by value or reference, as decided by closure_type.
std::tuple<X...> args;
qfunction(F i_f, X... i_args);
// We unpack the index into all arguments and put them all into our functor!
template <std::size_t... I, class... Idxs>
auto access_impl(std::index_sequence<I...>, Idxs... idxs) const;
template <class... Idx>
auto operator()(Idx... idxs) const;
};
In [ ]:
template <typename T, typename F, std::size_t... I>
void for_each_impl(F&& f, T&& tuple, std::index_sequence<I...>) {
(void) std::initializer_list<int>{
(f(std::get<I>(std::forward<T>(tuple))), void(), int{})...
};
}
template <typename T, typename F>
void for_each(F&& f, T&& tuple) {
constexpr std::size_t N = std::tuple_size<std::decay_t<T>>::value;
for_each_impl(std::forward<F>(f), std::forward<T>(tuple),
std::make_index_sequence<N>{});
}
Computes the broadcast shape for all function arguments.
When one of the aligned dimensions is 1, the bigger one is chosen. E.g.
broadcast { (3, 4), (3, 3, 1) } leads to
(1) 3, 4
3, 3, 1
===========
3, 3, 4
In [ ]:
// Constructor: stores the functor and arguments, then computes the broadcast
// shape of the whole expression from the shapes of all arguments.
template <class F, class... X>
qfunction<F, X...>::qfunction(F i_f, X... i_args)
: f(i_f), args(i_args...)
{
// Start from all-ones: a size-1 dimension can broadcast against anything.
std::fill(shape.begin(), shape.end(), 1);
// Lambda to compute the broadcasted shape
auto broadcast_shape = [this](const auto& v) constexpr {
// we need the offset to "align" the shapes at the end
// the size of the shape of the function is the max size of all arguments
std::size_t offset = this->shape.size() - v.shape.size();
for (std::size_t i = 0; i < v.shape.size(); ++i)
{
if (this->shape[offset + i] == 1)
{
// Our dimension is still 1: adopt the argument's extent.
this->shape[offset + i] = v.shape[i];
}
else
{
// Otherwise extents must match exactly, unless the argument's is 1.
if (v.shape[i] != this->shape[offset + i] && v.shape[i] != 1)
throw std::runtime_error("Broadcast error.");
}
}
return true;
};
// broadcast shape of all arguments
for_each(broadcast_shape, args);
}
In [ ]:
// Applies the stored functor: indexes every stored argument with the same
// (idxs...) and feeds the resulting elements to f. I... enumerates the
// tuple positions of the stored arguments.
template <class F, class... X>
template <std::size_t... I, class... Idxs>
auto qfunction<F, X...>::access_impl(std::index_sequence<I...>, Idxs... idxs) const
{
return f(std::get<I>(args)(idxs...)...);
}
In [ ]:
// Public indexing entry point: builds one compile-time position per stored
// argument and delegates to access_impl, forwarding the runtime indices.
template <class F, class... X>
template <class... Idxs>
auto qfunction<F, X...>::operator()(Idxs... idxs) const
{
    // index_sequence_for<X...> is equivalent to make_index_sequence<sizeof...(X)>.
    return access_impl(std::index_sequence_for<X...>{}, idxs...);
}
In [ ]:
template <class L, class... Args>
auto make_qfunction(L func, Args&&... args)
{
return qfunction<L, closure_type_t<Args>...>(func, std::forward<Args>(args)...);
}
In [ ]:
// Element-wise operator+ for views and expressions: instead of computing
// eagerly it returns a lazy qfunction that adds on access.
template <class L, class R>
auto operator+(L&& lexpr, R&& rexpr) {
    auto plus = [](const auto& lhs, const auto& rhs) {
        return lhs + rhs;
    };
    return make_qfunction(plus, std::forward<L>(lexpr), std::forward<R>(rexpr));
}
In [ ]:
// Nothing is computed here: qf is a lazy expression over qv.
auto qf = qv + qv
In [ ]:
// Direct read from the view.
qv(0, 1)
In [ ]:
// Evaluates the lazy sum at this single index only.
qf(0, 1)
In [ ]:
// Debug helper: prints the deduced type of the argument by printing the
// enclosing function's signature. __PRETTY_FUNCTION__ is a GCC/Clang
// extension (MSVC spells it __FUNCSIG__), so this helper is not portable.
template <class T>
void print_type([[maybe_unused]] T t)
{
    std::cout << __PRETTY_FUNCTION__ << std::endl;
}
In [ ]:
// Shows the full deduced type of the lazy expression.
print_type(qf)
The recursive_for starts with no index arguments at all, and then runs a for loop over every dimension, counting up. For each static dimension, a new nested for loop is generated, resulting in something like this:
for (i = 0; i < shape[0]; i++)
for (j = 0; j < shape[1]; j++)...
f(i, j, ...); // function call!
Where f is a lambda!
In [ ]:
// Compile-time generated nest of N+1 loops (dimensions 0..N).
// I is the current dimension; idxs... are the indices fixed so far.
// func is a tuple: element 0 is the expression evaluated at every index;
// element N, if the tuple is large enough, is a callback invoked once per
// outer iteration (print uses it to emit a newline per row).
template <std::size_t N, std::size_t I = 0, class... Funcs, class... Idxs>
auto recursive_for(const std::tuple<Funcs...>& func, Idxs... idxs) {
if constexpr (I == N)
{
// Innermost dimension: actually evaluate the expression.
for (std::size_t i = 0; i < std::get<0>(func).shape[I]; ++i)
{
std::get<0>(func)(idxs..., i);
}
}
else
{
for (std::size_t i = 0; i < std::get<0>(func).shape[I]; ++i)
{
// If an extra functor was supplied at position N, call it on each
// step of this outer dimension (e.g. a row separator).
if constexpr (sizeof...(Funcs) > N) { std::get<N>(func)(idxs..., i); }
// call recursively!
recursive_for<N, I + 1>(func, idxs..., i);
}
}
}
In [ ]:
// Prints every element of a 2-dimensional expression, one row per line.
template <class T>
void print(const T& array) {
    // Lazily wrap each element access in a statement that streams it out.
    auto element_printer = make_qfunction([](auto&& el) { std::cout << el << ", "; }, array);
    // The second tuple entry runs once per outer iteration -> newline per row.
    auto row_break = [](auto) { std::cout << "\n"; };
    recursive_for<1>(std::make_tuple(element_printer, row_break));
}
In [ ]:
// Prints qv row by row.
print(qv)
In [ ]:
// Views are writable: assign straight through the indexing operator.
qv(2, 1) = 1000;
In [ ]:
// Element (2, 1) now prints as 1000.
print(qv)
In [ ]:
// qfunctions compose: this builds and prints a deeper lazy expression.
print(qf + qv + qv)
In [ ]:
// If we want to assign the result of a qfunction to a qview, we need an
// assign function: it evaluates rexpr element-wise into larr.
template <class T, class E>
auto assign_func(T& larr, E& rexpr)
{
    // The lhs parameter is a reference inside the functor so we can write to it.
    auto assigner = make_qfunction([](auto& lhs, auto rhs) {
        lhs = rhs;
    }, larr, rexpr);
    // Loop over every dimension of the broadcast shape.
    constexpr auto last_dim = std::tuple_size_v<decltype(assigner.shape)> - 1;
    recursive_for<last_dim>(std::make_tuple(assigner));
}
In [ ]:
// Destination view: owns its own zero-initialized 9-element buffer (rvalue moved in).
auto res = qview{std::vector<double>(9), {3, 3}};
In [ ]:
// All zeros before the assignment.
print(res)
In [ ]:
// Evaluate the lazy expression qf element-wise into res.
assign_func(res, qf)
In [ ]:
// res now holds the evaluated sum.
print(res)
In [ ]:
// A rank-1 view to broadcast against the 3x3 result below.
auto bcv = qview{std::vector<double>(3), {3}};
bcv(0) = -1000;
bcv(1) = 1;
bcv(2) = -1000;
// NOTE(review): qview declares no initializer_list assignment, so this line
// looks like it cannot compile; the element-wise stores above already set
// these values. Verify intent.
bcv = {-1, 1, -1}
In [ ]:
// Broadcasting: shapes align at the end, so bcv is added to every row of res.
print(res + bcv)
https://github.com/SimonDanisch/julia-challenge/blob/master/cpp/stl/main.cpp