The Julia Challenge in C++

Julia developer Simon Danisch issued a "challenge" to other programming languages: implement fast array computing with broadcasting!

https://nextjournal.com/sdanisch/the-julia-challenge

This is quite elegant in Julia, but we thought: C++ can do this, too.


In [ ]:
#include <vector>
#include <array>
#include <tuple>
#include <algorithm>
#include <iostream>
#include <type_traits>      // std::conditional_t, std::decay_t, std::is_lvalue_reference
#include <utility>          // std::forward, std::declval, std::index_sequence
#include <initializer_list>
#include <cstddef>          // std::ptrdiff_t, std::size_t
#include <stdexcept>        // std::runtime_error

Closure Type

For modern C++ we decide whether to store a value or a reference, depending on the reference type of the argument (rvalue vs. lvalue).

The closure type complements the move semantics of the STL: together with perfect forwarding, it lets you express who should own the memory.

  • If T&& or T, store T
  • If T&, store T&

In [ ]:
template <class S>
struct closure_type
{
    using underlying_type = std::conditional_t<std::is_const<std::remove_reference_t<S>>::value,
                                               const std::decay_t<S>,
                                               std::decay_t<S>>;

    using type = typename std::conditional<std::is_lvalue_reference<S>::value,
                                           underlying_type&,
                                           underlying_type>::type;
};

template <class T>
using closure_type_t = typename closure_type<T>::type;
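
A quick sanity check (a minimal sketch) that the mapping behaves as described:

In [ ]:
// lvalue references are stored as references, everything else by value
static_assert(std::is_same_v<closure_type_t<std::vector<double>&>, std::vector<double>&>, "lvalue -> stored as reference");
static_assert(std::is_same_v<closure_type_t<std::vector<double>&&>, std::vector<double>>, "rvalue -> stored by value");
static_assert(std::is_same_v<closure_type_t<std::vector<double>>, std::vector<double>>, "value -> stored by value");
static_assert(std::is_same_v<closure_type_t<const std::vector<double>&>, const std::vector<double>&>, "constness is preserved");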

We need an n-dimensional array class


In [ ]:
template <class CT, std::size_t N>
class qview
{
public:
    using value_type = typename std::decay_t<CT>::value_type;

    // Constructor to adapt a block of linear memory (data) to an n-dimensional shape
    explicit qview(CT data, const ptrdiff_t (&i_shape)[N]);

    // The indexing function
    template <class... Args>
    value_type& operator()(Args... args);

    template <class... Args>
    const value_type& operator()(Args... args) const;

    // Note that shape and strides are stored in std::arrays to benefit from an allocation
    // on the stack
    std::array<ptrdiff_t, N> shape;
    std::array<ptrdiff_t, N> strides;

    closure_type_t<CT> memory;

private:
    // This function computes the row-major strides for an n-dimensional view on the data.
    // For more information on how array strides work, see
    // https://en.wikipedia.org/wiki/Stride_of_an_array
    auto compute_strides();

    // Special case: compute offset without arguments == 0
    auto constexpr compute_offset() const;

    template <class Arg, class... Args>
    auto constexpr compute_offset(Arg a1, Args... args) const;
};

Constructor


In [ ]:
// Constructor to adapt a block of linear memory (data) to an n-dimensional shape
template <class CT, std::size_t N>
qview<CT, N>::qview(CT data, const ptrdiff_t (&i_shape)[N])
    : memory(std::forward<CT>(data))
{
    std::copy(std::begin(i_shape), std::end(i_shape), shape.begin());

    // The following function will compute the strides for the shape!
    compute_strides();
}

Strides computation

Strides are computed so that multiplying an index with the strides produces the correct offset into the linear block of memory.


In [ ]:
template <class CT, std::size_t N>
auto qview<CT, N>::compute_strides()
{
    ptrdiff_t data_size = 1;
    if constexpr (N > 0)
    {
        for (std::ptrdiff_t i = N - 1; i >= 0; --i)
        {
            // This is a trick for broadcasting: if the shape is 1 in dim (i), 
            // we set the stride to 0 to not move
            strides[i] = shape[i] != 1 ? data_size : 0;
            data_size *= shape[i];
        }
    }
    return data_size;
}
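
To make the broadcasting trick concrete, here is a quick check (a sketch; stride_demo is just an illustration variable) using the public strides member:

In [ ]:
std::vector<double> stride_demo(9);

// row-major strides for shape (3, 3): prints "3, 1"
auto sv = qview{stride_demo, {3, 3}};
std::cout << sv.strides[0] << ", " << sv.strides[1] << std::endl;

// dimensions of extent 1 get stride 0: prints "3, 0, 0, 1"
auto sb = qview{stride_demo, {3, 1, 1, 3}};
std::cout << sb.strides[0] << ", " << sb.strides[1] << ", "
          << sb.strides[2] << ", " << sb.strides[3] << std::endl;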

Compute offset functions

The inner product of the strides with the index produces the memory offset:

$$\text{offset} = \sum_{i=0}^{N-1}{\text{strides}[i] \cdot \text{index}[i]}$$

For example, for a (3, 3) view with strides (3, 1), the index (2, 1) yields offset $= 3 \cdot 2 + 1 \cdot 1 = 7$.

In [ ]:
// Special case: compute offset without arguments == 0
template <class CT, std::size_t N>
auto constexpr qview<CT, N>::compute_offset() const
{
    return ptrdiff_t(0);
} 

// And the "real" compute offset function
template <class CT, std::size_t N>
template <class Arg, class... Args>
auto constexpr qview<CT, N>::compute_offset(Arg a1, Args... args) const
{
    // If we have more index arguments than dimensions, remove the first index
    if constexpr (sizeof...(Args) + 1 > N)
    {
        return compute_offset(args...);
    }
    else
    {
        // unpack the arguments into an index and compute the inner product with the strides
        std::array<ptrdiff_t, sizeof...(Args) + 1> idx = {static_cast<ptrdiff_t>(a1), static_cast<ptrdiff_t>(args)...};
        ptrdiff_t offset = 0;
        for (std::size_t i = 0; i < N; ++i)
        {
            offset += strides[i] * idx[i];
        }
        return offset;
    }
}

Access operator


In [ ]:
// The indexing function
template <class CT, std::size_t N>
template <class... Args>
auto qview<CT, N>::operator()(Args... args) -> value_type&
{
    ptrdiff_t offset = compute_offset(args...);
    return memory[offset];
}

In [ ]:
// The indexing function
template <class CT, std::size_t N>
template <class... Args>
auto qview<CT, N>::operator()(Args... args) const -> const value_type&
{
    ptrdiff_t offset = compute_offset(args...);
    return memory[offset];
}

Now we can create a qview!


In [ ]:
auto v = std::vector<double>({1,2,3,4,5,6,7,8,9});

In [ ]:
auto qv = qview{v, {3, 3}};

The contents of qv should look like:

1, 2, 3
4, 5, 6
7, 8, 9

In [ ]:
qv(2, 1)

Implicit broadcasting


In [ ]:
auto qv2 = qview{v, {3, 1, 1, 3}};

In [ ]:
// extra leading indices are dropped, and broadcast dimensions repeat:
// all three expressions print 6
std::cout << qv(1, 2) << " == "
          << qv(5, 10, 1, 2) << " == "
          << qv2(1, 10, 4, 2);

max dim helper function

Finds the largest static extent among multiple std::array types


In [ ]:
template <class... Args>
constexpr auto max_dim() {
    // note: std::max_element is only constexpr from C++20 on, so we use a plain loop
    constexpr std::size_t sizes[] = {std::tuple_size<Args>::value...};
    std::size_t res = 0;
    for (auto s : sizes) { res = s > res ? s : res; }
    return res;
}
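
For instance (a quick check):

In [ ]:
static_assert(max_dim<std::array<std::ptrdiff_t, 2>,
                      std::array<std::ptrdiff_t, 4>>() == 4,
              "the largest extent wins");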

A class for a lazy "qfunction"


In [ ]:
// A simple lazy, broadcasting function
template <class F, class... X>
class qfunction
{
public:
    
    F f; // the functor
    
    // with the max_dim constexpr function we compute the largest shape size over all arguments
    std::array<std::ptrdiff_t, max_dim<std::decay_t<decltype(std::declval<X>().shape)>...>()> shape;
    std::tuple<X...> args;

    qfunction(F i_f, X... i_args);
    
    // We unpack the index into all arguments and put them all into our functor!
    template <std::size_t... I, class... Idxs>
    auto access_impl(std::index_sequence<I...>, Idxs... idxs) const;
    
    template <class... Idx>
    auto operator()(Idx... idxs) const;
};

tuple for each helper function

Necessary to iterate over all expression leaves stored in the qfunction


In [ ]:
template <typename T, typename F, std::size_t... I>
void for_each_impl(F&& f, T&& tuple, std::index_sequence<I...>) {
    (void) std::initializer_list<int>{
        (f(std::get<I>(std::forward<T>(tuple))), void(), int{})...
    };
}

template <typename T, typename F>
void for_each(F&& f, T&& tuple) {
    constexpr std::size_t N = std::tuple_size<std::decay_t<T>>::value;
    for_each_impl(std::forward<F>(f), std::forward<T>(tuple),
                  std::make_index_sequence<N>{});
}
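
A minimal usage sketch:

In [ ]:
// apply a lambda to every element of a tuple
for_each([](const auto& el) { std::cout << el << " "; },
         std::make_tuple(1, 2.5, "three"));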

The constructor for the qfunction

Computes the broadcast shape for all function arguments.

  • If both extents are != 1, they have to agree
  • If either extent is 1, the bigger one is chosen
  • Shapes are implicitly extended to the left with 1s

E.g., broadcast { (3, 4), (3, 3, 1) } leads to

            (1) 3, 4
             3, 3, 1 
           ===========
             3, 3, 4

In [ ]:
template <class F, class... X>
qfunction<F, X...>::qfunction(F i_f, X... i_args)
    : f(i_f), args(i_args...)
{
    std::fill(shape.begin(), shape.end(), 1);

    // Lambda to compute the broadcast shape
    auto broadcast_shape = [this](const auto& v) {
        // we need the offset to "align" the shapes at the end;
        // the size of the shape of the function is the max size over all arguments
        std::size_t offset = this->shape.size() - v.shape.size();
        for (std::size_t i = 0; i < v.shape.size(); ++i)
        {
            if (this->shape[offset + i] == 1)
            {
                this->shape[offset + i] = v.shape[i];
            }
            else if (v.shape[i] != this->shape[offset + i] && v.shape[i] != 1)
            {
                throw std::runtime_error("Broadcast error.");
            }
        }
    };
    // broadcast shape of all arguments
    for_each(broadcast_shape, args);
}

Accessing & lazily computing function

... by unpacking the elements of args at the given index into the functor f


In [ ]:
template <class F, class... X>
template <std::size_t... I, class... Idxs>
auto qfunction<F, X...>::access_impl(std::index_sequence<I...>, Idxs... idxs) const
{ 
    return f(std::get<I>(args)(idxs...)...); 
}

In [ ]:
template <class F, class... X>
template <class... Idxs>
auto qfunction<F, X...>::operator()(Idxs... idxs) const
{
    return access_impl(std::make_index_sequence<sizeof...(X)>(), idxs...);
}

Make qfunction helper function

Uses closure_type_t to store rvalues correctly.


In [ ]:
template <class L, class... Args>
auto make_qfunction(L func, Args&&... args)
{
    return qfunction<L, closure_type_t<Args>...>(func, std::forward<Args>(args)...);
}
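
With qv and qv2 from above we can already inspect the broadcast shape (a sketch; bq is just an illustration name):

In [ ]:
// (3, 3) and (3, 1, 1, 3) broadcast to (3, 1, 3, 3)
auto bq = make_qfunction([](auto a, auto b) { return a + b; }, qv, qv2);
std::cout << bq.shape[0] << ", " << bq.shape[1] << ", "
          << bq.shape[2] << ", " << bq.shape[3];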

In [ ]:
// And here we define the operator+ for qviews, returning a lazy qfunction
template <class L, class R>
auto operator+(L&& lexpr, R&& rexpr) {
    return make_qfunction([](const auto& lhs, const auto& rhs) { 
        return lhs + rhs;
    }, std::forward<L>(lexpr), std::forward<R>(rexpr));
}
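
Other element-wise operators follow the same pattern; for example, a sketch of operator* (not used below):

In [ ]:
// lazy element-wise multiplication, same pattern as operator+
template <class L, class R>
auto operator*(L&& lexpr, R&& rexpr) {
    return make_qfunction([](const auto& lhs, const auto& rhs) {
        return lhs * rhs;
    }, std::forward<L>(lexpr), std::forward<R>(rexpr));
}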

In [ ]:
auto qf = qv + qv

In [ ]:
qv(0, 1)

In [ ]:
qf(0, 1)

In [ ]:
template <class T>
void print_type(T t)
{
    std::cout << __PRETTY_FUNCTION__ << std::endl;
}

In [ ]:
print_type(qf)

Recursive for

The recursive for starts with no arguments at all, and then runs a for loop for every dimension, counting up. For each static dimension, a new for loop is created, resulting in something like this:

for (i = 0; i < shape[0]; i++)
   for (j = 0; j < shape[1]; j++)...
      f(i, j, ...); // function call!

Where f is a lambda!


In [ ]:
template <std::size_t N, std::size_t I = 0, class... Funcs, class... Idxs>
auto recursive_for(const std::tuple<Funcs...>& func, Idxs... idxs) {
    if constexpr (I == N)
    {
        for (std::size_t i = 0; i < std::get<0>(func).shape[I]; ++i)
        {
            std::get<0>(func)(idxs..., i);        
        }
    }
    else
    {
        for (std::size_t i = 0; i < std::get<0>(func).shape[I]; ++i)
        {
            // if an extra function was supplied (e.g. the row separator in print below),
            // call it before descending into the next dimension
            if constexpr (sizeof...(Funcs) > N) { std::get<N>(func)(idxs..., i); }
            
            // call recursively!
            recursive_for<N, I + 1>(func, idxs..., i);
        }
    }
}

A simple print qfunction!


In [ ]:
template <class T>
void print(const T& array) {
    auto func = make_qfunction([](auto&& el) { std::cout << el << ", "; }, array);
    // call the recursive_for on all elements
    // (recursive_for<1> hard-codes two nested loops, i.e. print assumes a 2-D array)
    recursive_for<1>(
        std::make_tuple(func,
                        [](auto) { std::cout << "\n"; }));
}

In [ ]:
print(qv)

In [ ]:
qv(2, 1) = 1000;

In [ ]:
print(qv)

In [ ]:
print(qf + qv + qv)

In [ ]:
// If we want to assign the result of a qfunction to a qview, we need 
// an assign function:
template <class T, class E>
auto assign_func(T& larr, E& rexpr)
{
    // note that the lhs argument is a reference!
    auto func = make_qfunction([](auto& lhs, auto rhs) {
        lhs = rhs; 
    }, larr, rexpr);
    
    recursive_for<std::tuple_size_v<decltype(func.shape)> - 1>(std::make_tuple(func));
}

In [ ]:
auto res = qview{std::vector<double>(9), {3, 3}};

In [ ]:
print(res)

In [ ]:
assign_func(res, qf)

In [ ]:
print(res)

In [ ]:
auto bcv = qview{std::vector<double>(3), {3}};
bcv(0) = -1000;
bcv(1) = 1;
bcv(2) = -1000;

bcv = {-1000, 1, -1000}


In [ ]:
print(res + bcv)
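
bcv has shape (3), so it is broadcast along the rows: every row of res gets {-1000, 1, -1000} added element-wise.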