Info
- Did you know about the gnu::vector_size extension?
Example
#include <iostream>

int main() {
  using v4si = int [[gnu::vector_size(4 * sizeof(int))]];
  v4si a = {1, 2, 3, 4};
  v4si b = {4, 3, 2, 1};
  v4si c;
  c = a + b; // element-wise addition
  std::cout << c[0] << c[1] << c[2] << c[3]; // prints 5555
}
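Vectors built this way can also be converted element-wise with __builtin_convertvector, a GCC/Clang builtin that the solution below relies on. A minimal sketch; the v4qi/v4si aliases are only for this example:

#include <cstdint>
#include <iostream>

int main() {
  // hypothetical aliases, just for this sketch
  using v4qi = std::int8_t  [[gnu::vector_size(4 * sizeof(std::int8_t))]];
  using v4si = std::int32_t [[gnu::vector_size(4 * sizeof(std::int32_t))]];

  v4qi narrow = {10, 20, 30, 40};
  // widen each 8-bit lane to a 32-bit lane
  v4si wide = __builtin_convertvector(narrow, v4si);

  std::cout << wide[0] + wide[1] + wide[2] + wide[3]; // prints 100
}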
Puzzle
- Can you implement matmul - vectorized matrix multiplication with gnu::vector_size?
#include <cstdint>
#include <boost/ut.hpp>

using v16qi = std::int8_t [[gnu::vector_size(16 * sizeof(std::int8_t))]];

template<auto N>
constexpr auto matmul(v16qi (&A)[N], v16qi (&B)[N], std::int32_t (&C)[N][N]) {
  // TODO
}
int main() {
  using namespace boost::ut;

  "matmul"_test = [] {
    v16qi A[4] = {{1, 2, 3, 4},
                  {5, 6, 7, 8},
                  {9, 10, 11, 12},
                  {13, 14, 15, 16}};
    v16qi B[4] = {{16, 15, 14, 13},
                  {12, 11, 10, 9},
                  {8, 7, 6, 5},
                  {4, 3, 2, 1}};
    std::int32_t C[4][4] = {0};
    matmul(A, B, C);
    expect(C[0][0] == 80 and C[0][3] == 50 and C[3][0] == 560 and C[3][3] == 386);
  };
}
Solutions
template <auto N>
constexpr auto matmul(v16qi (&A)[N], v16qi (&B)[N], std::int32_t (&C)[N][N]) {
  static_assert(N <= 16, "Matrices larger than 16x16 are not supported");

  // horizontal sum of the first N lanes of a vector
  auto sum = [](const auto &vec) {
    auto ret = [&vec]<std::size_t... Idxs>(std::index_sequence<Idxs...>) {
      return (vec[Idxs] + ...);
    }(std::make_index_sequence<N>{});
    return ret;
  };

  // 16 x std::int32_t lanes (64 bytes), wide enough to hold the products
  using v64si = std::int32_t [[gnu::vector_size(16 * sizeof(std::int32_t))]];

  // gather column col_idx of B into a vector and widen it to 32-bit lanes
  auto get_col = [&B](const auto col_idx) {
    auto col =
        [&B, col_idx]<std::size_t... Idxs>(std::index_sequence<Idxs...>) {
          return v16qi{B[Idxs][col_idx]...};
        }(std::make_index_sequence<N>{});
    return __builtin_convertvector(col, v64si);
  };

  // C[i][j] = dot product of row i of A and column j of B
  for (auto i = 0u; i < N; ++i) {
    for (auto j = 0u; j < N; ++j) {
      C[i][j] = sum(__builtin_convertvector(A[i], v64si) * get_col(j));
    }
  }
}
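For cross-checking, the vectorized version can be compared against a plain scalar triple loop. A minimal sketch; the matmul_ref name is made up for this example and is not part of the puzzle:

template <auto N>
constexpr auto matmul_ref(v16qi (&A)[N], v16qi (&B)[N], std::int32_t (&C)[N][N]) {
  // reference implementation: C[i][j] = sum over k of A[i][k] * B[k][j]
  for (auto i = 0u; i < N; ++i) {
    for (auto j = 0u; j < N; ++j) {
      C[i][j] = 0;
      for (auto k = 0u; k < N; ++k) {
        C[i][j] += static_cast<std::int32_t>(A[i][k]) * static_cast<std::int32_t>(B[k][j]);
      }
    }
  }
}

In the test above, filling a second array with matmul_ref and comparing it element by element against the output of matmul would exercise all sixteen entries rather than just the four corners.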