[go: up one dir, main page]

Skip to content

Latest commit

 

History

History
91 lines (70 loc) · 2.39 KB

336.md

File metadata and controls

91 lines (70 loc) · 2.39 KB
Info

Example

int main() {
  // GNU vector extension: four ints packed into a single vector value.
  using v4si = int [[gnu::vector_size(4 * sizeof(int))]];

  const v4si lhs = {1, 2, 3, 4};
  const v4si rhs = {4, 3, 2, 1};

  // Arithmetic operators on vector types are applied lane by lane.
  v4si total = lhs + rhs;

  // Emit the lanes back to back with no separator: prints 5555
  for (int lane = 0; lane < 4; ++lane) {
    std::cout << total[lane];
  }
}

https://godbolt.org/z/fTMsK8Moh

Puzzle

  • Can you implement matmul - vectorized matrix multiplication with gnu::vector_size?
// Sixteen signed 8-bit lanes packed into one GNU vector value.
using v16qi = std::int8_t [[gnu::vector_size(16 * sizeof(std::int8_t))]];

// Puzzle stub: fill C with the matrix product A * B, where A[r] and B[r] hold
// row r of each N x N matrix in their leading lanes (see the "matmul" test).
template<auto N>
constexpr auto matmul(v16qi (&A)[N], v16qi (&B)[N], std::int32_t (&C)[N][N]) {
    // TODO
}

int main() {
    using namespace boost::ut;

    // Multiplies two 4x4 matrices and spot-checks the four corners of the result.
    "matmul"_test = [] {
        // Each v16qi carries one matrix row in its first four lanes.
        v16qi A[4] = {{1, 2, 3, 4},
                  {5, 6, 7, 8},
                  {9, 10, 11, 12},
                  {13, 14, 15, 16}};
        v16qi B[4] = {{16, 15, 14, 13},
                  {12, 11, 10, 9},
                  {8, 7, 6, 5},
                  {4, 3, 2, 1}};
        std::int32_t C[4][4] = {0};

        matmul(A, B, C);

        // Every term must compare with `==`; a lone `=` would overwrite C[0][0]
        // instead of checking it.
        expect(C[0][0] == 80 and C[0][3] == 50 and C[3][0] == 560 and C[3][3] == 386);
    };
}

https://godbolt.org/z/eGj3eddKM

Solutions

/// Computes the matrix product C = A * B for N x N matrices (N <= 16).
///
/// Row r of each input matrix is read from lanes [0, N) of A[r] / B[r]; lanes
/// at index N and above do not contribute to the result.
///
/// @tparam N  matrix dimension, deduced from the array bounds.
/// @param A   left operand, one v16qi per row.
/// @param B   right operand, one v16qi per row.
/// @param C   output; every element is overwritten with the 32-bit dot product
///            of the matching row of A and column of B.
template <auto N>
constexpr auto matmul(v16qi (&A)[N], v16qi (&B)[N], std::int32_t (&C)[N][N]) {
    static_assert(N <= 16, "Matrices larger than 16x16 are not supported");

    // Sixteen 32-bit lanes: wide enough that the int8 * int8 products and
    // their sum over at most 16 lanes cannot overflow.
    using v16si = std::int32_t [[gnu::vector_size(16 * sizeof(std::int32_t))]];

    // Adds up lanes [0, N) of a widened vector.
    const auto sum_lanes = [](const v16si &vec) {
        return [&vec]<std::size_t... Idxs>(std::index_sequence<Idxs...>) {
            return (vec[Idxs] + ...);
        }(std::make_index_sequence<N>{});
    };

    // Gathers column `col_idx` of B into lanes [0, N) and widens it to int32.
    const auto widened_column = [&B](const std::size_t col_idx) {
        auto col =
            [&B, col_idx]<std::size_t... Idxs>(std::index_sequence<Idxs...>) {
                return v16qi{B[Idxs][col_idx]...};
            }(std::make_index_sequence<N>{});
        return __builtin_convertvector(col, v16si);
    };

    // Gather every column once up front rather than once per output element.
    v16si columns[N] = {};
    for (std::size_t j = 0; j < N; ++j) {
        columns[j] = widened_column(j);
    }

    // std::size_t counters match the type of the deduced array bound N.
    for (std::size_t i = 0; i < N; ++i) {
        // The widened row is shared by every element of output row i.
        const v16si row = __builtin_convertvector(A[i], v16si);
        for (std::size_t j = 0; j < N; ++j) {
            C[i][j] = sum_lanes(row * columns[j]);
        }
    }
}

https://godbolt.org/z/earr7KxK4