From de3d2304cf10630ff24996b390a73a3e42704ad6 Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 20 Mar 2023 11:16:28 -0700 Subject: [PATCH 1/9] [SYCL][Matrix] Add support for tf32 type using the unified interface --- sycl/include/CL/__spirv/spirv_ops.hpp | 36 ++-- .../sycl/ext/oneapi/matrix/matrix-intel.hpp | 124 ++++++++++--- .../ext/oneapi/matrix/matrix-tensorcores.hpp | 6 - .../oneapi/matrix/matrix-unified-utils.hpp | 6 + .../sycl/ext/oneapi/matrix/matrix-unified.hpp | 33 ++-- sycl/test/matrix/matrix-tf32-test.cpp | 164 ++++++++++++++++++ 6 files changed, 314 insertions(+), 55 deletions(-) create mode 100644 sycl/test/matrix/matrix-tf32-test.cpp diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index a02f47fca3250..9f436ee659d0f 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -24,19 +24,23 @@ #ifdef __SYCL_DEVICE_ONLY__ #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1) -template -extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * -__spirv_JointMatrixLoadINTEL(T *Ptr, std::size_t Stride, - __spv::MatrixLayout Layout = L, - __spv::Scope::Flag Sc = S, int MemOperand = 0); +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_JointMatrixINTEL * + __spirv_JointMatrixLoadINTEL(T *Ptr, std::size_t Stride, + __spv::MatrixLayout Layout = L, + __spv::Scope::Flag Sc = S, int MemOperand = 0); -template extern __DPCPP_SYCL_EXTERNAL void __spirv_JointMatrixStoreINTEL( - T *Ptr, __spv::__spirv_JointMatrixINTEL *Object, + T *Ptr, __spv::__spirv_JointMatrixINTEL *Object, std::size_t Stride, __spv::MatrixLayout Layout = L, __spv::Scope::Flag Sc = S, int MemOperand = 0); @@ -99,11 +103,13 @@ extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL *C, __spv::Scope::Flag Sc = __spv::Scope::Flag::Subgroup); -template -extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * -__spirv_CompositeConstruct(const T v); +extern __DPCPP_SYCL_EXTERNAL + __spv::__spirv_JointMatrixINTEL * + __spirv_CompositeConstruct(const T v); template *); -template -extern __DPCPP_SYCL_EXTERNAL T __spirv_VectorExtractDynamic( +extern __DPCPP_SYCL_EXTERNAL Ts __spirv_VectorExtractDynamic( __spv::__spirv_JointMatrixINTEL *, size_t i); -template extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL *, - T val, size_t i); + Ts val, size_t i); #else template struct helper_traits { + using element_type = T; + using storage_element_type = T; + using fill_argument_type = T; +}; + +template <> +struct helper_traits { + using element_type = sycl::ext::oneapi::experimental::matrix::precision::tf32; + using storage_element_type = float; + using fill_argument_type = float; +}; +} // namespace detail } // namespace oneapi namespace intel::experimental::matrix { +using namespace sycl::ext::oneapi::experimental::matrix; // Begin wi_element definition template ::storage_element_type; wi_element(sycl::ext::oneapi::experimental::matrix::joint_matrix< Group, T, Use, NumRows, NumCols, Layout> &Mat, std::size_t i) : M(Mat), idx(i) {} - operator T() { + operator storage_element_type() { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx); + storage_element_type elem = + __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(M.spvm, + idx); + return elem; #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -99,7 +124,12 @@ class wi_element { explicit operator bool() { 
#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_VectorExtractDynamic(M.spvm, idx) != static_cast(0); + return __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>( + M.spvm, idx) != static_cast(0); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -108,7 +138,8 @@ class wi_element { template wi_element &operator=(const T2 &rhs) { #ifdef __SYCL_DEVICE_ONLY__ - M.spvm = __spirv_VectorInsertDynamic(M.spvm, static_cast(rhs), idx); + M.spvm = __spirv_VectorInsertDynamic( + M.spvm, static_cast(rhs), idx); return *this; #else (void)rhs; @@ -121,7 +152,13 @@ class wi_element { operator=(const wi_element &rhs) { #ifdef __SYCL_DEVICE_ONLY__ M.spvm = __spirv_VectorInsertDynamic( - M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); + M.spvm, + __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(rhs.M.spvm, + rhs.idx), + idx); return *this; #else (void)rhs; @@ -135,8 +172,13 @@ class wi_element { template wi_element &operator op##=(const T2 &rhs) { \ M.spvm = __spirv_VectorInsertDynamic( \ M.spvm, \ - static_cast(__spirv_VectorExtractDynamic(M.spvm, idx) \ - op static_cast(rhs)), \ + static_cast( \ + __spirv_VectorExtractDynamic< \ + storage_element_type, T, NumRows, NumCols, \ + spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(M.spvm, idx) \ + op static_cast(rhs)), \ idx); \ return *this; \ } @@ -173,7 +215,11 @@ class wi_element::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(M.spvm, idx); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -182,8 +228,13 @@ class wi_element(__spirv_VectorExtractDynamic( - M.spvm, idx))) >= std::numeric_limits::epsilon(); + return std::fabs(static_cast( + __spirv_VectorExtractDynamic< + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, + NumRows, NumCols, spv_matrix_use_traits::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(M.spvm, idx))) >= + std::numeric_limits::epsilon(); #else throw runtime_error("joint matrix is not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -205,7 +256,14 @@ class wi_element &rhs) { #ifdef __SYCL_DEVICE_ONLY__ M.spvm = __spirv_VectorInsertDynamic( - M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx); + M.spvm, + __spirv_VectorExtractDynamic::value, + spv_matrix_layout_traits::value, + spv_scope_traits::value>(rhs.M.spvm, + rhs.idx), + idx); return *this; #else (void)rhs; @@ -218,7 +276,13 @@ class wi_element::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(M.spvm, idx) op rhs, \ + idx); \ return *this; \ } #else // __SYCL_DEVICE_ONLY__ @@ -241,13 +305,21 @@ class wi_element &lhs, \ const sycl::ext::oneapi::bfloat16 &rhs) { \ - return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs; \ + return __spirv_VectorExtractDynamic< \ + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, NumRows, \ + NumCols, spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(lhs.M.spvm, lhs.idx) op rhs; \ } \ friend type operator op( \ const sycl::ext::oneapi::bfloat16 &lhs, \ const wi_element &rhs) { \ - return __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx) op lhs; \ + return __spirv_VectorExtractDynamic< \ + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, NumRows, \ + NumCols, spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, 
\ + spv_scope_traits::value>(rhs.M.spvm, rhs.idx) op lhs; \ } OP(sycl::ext::oneapi::bfloat16, +) OP(sycl::ext::oneapi::bfloat16, -) @@ -259,15 +331,25 @@ class wi_element &lhs, \ const sycl::ext::oneapi::bfloat16 &rhs) { \ - return type{static_cast(__spirv_VectorExtractDynamic( \ - lhs.M.spvm, lhs.idx)) op static_cast(rhs)}; \ + return type{static_cast( \ + __spirv_VectorExtractDynamic< \ + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, NumRows, \ + NumCols, spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(lhs.M.spvm, lhs.idx)) \ + op static_cast(rhs)}; \ } \ friend type operator op( \ const sycl::ext::oneapi::bfloat16 &lhs, \ const wi_element &rhs) { \ - return type{static_cast(__spirv_VectorExtractDynamic( \ - rhs.M.spvm, rhs.idx)) op static_cast(lhs)}; \ + return type{static_cast( \ + __spirv_VectorExtractDynamic< \ + sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16, NumRows, \ + NumCols, spv_matrix_use_traits::value, \ + spv_matrix_layout_traits::value, \ + spv_scope_traits::value>(rhs.M.spvm, rhs.idx)) \ + op static_cast(lhs)}; \ } OP(bool, ==) OP(bool, !=) @@ -358,7 +440,7 @@ get_wi_data(Group sg, sycl::ext::oneapi::experimental::matrix::joint_matrix< // End wi_data definition template < - typename Group, typename T, + typename Group, typename T, typename Tp, sycl::ext::oneapi::experimental::matrix::use Use, size_t NumRows, size_t NumCols, sycl::ext::oneapi::experimental::matrix::layout Layout, access::address_space Space, access::decorated IsDecorated, @@ -368,7 +450,7 @@ template < inline __SYCL_ALWAYS_INLINE void joint_matrix_store(Group sg, sycl::ext::oneapi::experimental::matrix::joint_matrix< - Group, T, Use, NumRows, NumCols, Layout> &src, + Group, Tp, Use, NumRows, NumCols, Layout> &src, multi_ptr dst, size_t stride) { #if defined(__SYCL_DEVICE_ONLY__) #if defined(__NVPTX__) @@ -383,7 +465,7 @@ joint_matrix_store(Group sg, #else // intel's impl T *Ptr = dst.get(); - __spirv_JointMatrixStoreINTEL::value, sycl::ext::oneapi::experimental::matrix:: diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp index a871b9709ae66..018c66cf5213d 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcores.hpp @@ -18,12 +18,6 @@ namespace oneapi { namespace experimental { namespace matrix { -namespace precision { -class tf32 { - tf32() = delete; -}; -} // namespace precision - template struct joint_matrix; diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp index 718411b22ddbb..6f820fb82575a 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp @@ -18,6 +18,12 @@ enum class use { a, b, accumulator }; enum class layout { row_major = 0, col_major = 1, dynamic = 3 }; +namespace precision { +class tf32 { + tf32() = delete; +}; +} // namespace precision + } // namespace matrix } // namespace experimental } // namespace oneapi diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index 651069db0de33..a7ca4c4f52496 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -163,11 +163,13 @@ joint_matrix_fill(Group sg, std::ignore = sg; res.cuda_impl.wi_marray = v; #else + 
using storage_element_type = + typename oneapi::detail::helper_traits::storage_element_type; res.spvm = - __spirv_CompositeConstruct::value, spv_matrix_layout_traits::value>( - static_cast(v)); + static_cast(v)); #endif // defined(__NVPTX__) #else std::ignore = sg; @@ -201,21 +203,21 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load( assert(false && "Invalid Memory Layout!"); case layout::row_major: res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, S, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, stride, __spv::MatrixLayout::RowMajor, spv_scope_traits::value); break; case layout::col_major: res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, S, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, stride, __spv::MatrixLayout::ColumnMajor, spv_scope_traits::value); break; case sycl::ext::intel::experimental::matrix::layout::packed: res.spvm = __spirv_JointMatrixLoadINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, S, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, stride, __spv::MatrixLayout::Packed, spv_scope_traits::value); @@ -254,7 +256,7 @@ joint_matrix_load(Group sg, #else T *Ptr = src.get(); res.spvm = - __spirv_JointMatrixLoadINTEL::value, spv_matrix_layout_traits::value>( Ptr, stride, spv_matrix_layout_traits::value, @@ -291,21 +293,21 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store( assert(false && "Invalid Memory Layout!"); case layout::row_major: __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, T, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, src.spvm, stride, __spv::MatrixLayout::RowMajor, spv_scope_traits::value); break; case layout::col_major: __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, T, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, src.spvm, stride, __spv::MatrixLayout::ColumnMajor, spv_scope_traits::value); break; case sycl::ext::intel::experimental::matrix::layout::packed: __spirv_JointMatrixStoreINTEL< - T, NumRows, NumCols, spv_matrix_use_traits::value, + T, T, NumRows, NumCols, spv_matrix_use_traits::value, spv_matrix_layout_traits::value>( Ptr, src.spvm, stride, __spv::MatrixLayout::Packed, spv_scope_traits::value); @@ -377,20 +379,23 @@ inline __SYCL_ALWAYS_INLINE // This function rounds the bottom 13 bits up or down, and then zeros out the // bottom bits -inline __SYCL_ALWAYS_INLINE float round_to_tf32(float &a) { -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +inline __SYCL_ALWAYS_INLINE float round_to_tf32(const float &a) { +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) int32_t tmp_int = __nvvm_f2tf32_rna(a); return __nvvm_bitcast_i2f(tmp_int); #else - uint32_t tmp_uint = reinterpret_cast(a); + return __spirv_ConvertFToTF32INTEL(a); +#endif // defined(__NVPTX__) +#else + uint32_t tmp_uint = reinterpret_cast(a); tmp_uint += 0x1000u; tmp_uint &= 0xFFFFE000u; float ret = 0; std::memcpy(&ret, &tmp_uint, sizeof(float)); return ret; -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) } - } // namespace matrix } // namespace experimental } // namespace oneapi diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp new file mode 100644 index 0000000000000..8425bcddbaff4 
--- /dev/null +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -0,0 +1,164 @@ +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +auto constexpr SG_SZ = 16; + +#define TM 8 +#define TN SG_SZ +#define TK 16 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + // The matrix API has to be accessed by all the work-items in a + // subgroup; these functions will be called once by the subgroup, + // with no code divergence between the work-items. + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + joint_matrix_fill(sg, sub_a, 42); + for (int k = 0; k < K; k += TK) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k, K); + joint_matrix_load( + sg, sub_b, + accB.get_pointer() + (k) * (N) + sg_starty / SG_SZ * TN, N); + // If no rounding-to-tf32 function is called, the joint_matrix_mad + // function will operate on truncated floats.
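+ // Round this work-item's slice of A and B to tf32 precision before + // the MAD: get_wi_data() exposes the elements owned by this work-item, + // and round_to_tf32() rounds off the 13 low mantissa bits of the fp32 + // value, keeping the 10 explicit mantissa bits that tf32 defines.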
+ auto wi_data_a = + sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_data_a.length(); i++) { + wi_data_a[i] = round_to_tf32(wi_data_a[i]); + } + auto wi_data_b = + sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b); + for (int i = 0; i < wi_data_b.length(); i++) { + wi_data_b[i] = round_to_tf32(wi_data_b[i]); + } + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + auto wi_slice_a = + sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + float elem = wi_slice_a[i]; + wi_slice_a[i] *= 2; + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +float A[MATRIX_M][MATRIX_K]; +float B[MATRIX_K][MATRIX_N]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(float *A_mem, float *B_mem, float *C_mem, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + float va = A_mem[m * K + k]; + float vb = B_mem[k * N + n]; + C_mem[m * N + n] += va * vb; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = 1.0f * (i + j); + } + } + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B[i][j] = 2.0f * i + 3.0f * j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1.0; + D[i][j] = 1.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((float *)&A); + big_matrix MB((float *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((float *)A, (float *)B, (float *)D, MATRIX_M, MATRIX_N, + MATRIX_K); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; +} From 88aad2c9f7228c1e8f6d697c46c229a5ce394cf1 Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 20 Mar 2023 12:33:01 -0700 Subject: [PATCH 2/9] Address Jack's comments --- .../include/sycl/ext/oneapi/matrix/matrix-unified.hpp | 4 +++- sycl/test/matrix/matrix-tf32-test.cpp | 11 +++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index a7ca4c4f52496..6963eeeeb4252 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -138,9 +138,11 @@ joint_matrix_apply(Group sg, joint_matrix &jm, lambda(jm.cuda_impl.wi_marray[i]); } #else // NVPTX + using storage_element_type = + typename oneapi::detail::helper_traits::storage_element_type; auto wi_data_c = sycl::ext::intel::experimental::matrix::get_wi_data(sg, jm); for (int i = 0; i < wi_data_c.length(); i++) { - T element = wi_data_c[i]; + storage_element_type element = wi_data_c[i]; lambda(element); wi_data_c[i] = element; } diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index 8425bcddbaff4..6b5339f6e26ac 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -1,4 +1,5 @@ -// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 
%s -o %t.out +// RUN: %clangxx -fsycl -fsycl-device-only -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -O2 -S -emit-llvm -o - %s | FileCheck %s + #include #include @@ -78,11 +79,8 @@ void matrix_multiply(big_matrix &C, accB.get_pointer() + (k) * (N) + sg_starty / SG_SZ * TN, N); // If no rounding-to-tf32 function is called, the joint_matrix_mad // function will operate on truncated floats. - auto wi_data_a = - sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a); - for (int i = 0; i < wi_data_a.length(); i++) { - wi_data_a[i] = round_to_tf32(wi_data_a[i]); - } + joint_matrix_apply(sg, sub_a, + [=](float &x) { x = round_to_tf32(x); }); auto wi_data_b = sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b); for (int i = 0; i < wi_data_b.length(); i++) { @@ -92,6 +90,7 @@ void matrix_multiply(big_matrix &C, } auto wi_slice_a = sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + joint_matrix_apply(sg, sub_a, [=](float &x) { x *= 2; }); for (int i = 0; i < wi_slice_a.length(); i++) { float elem = wi_slice_a[i]; wi_slice_a[i] *= 2; From a56084d8eaabac514554506c159eae187ca5f1bd Mon Sep 17 00:00:00 2001 From: Dounia Date: Wed, 22 Mar 2023 12:08:02 -0700 Subject: [PATCH 3/9] change convert to round in the name --- sycl/include/CL/__spirv/spirv_ops.hpp | 2 +- sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp | 2 +- sycl/test/matrix/matrix-tf32-test.cpp | 9 +-------- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 9f436ee659d0f..1eba8f83b43e3 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -24,7 +24,7 @@ #ifdef __SYCL_DEVICE_ONLY__ #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1) -extern SYCL_EXTERNAL float __spirv_ConvertFToTF32INTEL(float a); +extern SYCL_EXTERNAL float __spirv_RoundFToTF32INTEL(float a); template (a); diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index 6b5339f6e26ac..bfea2e4c9a698 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-device-only -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -O2 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out #include #include @@ -88,13 +88,6 @@ void matrix_multiply(big_matrix &C, } sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); } - auto wi_slice_a = - sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a); - joint_matrix_apply(sg, sub_a, [=](float &x) { x *= 2; }); - for (int i = 0; i < wi_slice_a.length(); i++) { - float elem = wi_slice_a[i]; - wi_slice_a[i] *= 2; - } joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major); From 0d23b8acc5c7f947107e540c60320dd619388afe Mon Sep 17 00:00:00 2001 From: Dounia Date: Fri, 24 Mar 2023 06:55:42 -0700 Subject: [PATCH 4/9] set xfail to test as SPIRV changes are not part of intel/llvm yet --- sycl/test/matrix/matrix-tf32-test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index bfea2e4c9a698..2410872cd70b2 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -1,5 +1,7 @@ // RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out +// XFAIL:* + #include #include From 7d05e6ad230498fbf84e659517085b455a534a5e Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 28 Mar 2023
06:11:04 -0700 Subject: [PATCH 5/9] correct SYCL_EXTERNAL with the new naming __DPCPP_SYCL_EXTERNAL --- sycl/include/CL/__spirv/spirv_ops.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index f6578f942a2c2..b9f14db4fde00 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -24,7 +24,7 @@ #ifdef __SYCL_DEVICE_ONLY__ #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1) -extern SYCL_EXTERNAL float __spirv_RoundFToTF32INTEL(float a); +extern __DPCPP_SYCL_EXTERNAL float __spirv_RoundFToTF32INTEL(float a); template Date: Mon, 3 Apr 2023 16:26:43 +0800 Subject: [PATCH 6/9] move e2e testcases from llvm-test-suite's pr --- .../Matrix/XMX8/element_wise_all_ops_tf32.cpp | 27 ++ .../Matrix/XMX8/joint_matrix_tf32.cpp | 27 ++ .../Matrix/element_wise_all_ops_tf32.cpp | 27 ++ .../Matrix/element_wise_all_ops_tf32_impl.hpp | 239 ++++++++++++++++++ sycl/test-e2e/Matrix/joint_matrix_tf32.cpp | 27 ++ .../Matrix/joint_matrix_tf32_impl.hpp | 147 +++++++++++ 6 files changed, 494 insertions(+) create mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/XMX8/joint_matrix_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/element_wise_all_ops_tf32_impl.hpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_tf32.cpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_tf32.cpp new file mode 100644 index 0000000000000..2af9d9176548b --- /dev/null +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_tf32.cpp @@ -0,0 +1,27 @@ +//==----------- element_wise_all_ops_tf32.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-xmx8 + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// XFAIL:* + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 8 + +#include "../element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_tf32.cpp new file mode 100644 index 0000000000000..c35a69f05f460 --- /dev/null +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_tf32.cpp @@ -0,0 +1,27 @@ +//==---------------- joint_matrix_tf32.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-xmx8 + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// XFAIL:* + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 8 + +#include "../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp new file mode 100644 index 0000000000000..c693a186be970 --- /dev/null +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp @@ -0,0 +1,27 @@ +//==----------- element_wise_all_ops_tf32.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// XFAIL:* + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 16 + +#include "element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32_impl.hpp new file mode 100644 index 0000000000000..77a84c6533b23 --- /dev/null +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32_impl.hpp @@ -0,0 +1,239 @@ + +#define TM 8 +#define TN SG_SZ +#define TK 16 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void assert_ops_ref(host_accessor C, + const float ref) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + auto diff = C[i][j] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} +template +void matrix_verify_add(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] + 2; + } + + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +template +void
matrix_verify_sub(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] - round_to_tf32(2); + } + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +template +void matrix_verify_mul(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] * round_to_tf32(3.0); + } + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +template +void matrix_verify_div(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_write}; + + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, round_to_tf32(4.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] / round_to_tf32(2.0); + } + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +template +void matrix_verify_logic(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + sycl::accessor accA{bufA, cgh, 
sycl::read_write}; + cgh.parallel_for( + r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; + + joint_matrix_fill(sg, sub_a, round_to_tf32(5.0)); + + auto wi_slice_a = + ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + for (int i = 0; i < wi_slice_a.length(); i++) { + if (wi_slice_a[i]) { + if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 2.0 || + wi_slice_a[i] < 2.0 || wi_slice_a[i] <= 2.0) { + Ts val = (wi_slice_a[i] != 2.0) ? wi_slice_a[i] : 2.0; + val = val - static_cast(1); + val = val + static_cast(1); + if (wi_slice_a[i] == 2.0) { + val = val - static_cast(2); + val = val * static_cast(3); + val = val / static_cast(2); + + } else { + val = val + static_cast(2); + } + wi_slice_a[i] = val; + } + } + } + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_host_access(sycl::read_only), ref); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +float A[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +int main() { + + big_matrix MD((float *)&D); + big_matrix MA((float *)&A); + + size_t NDRangeM = MATRIX_M / TM; + size_t NDRangeN = MATRIX_N / TN; + queue q; + nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}); + + matrix_verify_add(q, MA, r, 7.0); + matrix_verify_sub(q, MA, r, 3.0); + matrix_verify_mul(q, MA, r, 15.0); + matrix_verify_div(q, MA, r, 2.0); + matrix_verify_logic(q, MA, r, + 7.0); + + return 0; +} diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp new file mode 100644 index 0000000000000..f50ac79bd0a46 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp @@ -0,0 +1,27 @@ +//==---------------- joint_matrix_tf32.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// XFAIL:* + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 16 + +#include "joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp new file mode 100644 index 0000000000000..0c6787ecd0ba9 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp @@ -0,0 +1,147 @@ +#define TM 8 +#define TN SG_SZ +#define TK 16 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + // The matrix API has to be accessed by all the work-items in a + // subgroup; these functions will be called once by the subgroup, + // with no code divergence between the work-items. + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + joint_matrix_fill(sg, sub_a, 42); + for (int k = 0; k < K; k += TK) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k, K); + joint_matrix_load( + sg, sub_b, + accB.get_pointer() + (k) * (N) + sg_starty / SG_SZ * TN, N); + // If no rounding-to-tf32 function is called, the joint_matrix_mad + // function will operate on truncated floats.
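+ // joint_matrix_apply visits each element owned by this work-item; the + // lambda must take its argument by reference for the rounded value to + // be written back into the joint matrix.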
+ joint_matrix_apply(sg, sub_a, + [=](float &x) { x = round_to_tf32(x); }); + auto wi_data_b = + sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b); + for (int i = 0; i < wi_data_b.length(); i++) { + wi_data_b[i] = round_to_tf32(wi_data_b[i]); + } + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + auto wi_slice_a = + sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a); + joint_matrix_apply(sg, sub_a, [=](float &x) { x *= 2; }); + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +float A[MATRIX_M][MATRIX_K]; +float B[MATRIX_K][MATRIX_N]; +float C[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(float *A_mem, float *B_mem, float *C_mem, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + float va = A_mem[m * K + k]; + float vb = B_mem[k * N + n]; + C_mem[m * N + n] += va * vb; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = 1.0f * (i + j); + } + } + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B[i][j] = 2.0f * i + 3.0f * j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1.0; + D[i][j] = 1.0; + } + } + + big_matrix MC((float *)&C); + big_matrix MD((float *)&D); + big_matrix MA((float *)&A); + big_matrix MB((float *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((float *)A, (float *)B, (float *)D, MATRIX_M, MATRIX_N, + MATRIX_K); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + std::cout << (res ?
"passed" : "failed") << std::endl; + return !res; +} From 93193e9304563f30c47ba60d2b5b4a25d543b0c2 Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Tue, 4 Apr 2023 14:02:03 +0800 Subject: [PATCH 7/9] rm XFAIL from sycl/test/matrix/matrix-tf32-test.cpp --- sycl/test/matrix/matrix-tf32-test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index 2410872cd70b2..1c0fea4e9b969 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -1,6 +1,5 @@ // RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out -// XFAIL:* #include #include From 8caee53498d8caab50e014c9699621fd5149cb96 Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Tue, 4 Apr 2023 14:21:53 +0800 Subject: [PATCH 8/9] fix clang-format issue --- sycl/test/matrix/matrix-tf32-test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/test/matrix/matrix-tf32-test.cpp b/sycl/test/matrix/matrix-tf32-test.cpp index 1c0fea4e9b969..bfea2e4c9a698 100644 --- a/sycl/test/matrix/matrix-tf32-test.cpp +++ b/sycl/test/matrix/matrix-tf32-test.cpp @@ -1,6 +1,5 @@ // RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out - #include #include From 1557f345fbf6c5afc67d67796d4de6ba79c55d5c Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Wed, 5 Apr 2023 21:41:50 +0800 Subject: [PATCH 9/9] address comments --- sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp | 10 +++++----- sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp index af55cd00df04e..cb827d9839a6e 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp @@ -70,17 +70,16 @@ struct joint_matrix; namespace detail { // Differentiating between the "element type" and the "storage element type" -template struct helper_traits { +template struct jm_type_interpretation_helper_trait { using element_type = T; using storage_element_type = T; - using fill_argument_type = T; }; template <> -struct helper_traits { +struct jm_type_interpretation_helper_trait< + sycl::ext::oneapi::experimental::matrix::precision::tf32> { using element_type = sycl::ext::oneapi::experimental::matrix::precision::tf32; using storage_element_type = float; - using fill_argument_type = float; }; } // namespace detail } // namespace oneapi @@ -102,7 +101,8 @@ class wi_element { public: using storage_element_type = - typename oneapi::detail::helper_traits::storage_element_type; + typename oneapi::detail::jm_type_interpretation_helper_trait< + T>::storage_element_type; wi_element(sycl::ext::oneapi::experimental::matrix::joint_matrix< Group, T, Use, NumRows, NumCols, Layout> &Mat, std::size_t i) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index 2ccfe3768244e..13326bac83182 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -139,7 +139,8 @@ joint_matrix_apply(Group sg, joint_matrix &jm, } #else // NVPTX using storage_element_type = - typename oneapi::detail::helper_traits::storage_element_type; + typename oneapi::detail::jm_type_interpretation_helper_trait< + T>::storage_element_type; auto wi_data_c = sycl::ext::intel::experimental::matrix::get_wi_data(sg, jm); for (int i = 0; i < 
wi_data_c.length(); i++) { storage_element_type element = wi_data_c[i]; @@ -169,7 +170,8 @@ joint_matrix_fill(Group sg, res.cuda_impl.wi_marray = v; #else using storage_element_type = - typename oneapi::detail::helper_traits::storage_element_type; + typename oneapi::detail::jm_type_interpretation_helper_trait< + T>::storage_element_type; res.spvm = __spirv_CompositeConstruct::value,