diff --git a/SYCL/ESIMD/BitonicSortK.cpp b/SYCL/ESIMD/BitonicSortK.cpp index c47a41200a..72eee98cd4 100644 --- a/SYCL/ESIMD/BitonicSortK.cpp +++ b/SYCL/ESIMD/BitonicSortK.cpp @@ -148,7 +148,7 @@ const mask_type_t<32> init_mask20 = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // Function bitonic_exchange{1,2,4,8} compares and swaps elements with // the particular strides ESIMD_INLINE simd -bitonic_exchange8(simd A, simd flip) { +bitonic_exchange8(simd A, simd_mask<32> flip) { simd B; #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { @@ -163,7 +163,7 @@ bitonic_exchange8(simd A, simd flip) { } ESIMD_INLINE simd -bitonic_exchange4(simd A, simd flip) { +bitonic_exchange4(simd A, simd_mask<32> flip) { simd B; #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { @@ -192,7 +192,7 @@ bitonic_exchange4(simd A, simd flip) { // each mov copies four 64-bit data, which is 4X SIMD efficiency // improvement over the straightforward implementation. ESIMD_INLINE simd -bitonic_exchange2(simd A, simd flip) { +bitonic_exchange2(simd A, simd_mask<32> flip) { simd B; #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { @@ -207,7 +207,7 @@ bitonic_exchange2(simd A, simd flip) { } ESIMD_INLINE simd -bitonic_exchange1(simd A, simd flip) { +bitonic_exchange1(simd A, simd_mask<32> flip) { simd B; #pragma unroll // each thread is handling 256-element chunk. Each iteration @@ -302,8 +302,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd &A, // similar to bitonic_exchange{1,2,4,8}. // exchange 8 - simd flip13(init_mask13); - simd flip14(init_mask14); + simd_mask<32> flip13(init_mask13); + simd_mask<32> flip14(init_mask14); simd B; for (int i = 0; i < BASE_SZ; i += 32) { B.select<8, 1>(i) = A.select<8, 1>(i + 8); @@ -322,8 +322,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd &A, } // exchange 4 - simd flip15(init_mask15); - simd flip16(init_mask16); + simd_mask<32> flip15(init_mask15); + simd_mask<32> flip16(init_mask16); #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { auto MA = A.select<32, 1>(i).bit_cast_view(); @@ -342,8 +342,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd &A, } // exchange 2 - simd flip17(init_mask17); - simd flip18(init_mask18); + simd_mask<32> flip17(init_mask17); + simd_mask<32> flip18(init_mask18); #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { auto MB = B.select<32, 1>(i).bit_cast_view(); @@ -362,8 +362,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd &A, flip18); } // exchange 1 - simd flip19(init_mask19); - simd flip20(init_mask20); + simd_mask<32> flip19(init_mask19); + simd_mask<32> flip20(init_mask20); #pragma unroll // Each iteration compares and swaps 2 32-element chunks for (int i = 0; i < BASE_SZ; i += 32) { @@ -406,28 +406,28 @@ ESIMD_INLINE void cmk_bitonic_sort_256(AccTy1 buf1, AccTy2 buf2, uint32_t idx) { simd B; A = cmk_read(buf1, offset); - simd flip1(init_mask1); + simd_mask<32> flip1(init_mask1); simd mask; // stage 0 B = bitonic_exchange1(A, flip1); // stage 1 - simd flip2(init_mask2); - simd flip3(init_mask3); + simd_mask<32> flip2(init_mask2); + simd_mask<32> flip3(init_mask3); A = bitonic_exchange2(B, flip2); B = bitonic_exchange1(A, flip3); // stage 2 - simd flip4(init_mask4); - simd flip5(init_mask5); - simd flip6(init_mask6); + simd_mask<32> flip4(init_mask4); + simd_mask<32> flip5(init_mask5); + simd_mask<32> flip6(init_mask6); A = bitonic_exchange4(B, flip4); B = bitonic_exchange2(A, flip5); A = bitonic_exchange1(B, flip6); // stage 3 - simd flip7(init_mask7); - simd flip8(init_mask8); - simd flip9(init_mask9); - 
simd flip10(init_mask10); + simd_mask<32> flip7(init_mask7); + simd_mask<32> flip8(init_mask8); + simd_mask<32> flip9(init_mask9); + simd_mask<32> flip10(init_mask10); B = bitonic_exchange8(A, flip7); A = bitonic_exchange4(B, flip8); B = bitonic_exchange2(A, flip9); diff --git a/SYCL/ESIMD/BitonicSortKv2.cpp b/SYCL/ESIMD/BitonicSortKv2.cpp index 9aaee21ea6..7720a9777e 100644 --- a/SYCL/ESIMD/BitonicSortKv2.cpp +++ b/SYCL/ESIMD/BitonicSortKv2.cpp @@ -65,7 +65,7 @@ ESIMD_INLINE void cmk_write(ty *buf, uint32_t offset, simd v) { // Function bitonic_exchange{1,2,4,8} compares and swaps elements with // the particular strides ESIMD_INLINE simd -bitonic_exchange8(simd A, simd flip) { +bitonic_exchange8(simd A, simd_mask<32> flip) { simd B; #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { @@ -80,7 +80,7 @@ bitonic_exchange8(simd A, simd flip) { } ESIMD_INLINE simd -bitonic_exchange4(simd A, simd flip) { +bitonic_exchange4(simd A, simd_mask<32> flip) { simd B; #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { @@ -109,7 +109,7 @@ bitonic_exchange4(simd A, simd flip) { // each mov copies four 64-bit data, which is 4X SIMD efficiency // improvement over the straightforward implementation. ESIMD_INLINE simd -bitonic_exchange2(simd A, simd flip) { +bitonic_exchange2(simd A, simd_mask<32> flip) { simd B; #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { @@ -124,7 +124,7 @@ bitonic_exchange2(simd A, simd flip) { } ESIMD_INLINE simd -bitonic_exchange1(simd A, simd flip) { +bitonic_exchange1(simd A, simd_mask<32> flip) { simd B; #pragma unroll // each thread is handling 256-element chunk. Each iteration @@ -219,8 +219,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd &A, // similar to bitonic_exchange{1,2,4,8}. // exchange 8 - simd flip13 = esimd_unpack_mask<32>(0xff00ff00); //(init_mask13); - simd flip14 = esimd_unpack_mask<32>(0x00ff00ff); //(init_mask14); + simd_mask<32> flip13 = esimd_unpack_mask<32>(0xff00ff00); //(init_mask13); + simd_mask<32> flip14 = esimd_unpack_mask<32>(0x00ff00ff); //(init_mask14); simd B; for (int i = 0; i < BASE_SZ; i += 32) { B.select<8, 1>(i) = A.select<8, 1>(i + 8); @@ -239,8 +239,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd &A, } // exchange 4 - simd flip15 = esimd_unpack_mask<32>(0xf0f0f0f0); //(init_mask15); - simd flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); //(init_mask16); + simd_mask<32> flip15 = esimd_unpack_mask<32>(0xf0f0f0f0); //(init_mask15); + simd_mask<32> flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); //(init_mask16); #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { auto MA = A.select<32, 1>(i).bit_cast_view(); @@ -259,8 +259,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd &A, } // exchange 2 - simd flip17 = esimd_unpack_mask<32>(0xcccccccc); //(init_mask17); - simd flip18 = esimd_unpack_mask<32>(0x33333333); //(init_mask18); + simd_mask<32> flip17 = esimd_unpack_mask<32>(0xcccccccc); //(init_mask17); + simd_mask<32> flip18 = esimd_unpack_mask<32>(0x33333333); //(init_mask18); #pragma unroll for (int i = 0; i < BASE_SZ; i += 32) { auto MB = B.select<32, 1>(i).bit_cast_view(); @@ -279,8 +279,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd &A, flip18); } // exchange 1 - simd flip19 = esimd_unpack_mask<32>(0xaaaaaaaa); //(init_mask19); - simd flip20 = esimd_unpack_mask<32>(0x55555555); //(init_mask20); + simd_mask<32> flip19 = esimd_unpack_mask<32>(0xaaaaaaaa); //(init_mask19); + simd_mask<32> flip20 = esimd_unpack_mask<32>(0x55555555); //(init_mask20); #pragma unroll // Each iteration compares and 
swaps 2 32-element chunks for (int i = 0; i < BASE_SZ; i += 32) { @@ -323,28 +323,28 @@ ESIMD_INLINE void cmk_bitonic_sort_256(uint32_t *buf1, uint32_t *buf2, simd B; A = cmk_read(buf1, offset); - simd flip1 = esimd_unpack_mask<32>(0x66666666); //(init_mask1); + simd_mask<32> flip1 = esimd_unpack_mask<32>(0x66666666); //(init_mask1); simd mask; // stage 0 B = bitonic_exchange1(A, flip1); // stage 1 - simd flip2 = esimd_unpack_mask<32>(0x3c3c3c3c); //(init_mask2); - simd flip3 = esimd_unpack_mask<32>(0x5a5a5a5a); //(init_mask3); + simd_mask<32> flip2 = esimd_unpack_mask<32>(0x3c3c3c3c); //(init_mask2); + simd_mask<32> flip3 = esimd_unpack_mask<32>(0x5a5a5a5a); //(init_mask3); A = bitonic_exchange2(B, flip2); B = bitonic_exchange1(A, flip3); // stage 2 - simd flip4 = esimd_unpack_mask<32>(0x0ff00ff0); //(init_mask4); - simd flip5 = esimd_unpack_mask<32>(0x33cc33cc); //(init_mask5); - simd flip6 = esimd_unpack_mask<32>(0x55aa55aa); //(init_mask6); + simd_mask<32> flip4 = esimd_unpack_mask<32>(0x0ff00ff0); //(init_mask4); + simd_mask<32> flip5 = esimd_unpack_mask<32>(0x33cc33cc); //(init_mask5); + simd_mask<32> flip6 = esimd_unpack_mask<32>(0x55aa55aa); //(init_mask6); A = bitonic_exchange4(B, flip4); B = bitonic_exchange2(A, flip5); A = bitonic_exchange1(B, flip6); // stage 3 - simd flip7 = esimd_unpack_mask<32>(0x00ffff00); //(init_mask7); - simd flip8 = esimd_unpack_mask<32>(0x0f0ff0f0); //(init_mask8); - simd flip9 = esimd_unpack_mask<32>(0x3333cccc); //(init_mask9); - simd flip10 = esimd_unpack_mask<32>(0x5555aaaa); //(init_mask10); + simd_mask<32> flip7 = esimd_unpack_mask<32>(0x00ffff00); //(init_mask7); + simd_mask<32> flip8 = esimd_unpack_mask<32>(0x0f0ff0f0); //(init_mask8); + simd_mask<32> flip9 = esimd_unpack_mask<32>(0x3333cccc); //(init_mask9); + simd_mask<32> flip10 = esimd_unpack_mask<32>(0x5555aaaa); //(init_mask10); B = bitonic_exchange8(A, flip7); A = bitonic_exchange4(B, flip8); B = bitonic_exchange2(A, flip9); diff --git a/SYCL/ESIMD/PrefixSum.cpp b/SYCL/ESIMD/PrefixSum.cpp index 14a8610a9e..1ddba6b087 100644 --- a/SYCL/ESIMD/PrefixSum.cpp +++ b/SYCL/ESIMD/PrefixSum.cpp @@ -162,7 +162,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos, cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0); simd voff(0, 1); // 0, 1, 2, 3 - simd p = voff < TUPLE_SZ; // predicate + simd_mask<8> p = voff < TUPLE_SZ; // predicate voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) * sizeof(unsigned); scatter(buf, S.select<8, 1>(0), voff, p); @@ -182,7 +182,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems, simd prev = 0; for (unsigned i = 0; i < remaining; i += 32) { - simd p = elm32 < remaining; + simd_mask<32> p = elm32 < remaining; S = gather_rgba(buf, element_offset, p); diff --git a/SYCL/ESIMD/Prefix_Local_sum1.cpp b/SYCL/ESIMD/Prefix_Local_sum1.cpp index 62aa730869..7f2fd5ba7d 100644 --- a/SYCL/ESIMD/Prefix_Local_sum1.cpp +++ b/SYCL/ESIMD/Prefix_Local_sum1.cpp @@ -101,7 +101,7 @@ void cmk_sum_tuple_count(unsigned int *buf, unsigned int h_pos) { cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0); simd voff(0, 1); // 0, 1, 2, 3 - simd p = voff < TUPLE_SZ; // predicate + simd_mask<8> p = voff < TUPLE_SZ; // predicate voff = (voff + ((h_pos + 1) * PREFIX_ENTRIES * TUPLE_SZ - TUPLE_SZ)) * sizeof(unsigned); scatter(buf, S.select<8, 1>(0), voff, p); diff --git a/SYCL/ESIMD/Prefix_Local_sum2.cpp b/SYCL/ESIMD/Prefix_Local_sum2.cpp index 259a6b6d26..ccd9b6a3d3 100644 --- a/SYCL/ESIMD/Prefix_Local_sum2.cpp +++ b/SYCL/ESIMD/Prefix_Local_sum2.cpp @@ -95,7 +95,7 @@ 
void cmk_acum_iterative(unsigned *buf, unsigned h_pos, simd result = 0; result.select(0) = sum; simd voff(0, 1); // 0, 1, 2, 3 - simd p = voff < TUPLE_SZ; // predicate + simd_mask<8> p = voff < TUPLE_SZ; // predicate voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) * sizeof(unsigned); scatter(buf, result, voff, p); diff --git a/SYCL/ESIMD/Prefix_Local_sum3.cpp b/SYCL/ESIMD/Prefix_Local_sum3.cpp index dd18f1a9f8..770804b443 100644 --- a/SYCL/ESIMD/Prefix_Local_sum3.cpp +++ b/SYCL/ESIMD/Prefix_Local_sum3.cpp @@ -124,7 +124,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos, cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0); simd voff(0, 1); // 0, 1, 2, 3 - simd p = voff < TUPLE_SZ; // predicate + simd_mask<8> p = voff < TUPLE_SZ; // predicate voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) * sizeof(unsigned); scatter(buf, S.select<8, 1>(0), voff, p); @@ -175,7 +175,7 @@ void cmk_acum_iterative_low(unsigned *buf, unsigned h_pos, cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0); simd voff(0, 1); // 0, 1, 2, 3 - simd p = voff < TUPLE_SZ; // predicate + simd_mask<8> p = voff < TUPLE_SZ; // predicate voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) * sizeof(unsigned); scatter(buf, S.select<8, 1>(0), voff, p); @@ -195,7 +195,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems, simd prev = 0; for (unsigned i = 0; i < remaining; i += 32) { - simd p = elm32 < remaining; + simd_mask<32> p = elm32 < remaining; S = gather_rgba(buf, element_offset, p); diff --git a/SYCL/ESIMD/Stencil.cpp b/SYCL/ESIMD/Stencil.cpp index fa4d391dfe..c7068c972f 100644 --- a/SYCL/ESIMD/Stencil.cpp +++ b/SYCL/ESIMD/Stencil.cpp @@ -164,7 +164,7 @@ int main(int argc, char *argv[]) { in.row(i + 10).select(5) * 0.02f; // predciate output - simd p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; + simd_mask p = (elm16 + h_pos * WIDTH) < (DIM_SIZE - 10); simd elm16_off = elm16 * sizeof(float) + out_off; scatter(outputMatrix, sum, elm16_off, p); diff --git a/SYCL/ESIMD/api/simd_mask.cpp b/SYCL/ESIMD/api/simd_mask.cpp new file mode 100644 index 0000000000..6ee65d2fc4 --- /dev/null +++ b/SYCL/ESIMD/api/simd_mask.cpp @@ -0,0 +1,342 @@ +//==---------------- simd_mask.cpp - DPC++ ESIMD simd_mask API test -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda +// RUN: %clangxx -fsycl-unnamed-lambda -fsycl -I%S/.. %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// Smoke test for simd_mask API functionality. + +#include "esimd_test_utils.hpp" + +#include +#include +#include +#include +#include +#include + +using namespace sycl::ext::intel::experimental::esimd; +using namespace cl::sycl; + +template using value_type = typename simd_mask::element_type; + +template static inline constexpr value_type Error = 0; +template static inline constexpr value_type Pass = 1; + +// Slow mask storage function independent of simd_mask::copy_to (memory) and +// simd_mask::value_type. +template +static SYCL_ESIMD_FUNCTION void store(value_type *Ptr, simd_mask M) { + value_type Arr[N]; + M.copy_to(Arr); + + for (auto I = 0; I < N; ++I) { + Ptr[I] = Arr[I] ? 1 : 0; + } +} + +// Slow mask storage function independent of simd_mask::copy_from (memory) and +// simd_mask::value_type. 
+template +static SYCL_ESIMD_FUNCTION simd_mask load(value_type *Ptr) { + value_type Arr[N]; + for (auto I = 0; I < N; ++I) { + Arr[I] = Ptr[I] ? 1 : 0; + } + simd_mask M(std::move(Arr)); + return M; +} + +// Apply F to each element of M and write result to Res. +template +static SYCL_ESIMD_FUNCTION void +check_mask(const simd_mask &M, typename simd_mask::element_type *Res, + PerElemF F) { + for (auto I = 0; I < N; ++I) { + value_type Val = F(M[I]) ? Pass : Error; + Res[I] = Val; + } +} + +// Slow check if M1 and M2 are equal and write result to Res. +template +static SYCL_ESIMD_FUNCTION void +check_masks_equal(const simd_mask &M1, const simd_mask &M2, + typename simd_mask::element_type *Res) { + for (auto I = 0; I < N; ++I) { + value_type Val = ((M1[I] == 0) == (M2[I] == 0)) ? Pass : Error; + Res[I] = Val; + } +} + +// Represents a generic test case. Each test case has two optional inputs - +// In and InvIn, and one mandatory output - Res. Each input and output element +// matches the simd_mask value type, there is one data element in each per +// NDRange element. InvIn is a logical inversion of In for easier validation of +// operations. +template struct sub_test { + using value_type = typename simd_mask::element_type; + + // Used to automatically free USM memory allocated for input/output. + struct usm_deleter { + queue Q; + + void operator()(value_type *Ptr) { + if (Ptr) { + sycl::free(Ptr, Q); + } + } + }; + + queue Q; + using ptr_type = std::unique_ptr; + ptr_type In; + ptr_type InvIn; + ptr_type Res; + size_t Size = N * 7; + + sub_test(queue Q, bool Need2Inputs = false) : Q(Q) { + In = ptr_type{nullptr, usm_deleter{Q}}; + InvIn = ptr_type{nullptr, usm_deleter{Q}}; + Res = ptr_type{nullptr, usm_deleter{Q}}; + init(Need2Inputs); + } + + void init(bool Need2Inputs) { + device Dev = Q.get_device(); + context Ctx = Q.get_context(); + const auto Sz = Size * sizeof(value_type); + In.reset(static_cast(malloc_shared(Sz, Dev, Ctx))); + if (Need2Inputs) + InvIn.reset(static_cast(malloc_shared(Sz, Dev, Ctx))); + Res.reset(static_cast(malloc_shared(Sz, Dev, Ctx))); + if (!In || (Need2Inputs && !InvIn) || !Res) { + throw sycl::exception(std::error_code{}, "malloc_shared failed"); + } + for (unsigned I = 0; I < Size; I += N) { + unsigned J = 0; + + for (; J < N / 2; ++J) { + auto Ind = I + J; + In.get()[Ind] = 1; + if (Need2Inputs) + InvIn.get()[Ind] = 0; + Res.get()[Ind] = Error; + } + for (; J < N; ++J) { + auto Ind = I + J; + In.get()[Ind] = 0; + if (Need2Inputs) + InvIn.get()[Ind] = 1; + Res.get()[Ind] = Error; + } + } + } + + // The main test function which submits the test kernel F. + template bool run(const char *Name, FuncType F) { + std::cout << " Running " << Name << " API test, N=" << N << "...\n"; + + // Submit the kernel. + try { + cl::sycl::range<1> R{Size / N}; + auto E = Q.submit([&](handler &CGH) { CGH.parallel_for(R, F); }); + E.wait(); + } catch (sycl::exception &Exc) { + std::cout << " *** ERROR. SYCL exception caught: << " << Exc.what() + << "\n"; + return false; + } + // Verify results - basically see if there are no non-zeros in the 'Res' + // array. + int ErrCnt = 0; + + for (auto I = 0; I < Size; ++I) { + if (Res.get()[I] == Error) { + if (++ErrCnt < 10) { + std::cout << " failed at index " << I << "\n"; + } + } + } + if (ErrCnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - ErrCnt) / (float)Size) * 100.0f << "% (" + << (Size - ErrCnt) << "/" << Size << ")\n"; + } + std::cout << (ErrCnt > 0 ? 
" FAILED\n" : " Passed\n"); + return ErrCnt == 0; + } +}; + +// Defines actual test cases. +template struct simd_mask_api_test { + using value_type = typename simd_mask::element_type; + + bool run(queue Q) { + bool Passed = true; + + // Tests for constructors and operators ! []. + { + sub_test Test(Q); + value_type *In = Test.In.get(); + value_type *Res = Test.Res.get(); + Passed &= Test.run( + "broadcast constructor, operator[]", [=](id<1> Id) SYCL_ESIMD_KERNEL { + auto Off = Id * N; + simd_mask M0 = load(In + Off); // 1..1,0...0 + simd_mask M1(M0[0]); + check_mask(M1, Res + Off, [](value_type V) { return V != 0; }); + }); + } + { + sub_test Test(Q); + value_type *Res = Test.Res.get(); + Passed &= + Test.run("value initialization", [=](id<1> Id) SYCL_ESIMD_KERNEL { + auto Off = Id * N; + simd_mask M0{}; + // TODO FIXME Shorter version not work due to a BE bug +#define WORKAROUND_BE_BUG +#ifdef WORKAROUND_BE_BUG + for (auto I = 0; I < N; ++I) { + if (M0[I] == 0) { + Res[Off + I] = Pass; + } + // else write Error, but its already there + } +#else + check_mask(M0, Res + Off, [](value_type V) { return (V == 0); }); +#endif // WORKAROUND_BE_BUG +#undef WORKAROUND_BE_BUG + }); + } + { + sub_test Test(Q, true /*need InInv*/); + value_type *In = Test.In.get(); + value_type *InInv = Test.InvIn.get(); + value_type *Res = Test.Res.get(); + Passed &= Test.run("operator!", [=](id<1> Id) SYCL_ESIMD_KERNEL { + auto Off = Id * N; + simd_mask M0 = load(In + Off); // 1..1,0...0 + simd_mask M1 = !M0; + simd_mask M2 = load(InInv + Off); // 0..0,1...1 + check_masks_equal(M1, M2, Res + Off); + }); + } + + // Tests for binary and assignment operators. + +#define RUN_TEST(Op, Gold) \ + { \ + sub_test Test(Q, true /*need InInv*/); \ + value_type *In = Test.In.get(); \ + value_type *InInv = Test.InvIn.get(); \ + value_type *Res = Test.Res.get(); \ + Passed &= Test.run("operator " #Op, [=](id<1> Id) SYCL_ESIMD_KERNEL { \ + auto Off = Id * N; \ + simd_mask M0 = load(In + Off); /* 1..1,0...0 */ \ + simd_mask M1 = load(InInv + Off); /* 0..0,1...1 */ \ + simd_mask M2 = M0 Op M1; \ + simd_mask MGold((value_type)Gold); \ + check_masks_equal(M2, MGold, Res + Off); \ + }); \ + } + + RUN_TEST(&&, 0); + RUN_TEST(||, 1); + RUN_TEST(&, 0); + RUN_TEST(|, 1); + RUN_TEST(^, 1); + RUN_TEST(==, 0); + RUN_TEST(!=, 1); + RUN_TEST(&=, 0); + RUN_TEST(|=, 1); + RUN_TEST(^=, 1); +#undef RUN_TEST + + if constexpr (N == 8 || N == 32) { + // Tests for APIs that access memory. + { + sub_test Test(Q); + value_type *In = Test.In.get(); + value_type *Res = Test.Res.get(); + Passed &= Test.run("load constructor", [=](id<1> Id) SYCL_ESIMD_KERNEL { + auto Off = Id * N; + simd_mask M0 = load(In + Off); + simd_mask M1(In + Off); + check_masks_equal(M0, M1, Res + Off); + }); + } + { + sub_test Test(Q); + value_type *In = Test.In.get(); + value_type *Res = Test.Res.get(); + Passed &= Test.run("copy_from", [=](id<1> Id) SYCL_ESIMD_KERNEL { + auto Off = Id * N; + simd_mask M0 = load(In + Off); + simd_mask M1; + M1.copy_from(In + Off); + check_masks_equal(M0, M1, Res + Off); + }); + } + { + sub_test Test(Q, true /*need InInv*/); + value_type *In = Test.In.get(); + value_type *InInv = Test.InvIn.get(); + value_type *Res = Test.Res.get(); + Passed &= Test.run("copy_to", [=](id<1> Id) SYCL_ESIMD_KERNEL { + auto Off = Id * N; + simd_mask M0 = load(In + Off); + M0.copy_to(InInv + Off); + simd_mask M1 = load(InInv + Off); + check_masks_equal(M0, M1, Res + Off); + }); + } + // Tests for APIs select operation. 
+ { + sub_test Test(Q, true /*need InInv*/); + value_type *In = Test.In.get(); + value_type *InInv = Test.InvIn.get(); + value_type *Res = Test.Res.get(); + Passed &= Test.run("read/write through simd_mask::select() ", + [=](id<1> Id) SYCL_ESIMD_KERNEL { + auto Off = Id * N; + simd_mask M0 = load(In + Off); // 1..1,0...0 + simd_mask M1(0); + // swap halves of M0 into M1 + M1.template select(0) = + M0.template select(N / 2); + M1.template select(N / 2) = + M0.template select(0); + // Read the inversed mask, which should be equal to + // M1 + simd_mask M2 = load(InInv + Off); + M2.template select(0) &= + M1.template select(0); // no-op + check_masks_equal(M1, M2, Res + Off); + }); + } + } + return Passed; + } +}; + +int main(int argc, char **argv) { + queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + auto Dev = Q.get_device(); + std::cout << "Running on " << Dev.get_info() << "\n"; + bool Passed = true; + // Run tests for different mask size, including the one exceeding the h/w flag + // register width and being not multiple of such. + Passed &= simd_mask_api_test<8>().run(Q); + Passed &= simd_mask_api_test<32>().run(Q); + Passed &= simd_mask_api_test<67>().run(Q); + std::cout << (Passed ? "Test Passed\n" : "Test FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/SYCL/ESIMD/histogram_raw_send.cpp b/SYCL/ESIMD/histogram_raw_send.cpp index b0262fa1fe..869cb6871a 100644 --- a/SYCL/ESIMD/histogram_raw_send.cpp +++ b/SYCL/ESIMD/histogram_raw_send.cpp @@ -65,7 +65,7 @@ int checkHistogram(unsigned int *refHistogram, unsigned int *hist) { using namespace sycl::ext::intel::experimental::esimd; template ESIMD_INLINE void atomic_write(T *bins, simd offset, - simd src0, simd pred) { + simd src0, simd_mask pred) { simd oldDst; simd vAddr(reinterpret_cast(bins)); simd vOffset = offset; @@ -82,7 +82,8 @@ ESIMD_INLINE void atomic_write(T *bins, simd offset, constexpr uint8_t isSendc = 0; esimd_raw_sends_load(oldDst, vAddr, src0, exDesc, desc, execSize, sfid, - numSrc0, numSrc1, numDst, isEOT, isSendc, pred); + numSrc0, numSrc1, numDst, isEOT, isSendc, + simd_mask{pred}); } int main(int argc, char *argv[]) { diff --git a/SYCL/ESIMD/kmeans/kmeans.cpp b/SYCL/ESIMD/kmeans/kmeans.cpp index ec4f217a99..37763eedee 100644 --- a/SYCL/ESIMD/kmeans/kmeans.cpp +++ b/SYCL/ESIMD/kmeans/kmeans.cpp @@ -311,9 +311,9 @@ int main(int argc, char *argv[]) { int j = c / SIMD_SIZE; int m = c & (SIMD_SIZE - 1); - xsum.row(j).select<1, 0>(m) += pointsXY.row(0)[k]; - ysum.row(j).select<1, 0>(m) += pointsXY.row(1)[k]; - npoints.row(j).select<1, 0>(m) += 1; + xsum.row(j).select<1, 1>(m) += pointsXY.row(0)[k]; + ysum.row(j).select<1, 1>(m) += pointsXY.row(1)[k]; + npoints.row(j).select<1, 1>(m) += 1; } } simd offsets(0, sizeof(Accum4)); @@ -359,11 +359,11 @@ int main(int argc, char *argv[]) { simd centroid(0); int num = reduce(npoints, std::plus<>()); - centroid.select<1, 0>(0) = reduce(xsum, std::plus<>()) / num; - centroid.select<1, 0>(1) = reduce(ysum, std::plus<>()) / num; - (centroid.bit_cast_view()).select<1, 0>(2) = num; + centroid.select<1, 1>(0) = reduce(xsum, std::plus<>()) / num; + centroid.select<1, 1>(1) = reduce(ysum, std::plus<>()) / num; + (centroid.bit_cast_view()).select<1, 1>(2) = num; - simd mask(0); + simd_mask mask(0); mask.select<3, 1>(0) = 1; int i = it.get_global_id(0) / SIMD_SIZE; int k = it.get_global_id(0) & (SIMD_SIZE - 1); diff --git a/SYCL/ESIMD/linear/linear.cpp b/SYCL/ESIMD/linear/linear.cpp index e22aa9dafe..72b7f0f44d 100644 --- a/SYCL/ESIMD/linear/linear.cpp +++ 
b/SYCL/ESIMD/linear/linear.cpp @@ -115,7 +115,7 @@ int main(int argc, char *argv[]) { m += in.select<6, 1, 24, 1>(2, 6); m = m * 0.111f; - vout = vm; + vout = convert(vm); media_block_store(accOutput, h_pos * 24, v_pos * 6, out); diff --git a/SYCL/ESIMD/mandelbrot/mandelbrot.cpp b/SYCL/ESIMD/mandelbrot/mandelbrot.cpp index ff4f5fd425..e095d82b89 100644 --- a/SYCL/ESIMD/mandelbrot/mandelbrot.cpp +++ b/SYCL/ESIMD/mandelbrot/mandelbrot.cpp @@ -61,7 +61,7 @@ ESIMD_INLINE void mandelbrot(ACC out_image, int ix, int iy, int crunch, mtemp += 1; } while ((mtemp < crunch) & (xx + yy < 4.0f)); - m.select<1, 0>(lane) = mtemp; + m.select<1, 1>(lane) = mtemp; } // SIMT_END diff --git a/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp b/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp index a71f02e9b6..2e632bd5e7 100644 --- a/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp +++ b/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp @@ -63,7 +63,7 @@ ESIMD_INLINE void mandelbrot(ACC out_image, int ix, int iy, int crunch, mtemp += 1; } while ((mtemp < crunch) & (xx + yy < thrs)); - m.select<1, 0>(lane) = mtemp; + m.select<1, 1>(lane) = mtemp; } simd color = (((m * 15) & 0xff)) + (((m * 7) & 0xff) * 256) + diff --git a/SYCL/ESIMD/reduction.cpp b/SYCL/ESIMD/reduction.cpp index cf0a86606e..9b90bc33e0 100644 --- a/SYCL/ESIMD/reduction.cpp +++ b/SYCL/ESIMD/reduction.cpp @@ -61,10 +61,10 @@ int main(void) { va.copy_from(A + i * VL); simd vb; - vb.select<1, 0>(0) = reduce(va, std::plus<>()); - vb.select<1, 0>(1) = reduce(va, std::multiplies<>()); - vb.select<1, 0>(2) = hmax(va); - vb.select<1, 0>(3) = hmin(va); + vb[0] = reduce(va, std::plus<>()); + vb[1] = reduce(va, std::multiplies<>()); + vb[2] = hmax(va); + vb[3] = hmin(va); vb.copy_to(B + i * VL); }); diff --git a/SYCL/ESIMD/regression/dgetrf.cpp b/SYCL/ESIMD/regression/dgetrf.cpp index bbf0b5f8bf..3716ecec41 100644 --- a/SYCL/ESIMD/regression/dgetrf.cpp +++ b/SYCL/ESIMD/regression/dgetrf.cpp @@ -63,7 +63,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { auto a = V(GRF, M * N, 0); if (K % 8) { - simd mask = 1; + simd_mask<8> mask = 1; for (int k = 0; k < K % 8; k++) V1(mask, k) = 0; diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp new file mode 100644 index 0000000000..679c162079 --- /dev/null +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -0,0 +1,314 @@ +//==-------------- dgetrf.cpp - DPC++ ESIMD on-device test ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -I%S/.. -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out 1 +// +// Reduced version of dgetrf.cpp - M = 8, N = 8, single batch. +// +#include +#include +#include +#include +#include +#include + +#define ABS(x) ((x) >= 0 ? (x) : -(x)) +#define MIN(x, y) ((x) <= (y) ? (x) : (y)) +#define MAX(x, y) ((x) >= (y) ? (x) : (y)) +#define FP_RAND ((double)rand() / (double)RAND_MAX) + +#define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) +#define OUT(text, ...) 
OUTN(text "\n", ##__VA_ARGS__) + +#define CHECK(cmd, status) \ + do { \ + cmd; \ + if (status) { \ + OUT(#cmd " status: %d", status); \ + exit(1); \ + } \ + } while (0) +#define FAILED(res, thresh) ((res) > (thresh) || (res) != (res)) +#define CHECK_AND_REPORT(test_desc, test_id, fail_cond, res, fail_cnt) \ + do { \ + if (fail_cond) \ + fail_cnt++; \ + OUT("Test (%s): " test_desc ". Result: %f. %s", test_id, res, \ + (fail_cond) ? "FAILED" : "PASSED"); \ + } while (0) + +using namespace cl::sycl; +using namespace std; +using namespace sycl::ext::intel::experimental::esimd; + +ESIMD_PRIVATE ESIMD_REGISTER(192) simd GRF; + +#define V(x, w, i) (x).template select(i) +#define V1(x, i) V(x, 1, i) +#define V8(x, i) V(x, 8, i) +#define BCAST8(x, i) (x).template replicate<8, 1>(i) + +template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { + auto a = V(GRF, M * N, 0); + for (int kk = 0; kk < N; kk += 8) { + simd mask = 1; + for (int k = 0; k < 8 && kk + k < N; k++) { + auto ak = V(a, M, (kk + k) * M); + auto ak0 = V8(ak, kk + K); + + V1(mask, k) = 0; + if (ak0[k] != 0.0) { + // scal + double temp = 1.0 / ak0[k]; + ak0.merge(ak0 * temp, mask); + for (int i = 8 + K + kk; i < M; i += 8) { + V8(ak, i) *= temp; + } + + // update + for (int j = kk + k + 1; j < N; j++) { + auto aj = V(a, M, j * M); + auto aj0 = V8(aj, kk + K); + auto temp = BCAST8(aj0, k); + aj0.merge(aj0 - temp * ak0, aj0, mask); + for (int i = 8 + K + kk; i < M; i += 8) { + V8(aj, i) -= temp * V8(ak, i); + } + } + } else if (*info == 0) { + *info = K + kk + k + 1; + } + } + } +} + +// A left-looking algorithm step +// M, N - a panel size to be updated and factorized (M * N <= 64 * 6), must fit +// into GRF K - an update rank P0=A[0:M,0:K] = column(F=A[0:K,0:K], +// L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], +// T=A[K:M,K:K+N]) - panel to be updated +template +ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { + auto p1 = V(GRF, M * N, 0); + double *a1; + int i, j, k; + + // load P1 + for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) + for (i = 0; i < M; i += 8) { + simd data; + data.copy_from(a1 + i); + V8(p1, j * M + i) = data; + } + // (getrf) factorize T=P*L*U + dgetrfnp_panel(info); + + // store P1 + for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) + for (i = 0; i < M; i += 8) { + simd vals = V8(p1, j * M + i); + vals.copy_to(a1 + i); + } +} + +ESIMD_INLINE void dgetrfnp_esimd_8x8(double *a, int64_t lda, int64_t *ipiv, + int64_t *info) { + *info = 0; + dgetrfnp_left_step<8, 8, 0>(a, lda, info); +} + +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { + queue queue((gpu_selector())); + auto device = queue.get_device(); + auto context = queue.get_context(); + int status; + + CHECK(status = device.is_gpu(), !status); + + double *a_gpu; + int64_t *ipiv_gpu; + int64_t *info_gpu; + CHECK(a_gpu = static_cast( + malloc_shared(stride_a * batch * sizeof(double), device, context)), + !a_gpu); + CHECK(ipiv_gpu = static_cast(malloc_shared( + stride_ipiv * batch * sizeof(int64_t), device, context)), + !ipiv_gpu); + CHECK(info_gpu = static_cast( + malloc_shared(batch * sizeof(int64_t), device, context)), + !info_gpu); + + memcpy(a_gpu, a, stride_a * batch * sizeof(double)); + + sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, + sycl::range<1>{1}); + try { + auto event = queue.submit([&](handler &cgh) { + cgh.parallel_for( + range, [=](nd_item<1> id) 
SYCL_ESIMD_KERNEL { + int i = id.get_global_id(0); + dgetrfnp_esimd_8x8(&a_gpu[i * stride_a], lda, + &ipiv_gpu[i * stride_ipiv], &info_gpu[i]); + }); + }); + event.wait(); + } catch (const sycl::exception &e) { + std::cout << "*** EXCEPTION caught: " << e.what() << "\n"; + free(a_gpu, context); + free(ipiv_gpu, context); + free(info_gpu, context); + return; + } + + memcpy(a, a_gpu, stride_a * batch * sizeof(double)); + memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); + memcpy(info, info_gpu, batch * sizeof(int64_t)); + + free(a_gpu, context); + free(ipiv_gpu, context); + free(info_gpu, context); +} + +static void fp_init(int64_t m, int64_t n, double *a, int64_t lda) { + int64_t i, j; + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + a[i + j * lda] = 2.0 * FP_RAND - 1.0; +} + +static void fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double *b, + int64_t ldb) { + int64_t i, j; + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + b[i + j * ldb] = a[i + j * lda]; +} + +static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { + double sum, value = 0.0; + int64_t i, j; + for (j = 0; j < n; j++) { + sum = 0.0; + for (i = 0; i < m; i++) + sum += ABS(a[i + j * lda]); + if (value < sum) + value = sum; + } + return value; +} + +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, + double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { + double thresh = 30.0; + int fail = 0; + int64_t i, j, k, l; + char label[1024]; + unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; + double res = 0.0, nrm = 0.0, ulp = *(double *)prec_b; + double *w = (double *)malloc(sizeof(double) * MAX(m * n, 1)); + + sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); + + for (k = 0; k < batch; k++) { + /* info == 0 */ + CHECK_AND_REPORT("info == 0", label, info[k] != 0, (double)info[k], fail); + + if (m > 0 && n > 0) { + /* | L U - A | / ( |A| n ulp ) */ + memset(w, 0, sizeof(double) * m * n); + if (m < n) { + for (j = 0; j < n; j++) + for (i = 0; i <= j; i++) + w[i + j * m] = a[i + j * lda + k * stride_a]; + for (i = m - 1; i >= 0; i--) + for (j = 0; j < n; j++) + for (l = 0; l < i; l++) + w[i + j * m] += a[i + l * lda + k * stride_a] * w[l + j * m]; + } else { + for (j = 0; j < n; j++) + for (i = j; i < m; i++) + w[i + j * m] = a[i + j * lda + k * stride_a]; + for (j = 0; j < n; j++) + w[j + j * m] = 1.0; + for (j = n - 1; j >= 0; j--) + for (i = 0; i < m; i++) { + w[i + j * m] *= a[j + j * lda + k * stride_a]; + for (l = 0; l < j; l++) + w[i + j * m] += w[i + l * m] * a[l + j * lda + k * stride_a]; + } + } + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + w[i + j * m] -= a_in[k * stride_a + i + j * lda]; + res = fp_norm1(m, n, w, m); + nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); + nrm *= (double)n * ulp; + res /= nrm > 0.0 ? 
nrm : ulp; + CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, + FAILED(res, thresh), res, fail); + } + } + + free(w); + return fail; +} + +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info); + +int main(int argc, char *argv[]) { + int exit_status = 0; + constexpr int64_t m = 8, n = 8, lda = 8; + int64_t stride_a = lda * n, stride_ipiv = n; + + srand(1); + + for (int i = 1; i < argc; i++) { + int64_t batch = (int64_t)atoi(argv[i]); + batch = MAX(batch, 0); + int64_t a_count = MAX(stride_a * batch, 1); + int64_t ipiv_count = MAX(stride_ipiv * batch, 1); + int64_t info_count = MAX(batch, 1); + double *a = NULL, *a_copy = NULL; + int64_t *ipiv = NULL, *info = NULL; + CHECK(a = (double *)malloc(sizeof(double) * a_count), !a); + CHECK(a_copy = (double *)malloc(sizeof(double) * a_count), !a_copy); + CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); + CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); + + /* Initialize input data */ + for (int64_t k = 0; k < batch; k++) { + fp_init(m, n, &a_copy[k * stride_a], lda); + fp_copy(m, n, &a_copy[k * stride_a], lda, &a[k * stride_a], lda); + } + + /* Run the tested function */ + dgetrfnp_batch_strided_c(m, n, a, lda, stride_a, ipiv, stride_ipiv, batch, + info); + + /* Check that the computation completed successfully */ + exit_status += dgetrfnp_batch_strided_check(m, n, a_copy, a, lda, stride_a, + ipiv, stride_ipiv, batch, info); + + free(a); + free(a_copy); + free(ipiv); + free(info); + } + return exit_status; +} diff --git a/SYCL/ESIMD/stencil2.cpp b/SYCL/ESIMD/stencil2.cpp index 9f6799b704..85f47527c1 100644 --- a/SYCL/ESIMD/stencil2.cpp +++ b/SYCL/ESIMD/stencil2.cpp @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) { vin.select(GET_IDX(i + 10, 5)) * 0.02f; // predciate output - simd p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; + simd_mask p = (elm16 + h_pos * WIDTH) < (DIM_SIZE - 10); simd elm16_off = elm16 * sizeof(float) + out_off; scatter(outputMatrix, sum, elm16_off, p); diff --git a/SYCL/ESIMD/usm_gather_scatter_rgba.cpp b/SYCL/ESIMD/usm_gather_scatter_rgba.cpp index c72babedd5..a0c48719b4 100644 --- a/SYCL/ESIMD/usm_gather_scatter_rgba.cpp +++ b/SYCL/ESIMD/usm_gather_scatter_rgba.cpp @@ -42,7 +42,7 @@ struct Kernel { simd byteOffsets(0, STRIDE * sizeof(T) * NUM_RGBA_CHANNELS); simd v = gather_rgba(bufIn + global_offset, byteOffsets); - v += i; + v += (int)i; simd pred = 1; pred[VL - MASKED_LANE_NUM_REV] = 0; // mask out the last lane
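
The hunks above migrate predicate and "flip" operands from plain simd vectors (e.g. simd<ushort, N>) to the dedicated simd_mask<N> type. Below is a minimal sketch of the resulting predicate pattern, modeled on the PrefixSum.cpp / Prefix_Local_sum*.cpp hunks; the function name, the TUPLE_SZ parameter, and the header path are illustrative assumptions, not part of the patch.

#include <sycl/ext/intel/experimental/esimd.hpp> // header path may differ across toolchain versions
using namespace sycl::ext::intel::experimental::esimd;

// Store only the first TUPLE_SZ lanes of an 8-wide vector, as the prefix-sum
// kernels do when writing out partial tuples.
template <unsigned TUPLE_SZ>
SYCL_ESIMD_FUNCTION void masked_tail_store(unsigned *buf, simd<unsigned, 8> vals,
                                           unsigned global_offset) {
  simd<unsigned, 8> voff(0, 1);       // lane indices 0, 1, 2, ...
  simd_mask<8> p = voff < TUPLE_SZ;   // lane-wise comparison now yields a mask, not simd<ushort, 8>
  voff = (voff + global_offset) * sizeof(unsigned); // convert element indices to byte offsets
  scatter(buf, vals, voff, p);        // lanes with a zero mask bit are left untouched
}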