diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index 73ae2d7e5428b..9c3eb5a1b55c8 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -14,6 +14,8 @@ #include #include +#include + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ONEAPI { @@ -386,12 +388,30 @@ class reducer struct AreAllButLastReductions { + static constexpr bool value = + std::is_base_of::value && + AreAllButLastReductions::value; +}; + +/// Helper specialization of AreAllButLastReductions for one element only. +/// Returns true if the template parameter is not a reduction. +template struct AreAllButLastReductions { + static constexpr bool value = !std::is_base_of::value; +}; + /// This class encapsulates the reduction variable/accessor, /// the reduction operator and an optional operator identity. template -class reduction_impl { +class reduction_impl : private reduction_impl_base { public: using reducer_type = reducer; using result_type = T; @@ -405,6 +425,9 @@ class reduction_impl { static constexpr access::mode accessor_mode = AccMode; static constexpr int accessor_dim = Dims; static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims; + using local_accessor_type = + accessor; + static constexpr bool has_fast_atomics = IsReduOptForFastAtomicFetch::value; static constexpr bool has_fast_reduce = @@ -541,11 +564,8 @@ class reduction_impl { #endif } - accessor - getReadWriteLocalAcc(size_t Size, handler &CGH) { - return accessor(Size, CGH); + static local_accessor_type getReadWriteLocalAcc(size_t Size, handler &CGH) { + return local_accessor_type(Size, CGH); } accessor @@ -554,6 +574,30 @@ class reduction_impl { return accessor(*MOutBufPtr, CGH); } + /// Returns the user's USM pointer passed to the reduction for editing. + template + std::enable_if_t + getWriteMemForPartialReds(size_t, handler &) { + return getUSMPointer(); + } + + /// Returns the user's accessor passed to the reduction for editing. + template + std::enable_if_t + getWriteMemForPartialReds(size_t, handler &) { + return *MAcc; + } + + /// Constructs a new temporary buffer to hold partial sums and returns + /// an accessor to that buffer. + template + std::enable_if_t + getWriteMemForPartialReds(size_t Size, handler &CGH) { + MOutBufPtr = std::make_shared>(range<1>(Size)); + CGH.addReduction(MOutBufPtr); + return accessor_type(*MOutBufPtr, CGH); + } + template enable_if_t<_IsPlaceholder == access::placeholder::false_t, accessor_type> getWriteAccForPartialReds(size_t Size, handler &CGH) { @@ -642,26 +686,24 @@ template class __sycl_reduction_aux_kernel; /// Helper structs to get additional kernel name types based on given -/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then -/// \c Type becomes the \c Name. -template +/// \c Name and additional template parameters helping to distinguish kernels. +/// If \c Name is undefined (is \c auto_name) then \c Type becomes the \c Name.
+template struct get_reduction_main_kernel_name_t { using name = __sycl_reduction_main_kernel< - typename sycl::detail::get_kernel_name_t::name, B1, B2, - OutputT>; + typename sycl::detail::get_kernel_name_t::name, B1, B2, T3>; }; -template +template struct get_reduction_aux_kernel_name_t { using name = __sycl_reduction_aux_kernel< - typename sycl::detail::get_kernel_name_t::name, B1, B2, - OutputT>; + typename sycl::detail::get_kernel_name_t::name, B1, B2, T3>; }; /// Implements a command group function that enqueues a kernel that calls /// user's lambda function KernelFunc and also does one iteration of reduction /// of elements computed in user's lambda function. /// This version uses ONEAPI::reduce() algorithm to reduce elements in each -/// of work-groups, then it calls fast sycl atomic operations to update +/// of work-groups, then it calls fast SYCL atomic operations to update /// user's reduction variable. /// /// Briefly: calls user's lambda, ONEAPI::reduce() + atomic, INT + ADD/MIN/MAX. @@ -688,7 +730,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, /// user's lambda function KernelFunc and also does one iteration of reduction /// of elements computed in user's lambda function. /// This version uses tree-reduction algorithm to reduce elements in each -/// of work-groups, then it calls fast sycl atomic operations to update +/// of work-groups, then it calls fast SYCL atomic operations to update /// user's reduction variable. /// /// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR. @@ -696,7 +738,7 @@ template enable_if_t reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, - Reduction &Redu, OutputT Out) { + Reduction &, OutputT Out) { size_t WGSize = Range.get_local_range().size(); // Use local memory to reduce elements in work-groups into zero-th element. @@ -704,7 +746,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, // The additional last element is used to catch reduce elements that could // otherwise be lost in the tree-reduction algorithm used in the kernel. size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1); - auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH); + auto LocalReds = Reduction::getReadWriteLocalAcc(NLocalElements, CGH); using Name = typename get_reduction_main_kernel_name_t< KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name; @@ -830,7 +872,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, // The additional last element is used to catch elements that could // otherwise be lost in the tree-reduction algorithm. size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1); - auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH); + auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH); typename Reduction::result_type ReduIdentity = Redu.getIdentity(); using Name = typename get_reduction_main_kernel_name_t< KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name; @@ -959,7 +1001,7 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, // The additional last element is used to catch elements that could // otherwise be lost in the tree-reduction algorithm. size_t NumLocalElements = WGSize + (UniformPow2WG ? 
0 : 1); - auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH); + auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH); auto ReduIdentity = Redu.getIdentity(); auto BOp = Redu.getBinaryOperation(); @@ -1050,6 +1092,481 @@ reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, return NWorkGroups; } +/// For the given 'Reductions' types pack and indices enumerating only +/// the reductions for which local accessors are needed, this function creates +/// those local accessors and returns a tuple consisting of them. +template +auto createReduLocalAccs(size_t Size, handler &CGH, + std::index_sequence) { + return std::make_tuple( + std::tuple_element_t>::getReadWriteLocalAcc( + Size, CGH)...); +} + +/// For the given 'Reductions' types pack and indices enumerating them this +/// function either creates new temporary accessors for partial sums (if IsOneWG +/// is false) or returns the user's accessor/USM-pointer (if IsOneWG is true). +template +auto createReduOutAccs(size_t NWorkGroups, handler &CGH, + std::tuple &ReduTuple, + std::index_sequence) { + return std::make_tuple( + std::get(ReduTuple).template getWriteMemForPartialReds( + NWorkGroups, CGH)...); + } + +/// For the given 'Reductions' types pack and indices enumerating them this +/// function returns accessors to buffers holding partial sums generated in the +/// previous kernel invocation. +template +auto getReadAccsToPreviousPartialReds(handler &CGH, + std::tuple &ReduTuple, + std::index_sequence) { + return std::make_tuple( + std::get(ReduTuple).getReadAccToPreviousPartialReds(CGH)...); +} + +template +std::tuple +getReduIdentities(std::tuple &ReduTuple, + std::index_sequence) { + return {std::get(ReduTuple).getIdentity()...}; +} + +template +std::tuple +getReduBOPs(std::tuple &ReduTuple, std::index_sequence) { + return {std::get(ReduTuple).getBinaryOperation()...}; +} + +template +std::tuple +createReducers(std::tuple Identities, + std::tuple BOPsTuple, + std::index_sequence) { + return {typename Reductions::reducer_type{std::get(Identities), + std::get(BOPsTuple)}...}; +} + +template +void callReduUserKernelFunc(KernelType KernelFunc, nd_item NDIt, + std::tuple &Reducers, + std::index_sequence) { + KernelFunc(NDIt, std::get(Reducers)...); +} + +template +void initReduLocalAccs(size_t LID, size_t WGSize, + std::tuple LocalAccs, + const std::tuple &Reducers, + const std::tuple Identities, + std::index_sequence) { + std::tie(std::get(LocalAccs)[LID]...) = + std::make_tuple(std::get(Reducers).MValue...); + + // For work-groups whose size is not a power of two, local accessors have + // an additional element with index WGSize that is used by the tree-reduction + // algorithm. Initialize those additional elements with identity values here. + if (!Pow2WG) + std::tie(std::get(LocalAccs)[WGSize]...) = + std::make_tuple(std::get(Identities)...); +} + +template +void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, + std::tuple LocalAccs, + std::tuple InputAccs, + const std::tuple Identities, + std::index_sequence) { + // Normally, the local accessors are initialized with elements from the input + // accessors. The exception is the case when (GID >= NWorkItems), which is + // possible only when UniformPow2WG is false. In that case the elements of the + // local accessors are initialized with the identity value, so they do not + // affect the final partial sums produced by the tree-reduction algorithm.
+ if (UniformPow2WG || GID < NWorkItems) + std::tie(std::get(LocalAccs)[LID]...) = + std::make_tuple(std::get(InputAccs)[GID]...); + else + std::tie(std::get(LocalAccs)[LID]...) = + std::make_tuple(std::get(Identities)...); + + // For work-groups whose size is not a power of two, local accessors have + // an additional element with index WGSize that is used by the tree-reduction + // algorithm. Initialize those additional elements with identity values here. + if (!UniformPow2WG) + std::tie(std::get(LocalAccs)[WGSize]...) = + std::make_tuple(std::get(Identities)...); +} + +template +void reduceReduLocalAccs(size_t IndexA, size_t IndexB, + std::tuple LocalAccs, + std::tuple BOPs, + std::index_sequence) { + std::tie(std::get(LocalAccs)[IndexA]...) = + std::make_tuple((std::get(BOPs)(std::get(LocalAccs)[IndexA], + std::get(LocalAccs)[IndexB]))...); +} + +template +void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize, + std::tuple *, + std::tuple OutAccs, + std::tuple LocalAccs, + std::tuple BOPs, + std::index_sequence, + std::index_sequence) { + // This statement is needed for read_write accessors/USM-memory only. + // It adds the initial value of the reduction variable to the final result. + std::tie(std::get(LocalAccs)[0]...) = + std::make_tuple(std::get(BOPs)( + std::get(LocalAccs)[0], + std::tuple_element_t>::getOutPointer( + std::get(OutAccs))[OutAccIndex])...); + + if (Pow2WG) { + // The partial sums for the work-group are stored in the 0-th elements of + // the local accessors. Simply write those sums to the output accessors. + std::tie(std::tuple_element_t>::getOutPointer( + std::get(OutAccs))[OutAccIndex]...) = + std::make_tuple(std::get(LocalAccs)[0]...); + } else { + // Each local accessor keeps two partial sums: in the 0-th and WGSize-th + // elements. Combine them into the final partial sums and write them to the + // output accessors. + std::tie(std::tuple_element_t>::getOutPointer( + std::get(OutAccs))[OutAccIndex]...) = + std::make_tuple(std::get(BOPs)(std::get(LocalAccs)[0], + std::get(LocalAccs)[WGSize])...); + } +} + +// Concatenate an empty sequence. +constexpr std::index_sequence<> concat_sequences(std::index_sequence<>) { + return {}; +} + +// Concatenate a sequence consisting of 1 element. +template +constexpr std::index_sequence concat_sequences(std::index_sequence) { + return {}; +} + +// Concatenate two potentially empty sequences. +template +constexpr std::index_sequence +concat_sequences(std::index_sequence, std::index_sequence) { + return {}; +} + +// Concatenate more than 2 sequences. +template +constexpr auto concat_sequences(std::index_sequence, + std::index_sequence, Rs...) { + return concat_sequences(std::index_sequence{}, Rs{}...); +} + +struct IsRWReductionPredicate { + template struct Func { + static constexpr bool value = + std::remove_pointer_t::accessor_mode == access::mode::read_write; + }; +}; + +struct IsNonUsmReductionPredicate { + template struct Func { + static constexpr bool value = !std::remove_pointer_t::is_usm; + }; +}; + +struct EmptyReductionPredicate { + template struct Func { static constexpr bool value = false; }; +}; + +template struct FilterElement { + using type = + std::conditional_t, std::index_sequence<>>; +}; + +/// For each index 'I' from the given indices pack 'Is' this function initially +/// creates a number of short index_sequences, where each such short +/// sequence is either empty (if the given Functor returns false for the +/// type T[I]) or contains the single element 'I' (otherwise).
After that this function +/// concatenates those short sequences into one and returns the result sequence. +template 0), int> Z = 0> +constexpr auto filterSequenceHelper(FunctorT, std::index_sequence) { + return concat_sequences( + typename FilterElement>>::value, + Is>::type{}...); +} +template Z = 0> +constexpr auto filterSequenceHelper(FunctorT, std::index_sequence) { + return std::index_sequence<>{}; +} + +/// For each index 'I' from the given indices pack 'Is' this function returns +/// an index sequence consisting of only those 'I's for which the 'FunctorT' +/// applied to 'T[I]' returns true. +template +constexpr auto filterSequence(FunctorT F, std::index_sequence Indices) { + return filterSequenceHelper(F, Indices); +} + +template +void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, + const nd_range &Range, + std::tuple &ReduTuple, + std::index_sequence ReduIndices) { + + size_t WGSize = Range.get_local_range().size(); + size_t LocalAccSize = WGSize + (Pow2WG ? 0 : 1); + auto LocalAccsTuple = + createReduLocalAccs(LocalAccSize, CGH, ReduIndices); + + size_t NWorkGroups = IsOneWG ? 1 : Range.get_group_range().size(); + auto OutAccsTuple = + createReduOutAccs(NWorkGroups, CGH, ReduTuple, ReduIndices); + auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices); + auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices); + + using Name = typename get_reduction_main_kernel_name_t< + KernelName, KernelType, Pow2WG, IsOneWG, decltype(OutAccsTuple)>::name; + CGH.parallel_for(Range, [=](nd_item NDIt) { + auto ReduIndices = std::index_sequence_for(); + auto ReducersTuple = + createReducers(IdentitiesTuple, BOPsTuple, ReduIndices); + // The .MValue field of each of the elements in ReducersTuple + // gets initialized in this call. + callReduUserKernelFunc(KernelFunc, NDIt, ReducersTuple, ReduIndices); + + size_t WGSize = NDIt.get_local_range().size(); + size_t LID = NDIt.get_local_linear_id(); + initReduLocalAccs(LID, WGSize, LocalAccsTuple, ReducersTuple, + IdentitiesTuple, ReduIndices); + NDIt.barrier(); + + size_t PrevStep = WGSize; + for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) { + // LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple, + ReduIndices); + } else if (!Pow2WG && LID == CurStep && (PrevStep & 0x1)) { + // LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]); + reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple, + ReduIndices); + } + NDIt.barrier(); + PrevStep = CurStep; + } + + // Compute the partial sum/reduction for the work-group. + if (LID == 0) { + size_t GrID = NDIt.get_group_linear_id(); + // If there is only one work-group, then the original accessors need to be + // updated, i.e. after the work in each work-group is done, the work-group + // result is added to the original value of the read-write accessors or + // USM memory. 
+ std::conditional_t + Predicate; + auto RWReduIndices = + filterSequence(Predicate, ReduIndices); + writeReduSumsToOutAccs( + GrID, WGSize, (std::tuple *)nullptr, OutAccsTuple, + LocalAccsTuple, BOPsTuple, ReduIndices, RWReduIndices); + } + }); +} + +template +void reduCGFunc(handler &CGH, KernelType KernelFunc, + const nd_range &Range, + std::tuple &ReduTuple, + std::index_sequence ReduIndices) { + size_t WGSize = Range.get_local_range().size(); + size_t NWorkGroups = Range.get_group_range().size(); + bool Pow2WG = (WGSize & (WGSize - 1)) == 0; + if (NWorkGroups == 1) { + if (Pow2WG) + reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, + ReduIndices); + else + reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, + ReduIndices); + } else { + if (Pow2WG) + reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, + ReduIndices); + else + reduCGFuncImpl(CGH, KernelFunc, Range, + ReduTuple, ReduIndices); + } +} + +// The list of reductions may be empty; in that case there is nothing to do. +// This function is intentionally made a template to avoid keeping it in the +// SYCL library, which would be less efficient and would also require +// preserving it indefinitely to maintain backward ABI compatibility. +template +std::enable_if_t::value> +associateReduAccsWithHandlerHelper(HandlerT &) {} + +template +void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu) { + Redu.associateWithHandler(CGH); +} + +template 0), int> Z = 0> +void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu, + RestT &... Rest) { + Redu.associateWithHandler(CGH); + associateReduAccsWithHandlerHelper(CGH, Rest...); +} + +template +void associateReduAccsWithHandler(handler &CGH, + std::tuple &ReduTuple, + std::index_sequence) { + associateReduAccsWithHandlerHelper(CGH, std::get(ReduTuple)...); +} + +template +void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, + size_t WGSize, std::tuple &ReduTuple, + std::index_sequence ReduIndices) { + // The last kernel DOES write to the user's accessor passed to the reduction. + // Associate it with the handler manually. + std::conditional_t + Predicate; + auto AccReduIndices = filterSequence(Predicate, ReduIndices); + associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices); + + size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1); + auto LocalAccsTuple = + createReduLocalAccs(LocalAccSize, CGH, ReduIndices); + auto InAccsTuple = + getReadAccsToPreviousPartialReds(CGH, ReduTuple, ReduIndices); + auto OutAccsTuple = + createReduOutAccs(NWorkGroups, CGH, ReduTuple, ReduIndices); + auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices); + auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices); + + using Name = + typename get_reduction_aux_kernel_name_t::name; + range<1> GlobalRange = {UniformPow2WG ?
NWorkItems : NWorkGroups * WGSize}; + nd_range<1> Range{GlobalRange, range<1>(WGSize)}; + CGH.parallel_for(Range, [=](nd_item<1> NDIt) { + auto ReduIndices = std::index_sequence_for(); + size_t WGSize = NDIt.get_local_range().size(); + size_t LID = NDIt.get_local_linear_id(); + size_t GID = NDIt.get_global_linear_id(); + initReduLocalAccs(LID, GID, NWorkItems, WGSize, + LocalAccsTuple, InAccsTuple, + IdentitiesTuple, ReduIndices); + NDIt.barrier(); + + size_t PrevStep = WGSize; + for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) { + // LocalAcc[LID] = BOp(LocalAcc[LID], LocalAcc[LID + CurStep]); + reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple, + ReduIndices); + } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) { + // LocalAcc[WGSize] = BOp(LocalAcc[WGSize], LocalAcc[PrevStep - 1]); + reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple, + ReduIndices); + } + NDIt.barrier(); + PrevStep = CurStep; + } + + // Compute the partial sum/reduction for the work-group. + if (LID == 0) { + size_t GrID = NDIt.get_group_linear_id(); + // If there is only one work-group, then the original accessors need to be + // updated, i.e. after the work in each work-group is done, the work-group + // result is added to the original value of the read-write accessors or + // USM memory. + std::conditional_t + Predicate; + auto RWReduIndices = + filterSequence(Predicate, ReduIndices); + writeReduSumsToOutAccs( + GrID, WGSize, (std::tuple *)nullptr, OutAccsTuple, + LocalAccsTuple, BOPsTuple, ReduIndices, RWReduIndices); + } + }); +} + +template +size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, + std::tuple &ReduTuple, + std::index_sequence ReduIndices) { + size_t NWorkGroups; + size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups); + + bool Pow2WG = (WGSize & (WGSize - 1)) == 0; + bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems); + if (NWorkGroups == 1) { + if (HasUniformWG) + reduAuxCGFuncImpl( + CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices); + else + reduAuxCGFuncImpl( + CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices); + } else { + if (HasUniformWG) + reduAuxCGFuncImpl( + CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices); + else + reduAuxCGFuncImpl( + CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices); + } + return NWorkGroups; +} + +template size_t reduGetMemPerWorkItemHelper(Reduction &) { + return sizeof(typename Reduction::result_type); +} + +template +size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) { + return sizeof(typename Reduction::result_type) + + reduGetMemPerWorkItemHelper(Rest...); +} + +template +size_t reduGetMemPerWorkItem(std::tuple &ReduTuple, + std::index_sequence) { + return reduGetMemPerWorkItemHelper(std::get(ReduTuple)...); +} + +/// Utility function: for the given tuple \param Tuple the function returns +/// a new tuple consisting of only elements indexed by the index sequence. +template +std::tuple...> +tuple_select_elements(TupleT Tuple, std::index_sequence) { + return {std::get(std::move(Tuple))...}; +} + } // namespace detail /// Creates and returns an object implementing the reduction functionality. 
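(Editor's illustration, not part of the patch.) The helpers above — concat_sequences, FilterElement, filterSequence and tuple_select_elements — implement a small compile-time pattern: filter a pack of types with a predicate to obtain an index_sequence of the matching positions, then use that sequence to pick tuple elements. The standalone C++14 sketch below shows the same idea using only the standard library; every name in it is illustrative and none of it is the library code.

#include <cstddef>
#include <tuple>
#include <type_traits>
#include <utility>

// Concatenation overloads for zero, one, and two-or-more index sequences.
constexpr std::index_sequence<> concatSeq() { return {}; }
template <std::size_t... Is>
constexpr std::index_sequence<Is...> concatSeq(std::index_sequence<Is...>) {
  return {};
}
template <std::size_t... Is, std::size_t... Js, typename... Rest>
constexpr auto concatSeq(std::index_sequence<Is...>,
                         std::index_sequence<Js...>, Rest...) {
  return concatSeq(std::index_sequence<Is..., Js...>{}, Rest{}...);
}

// Build the sequence of indices whose tuple element type satisfies Pred.
template <template <typename> class Pred, typename Tuple, std::size_t... Is>
constexpr auto filterIndices(std::index_sequence<Is...>) {
  return concatSeq(
      std::conditional_t<Pred<std::tuple_element_t<Is, Tuple>>::value,
                         std::index_sequence<Is>, std::index_sequence<>>{}...);
}

// Copy only the selected elements into a new tuple.
template <typename Tuple, std::size_t... Is>
auto selectElements(Tuple T, std::index_sequence<Is...>) {
  return std::make_tuple(std::get<Is>(T)...);
}

int main() {
  std::tuple<int, float, long, double> T{1, 2.0f, 3L, 4.0};
  auto Idx = filterIndices<std::is_integral, decltype(T)>(
      std::index_sequence_for<int, float, long, double>{});
  auto Ints = selectElements(T, Idx); // std::tuple<int, long>{1, 3}
  static_assert(std::is_same<decltype(Ints), std::tuple<int, long>>::value, "");
  (void)Ints;
  return 0;
}

In the patch, predicates such as IsNonUsmReductionPredicate and IsRWReductionPredicate play the role of Pred, and the tuple of reduction objects plays the role of the tuple being filtered, so that only the relevant reductions are touched when associating accessors or writing back work-group sums.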
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 24d8ac49ad9c0..5fbd1d99be699 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -30,6 +30,7 @@ #include #include #include +#include #include // SYCL_LANGUAGE_VERSION is 4 digit year followed by 2 digit revision @@ -244,9 +245,32 @@ enable_if_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, Reduction &Redu); +template +void reduCGFunc(handler &CGH, KernelType KernelFunc, + const nd_range &Range, + std::tuple &ReduTuple, + std::index_sequence); + +template +size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, + std::tuple &ReduTuple, + std::index_sequence); + __SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class Queue, size_t LocalMemBytesPerWorkItem); +template +size_t reduGetMemPerWorkItem(std::tuple &ReduTuple, + std::index_sequence); + +template +std::tuple...> +tuple_select_elements(TupleT Tuple, std::index_sequence); + +template struct AreAllButLastReductions; + } // namespace detail } // namespace ONEAPI @@ -1184,7 +1208,6 @@ class __SYCL_EXPORT handler { /// globally visible, there is no need for the developer to provide /// a kernel name for it. /// - /// TODO: Need to handle more than 1 reduction in parallel_for(). /// TODO: Support HOST. The kernels called by this parallel_for() may use /// some functionality that is not yet supported on HOST such as: /// barrier(), and ONEAPI::reduce() that also may be used in more @@ -1257,6 +1280,54 @@ class __SYCL_EXPORT handler { } // end while (NWorkItems > 1) } + // This version of parallel_for may handle one or more reductions packed in + // the \p Rest argument. Note though that the last element of the \p Rest pack + // is the kernel function. + // TODO: this variant is currently enabled for 2+ reductions only, as the + // versions handling 1 reduction variable are more efficient right now. + template + std::enable_if_t<(sizeof...(RestT) >= 3 && + ONEAPI::detail::AreAllButLastReductions::value)> + parallel_for(nd_range Range, RestT... Rest) { + std::tuple ArgsTuple(Rest...); + constexpr size_t NumArgs = sizeof...(RestT); + auto KernelFunc = std::get(ArgsTuple); + auto ReduIndices = std::make_index_sequence(); + auto ReduTuple = + ONEAPI::detail::tuple_select_elements(ArgsTuple, ReduIndices); + + size_t LocalMemPerWorkItem = + ONEAPI::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices); + // TODO: currently the maximal work-group size is determined for the given + // queue/device, while it would be safer to query the kernel compiled + // for the device.
+ size_t MaxWGSize = + ONEAPI::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem); + if (Range.get_local_range().size() > MaxWGSize) + throw sycl::runtime_error("The implementation handling parallel_for with" + " reduction requires work group size not bigger" + " than " + + std::to_string(MaxWGSize), + PI_INVALID_WORK_GROUP_SIZE); + + ONEAPI::detail::reduCGFunc(*this, KernelFunc, Range, ReduTuple, + ReduIndices); + shared_ptr_class QueueCopy = MQueue; + this->finalize(); + + size_t NWorkItems = Range.get_group_range().size(); + while (NWorkItems > 1) { + handler AuxHandler(QueueCopy, MIsHost); + AuxHandler.saveCodeLoc(MCodeLoc); + + NWorkItems = + ONEAPI::detail::reduAuxCGFunc( + AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices); + MLastEvent = AuxHandler.finalize(); + } // end while (NWorkItems > 1) + } + /// Hierarchical kernel invocation method of a kernel defined as a lambda /// encoding the body of each work-group to launch. /// diff --git a/sycl/test/regression/sycl-include-gnu11.cpp b/sycl/test/regression/sycl-include-gnu14.cpp similarity index 74% rename from sycl/test/regression/sycl-include-gnu11.cpp rename to sycl/test/regression/sycl-include-gnu14.cpp index c265b0ff25247..0147acf143de5 100644 --- a/sycl/test/regression/sycl-include-gnu11.cpp +++ b/sycl/test/regression/sycl-include-gnu14.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -std=gnu++11 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -std=gnu++14 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %RUN_ON_HOST %t.out // UNSUPPORTED: system-windows
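(Editor's illustration, not part of the patch.) With this change, handler::parallel_for accepts several reduction objects before the kernel functor; the new variadic overload is taken for two or more reductions, while a single reduction still goes through the pre-existing, more optimized path. The sketch below shows the intended usage. It follows the ONEAPI reduction extension as of this revision, but the exact ONEAPI::reduction, ONEAPI::plus, and ONEAPI::maximum signatures are assumptions that should be checked against the extension specification.

#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  queue Q;
  constexpr size_t N = 1024;
  int SumInit = 0;
  int MaxInit = 0;
  buffer<int, 1> SumBuf(&SumInit, range<1>(1));
  buffer<int, 1> MaxBuf(&MaxInit, range<1>(1));

  Q.submit([&](handler &CGH) {
    auto SumAcc = SumBuf.get_access<access::mode::read_write>(CGH);
    auto MaxAcc = MaxBuf.get_access<access::mode::read_write>(CGH);
    // Two reductions in a single parallel_for; the kernel functor comes last
    // and takes one reducer reference per reduction, in the same order.
    CGH.parallel_for<class TwoReductions>(
        nd_range<1>{range<1>{N}, range<1>{64}},
        ONEAPI::reduction(SumAcc, ONEAPI::plus<int>()),
        ONEAPI::reduction(MaxAcc, ONEAPI::maximum<int>()),
        [=](nd_item<1> It, auto &Sum, auto &Max) {
          int V = static_cast<int>(It.get_global_linear_id());
          Sum.combine(V); // accumulate into the first reduction
          Max.combine(V); // accumulate into the second reduction
        });
  });
  Q.wait();
  return 0;
}

Each reduction contributes sizeof(result_type) bytes of local memory per work-item (see reduGetMemPerWorkItem), so this parallel_for rejects work-group sizes larger than what reduGetMaxWGSize reports for the queue's device.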