From d4144b4d38412c8e352659b39b4c567c78377a42 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Tue, 27 Oct 2020 19:57:45 -0700 Subject: [PATCH 1/6] [SYCL] Support parallel_for() accepting many reductions Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/ONEAPI/reduction.hpp | 543 +++++++++++++++++++++- sycl/include/CL/sycl/handler.hpp | 75 ++- 2 files changed, 599 insertions(+), 19 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index f15a811c2187..83b190f95e6e 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -14,6 +14,8 @@ #include #include +#include + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ONEAPI { @@ -72,6 +74,9 @@ using IsReduBitAND = detail::bool_constant< template using IsReduOptForFastAtomicFetch = +#ifdef SYCL_REDUCTION_NO_FAST_OPTS + detail::bool_constant; +#else detail::bool_constant<(is_geninteger32bit::value || is_geninteger64bit::value) && (IsReduPlus::value || @@ -80,15 +85,20 @@ using IsReduOptForFastAtomicFetch = IsReduBitOR::value || IsReduBitXOR::value || IsReduBitAND::value)>; +#endif template using IsReduOptForFastReduce = detail::bool_constant< +#ifdef SYCL_REDUCTION_NO_FAST_OPTS + false>; +#else (is_geninteger32bit::value || is_geninteger64bit::value || std::is_same::value || std::is_same::value || std::is_same::value) && (IsReduPlus::value || IsReduMinimum::value || IsReduMaximum::value)>; +#endif // Identity = 0 template @@ -340,12 +350,31 @@ class reducer +struct are_all_but_last_reductions { + static constexpr bool value = + std::is_base_of::value && + are_all_but_last_reductions::value; +}; + +/// Helper specialization of are_all_but_last_reductions for one element only. +/// Returns true if the last and only typename is not a reduction. +template struct are_all_but_last_reductions { + static constexpr bool value = !std::is_base_of::value; +}; + /// This class encapsulates the reduction variable/accessor, /// the reduction operator and an optional operator identity. template -class reduction_impl { +class reduction_impl : private reduction_impl_base { public: using reducer_type = reducer; using result_type = T; @@ -359,6 +388,10 @@ class reduction_impl { static constexpr access::mode accessor_mode = AccMode; static constexpr int accessor_dim = Dims; static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims; + using local_accessor_type = + accessor; + static constexpr bool has_fast_atomics = IsReduOptForFastAtomicFetch::value; static constexpr bool has_fast_reduce = @@ -495,11 +528,8 @@ class reduction_impl { #endif } - accessor - getReadWriteLocalAcc(size_t Size, handler &CGH) { - return accessor(Size, CGH); + static local_accessor_type getReadWriteLocalAcc(size_t Size, handler &CGH) { + return local_accessor_type(Size, CGH); } accessor @@ -508,6 +538,30 @@ class reduction_impl { return accessor(*MOutBufPtr, CGH); } + /// Returns user's USM pointer passed to reduction for editing. + template + std::enable_if_t + getWriteMemForPartialReds(size_t, handler &CGH) { + return getUSMPointer(); + } + + /// Returns user's accessor passed to reduction for editing. + template + std::enable_if_t + getWriteMemForPartialReds(size_t, handler &CGH) { + return *MAcc; + } + + /// Constructs a new temporary buffer to hold partial sums and returns + /// the accessor that that buffer. + template + std::enable_if_t + getWriteMemForPartialReds(size_t Size, handler &CGH) { + MOutBufPtr = std::make_shared>(range<1>(Size)); + CGH.addReduction(MOutBufPtr); + return accessor_type(*MOutBufPtr, CGH); + } + template enable_if_t<_IsPlaceholder == access::placeholder::false_t, accessor_type> getWriteAccForPartialReds(size_t Size, handler &CGH) { @@ -596,19 +650,17 @@ template class __sycl_reduction_aux_kernel; /// Helper structs to get additional kernel name types based on given -/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then -/// \c Type becomes the \c Name. -template +/// \c Name and additional template parameters helping to distinguish kernels. +/// If \c Name is undefined (is \c auto_name) then \c Type becomes the \c Name. +template struct get_reduction_main_kernel_name_t { using name = __sycl_reduction_main_kernel< - typename sycl::detail::get_kernel_name_t::name, B1, B2, - OutputT>; + typename sycl::detail::get_kernel_name_t::name, B1, B2, T3>; }; -template +template struct get_reduction_aux_kernel_name_t { using name = __sycl_reduction_aux_kernel< - typename sycl::detail::get_kernel_name_t::name, B1, B2, - OutputT>; + typename sycl::detail::get_kernel_name_t::name, B1, B2, T3>; }; /// Implements a command group function that enqueues a kernel that calls @@ -650,7 +702,7 @@ template enable_if_t reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, - Reduction &Redu, OutputT Out) { + Reduction &, OutputT Out) { size_t WGSize = Range.get_local_range().size(); // Use local memory to reduce elements in work-groups into zero-th element. @@ -658,7 +710,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, // The additional last element is used to catch reduce elements that could // otherwise be lost in the tree-reduction algorithm used in the kernel. size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1); - auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH); + auto LocalReds = Reduction::getReadWriteLocalAcc(NLocalElements, CGH); using Name = typename get_reduction_main_kernel_name_t< KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name; @@ -784,7 +836,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, // The additional last element is used to catch elements that could // otherwise be lost in the tree-reduction algorithm. size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1); - auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH); + auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH); typename Reduction::result_type ReduIdentity = Redu.getIdentity(); using Name = typename get_reduction_main_kernel_name_t< KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name; @@ -913,7 +965,7 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, // The additional last element is used to catch elements that could // otherwise be lost in the tree-reduction algorithm. size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1); - auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH); + auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH); auto ReduIdentity = Redu.getIdentity(); auto BOp = Redu.getBinaryOperation(); @@ -1004,6 +1056,461 @@ reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, return NWorkGroups; } +/// For the given 'Reductions' types pack and indices enumerating only +/// the reductions for which a local accessors are needed, this function creates +/// those local accessors and returns a tuple consisting of them. +template +std::tuple +createReduLocalAccs(size_t Size, handler &CGH, std::index_sequence) { + return {Reductions::getReadWriteLocalAcc(Size, CGH)...}; +} + +/// For the given 'Reductions' types pack and indices enumerating them this +/// function either creates new temporary accessors for partial sums (if IsOneWG +/// is false) or returns user's accessor/USM-pointer if (IsOneWG is true). +template +auto createReduOutAccs(size_t NWorkGroups, handler &CGH, + std::tuple &ReduTuple, + std::index_sequence) { + return std::make_tuple( + std::get(ReduTuple).template getWriteMemForPartialReds( + NWorkGroups, CGH)...); +} + +/// For the given 'Reductions' types pack and indices enumerating them this +/// function returns accessors to buffers holding partial sums generated in the +/// previous kernel invocation. +template +auto getReadAccsToPreviousPartialReds(handler &CGH, + std::tuple &ReduTuple, + std::index_sequence) { + return std::make_tuple( + std::get(ReduTuple).getReadAccToPreviousPartialReds(CGH)...); +} + +template +std::tuple +getReduIdentities(std::tuple &ReduTuple, + std::index_sequence) { + return {std::get(ReduTuple).getIdentity()...}; +} + +template +std::tuple +getReduBOPs(std::tuple &ReduTuple, std::index_sequence) { + return {std::get(ReduTuple).getBinaryOperation()...}; +} + +template +std::tuple +createReducers(std::tuple Identities, + std::tuple BOPsTuple, + std::index_sequence) { + return {typename Reductions::reducer_type{std::get(Identities), + std::get(BOPsTuple)}...}; +} + +template +void callReduUserKernelFunc(KernelType KernelFunc, nd_item NDIt, + std::tuple &Reducers, + std::index_sequence) { + KernelFunc(NDIt, std::get(Reducers)...); +} + +template +void initReduLocalAccs(size_t LID, size_t WGSize, + std::tuple LocalAccs, + const std::tuple &Reducers, + const std::tuple Identities, + std::index_sequence) { + std::tie(std::get(LocalAccs)[LID]...) = + std::make_tuple(std::get(Reducers).MValue...); + if (!UniformPow2WG) + std::tie(std::get(LocalAccs)[WGSize]...) = + std::make_tuple(std::get(Identities)...); +} + +template +void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, + std::tuple LocalAccs, + std::tuple InputAccs, + const std::tuple Identities, + std::index_sequence) { + if (UniformPow2WG || GID < NWorkItems) + std::tie(std::get(LocalAccs)[LID]...) = + std::make_tuple(std::get(InputAccs)[GID]...); + if (!UniformPow2WG) + std::tie(std::get(LocalAccs)[WGSize]...) = + std::make_tuple(std::get(Identities)...); +} + +template +void reduceReduLocalAccs(size_t IndexA, size_t IndexB, + std::tuple LocalAccs, + std::tuple BOPs, + std::index_sequence) { + std::tie(std::get(LocalAccs)[IndexA]...) = + std::make_tuple((std::get(BOPs)(std::get(LocalAccs)[IndexA], + std::get(LocalAccs)[IndexB]))...); +} + +template +void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize, + std::tuple *, + std::tuple OutAccs, + std::tuple LocalAccs, + std::tuple BOPs, + std::index_sequence, + std::index_sequence) { + // This statement is needed for read_write accessors/USM-memory only. + // It adds the initial value of the reduction variable to the final result. + std::tie(std::get(LocalAccs)[0]...) = + std::make_tuple(std::get(BOPs)( + std::get(LocalAccs)[0], + std::tuple_element_t>::getOutPointer( + std::get(OutAccs))[OutAccIndex])...); + + if (UniformPow2WG) { + std::tie(std::tuple_element_t>::getOutPointer( + std::get(OutAccs))[OutAccIndex]...) = + std::make_tuple(std::get(LocalAccs)[0]...); + } else { + std::tie(std::tuple_element_t>::getOutPointer( + std::get(OutAccs))[OutAccIndex]...) = + std::make_tuple(std::get(BOPs)(std::get(LocalAccs)[0], + std::get(LocalAccs)[WGSize])...); + } +} + +// Concatenate an empty sequence. +constexpr std::index_sequence<> concat_sequences(std::index_sequence<>) { + return {}; +} + +// Concatenate a sequence consisting of 1 element. +template +constexpr std::index_sequence concat_sequences(std::index_sequence) { + return {}; +} + +// Concatenate two potentially empty sequences. +template +constexpr std::index_sequence +concat_sequences(std::index_sequence, std::index_sequence) { + return {}; +} + +// Concatenate more than 2 sequences. +template +constexpr auto concat_sequences(std::index_sequence, + std::index_sequence, Rs...) { + return concat_sequences(std::index_sequence{}, Rs{}...); +} + +struct IsRWReductionPredicate { + template struct Func { + static constexpr bool value = + std::remove_pointer_t::accessor_mode == access::mode::read_write; + }; +}; + +struct IsNonUsmReductionPredicate { + template struct Func { + static constexpr bool value = !std::remove_pointer_t::is_usm; + }; +}; + +struct EmptyReductionPredicate { + template struct Func { static constexpr bool value = false; }; +}; + +template struct FilterElement { + using type = + std::conditional_t, std::index_sequence<>>; +}; + +/// For each index 'I' from the given indices pack 'Is' this function initially +/// creates a number of short index_sequences, where each of such short +/// index sequences is either empty (if the given Functor returns false for the +/// type T[I]) or 1 element 'I' (otherwise). After that this function +/// concatenates those short sequences into one and returns the result sequence. +template 0), int> Z = 0> +constexpr auto filterSequenceHelper(FunctorT, std::index_sequence) { + return concat_sequences( + typename FilterElement>>::value, + Is>::type{}...); +} +template Z = 0> +constexpr auto filterSequenceHelper(FunctorT, std::index_sequence) { + return std::index_sequence<>{}; +} + +/// For each index 'I' from the given indices pack 'Is' this function returns +/// an index sequence consisting of only those 'I's for which the 'FunctorT' +/// applied to 'T[I]' returns true. +template +constexpr auto filterSequence(FunctorT F, std::index_sequence Indices) { + return filterSequenceHelper(F, Indices); +} + +template +void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, + const nd_range &Range, + std::tuple &ReduTuple, + std::index_sequence ReduIndices) { + + size_t WGSize = Range.get_local_range().size(); + size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1); + auto LocalAccsTuple = + createReduLocalAccs(LocalAccSize, CGH, ReduIndices); + + size_t NWorkGroups = IsOneWG ? 1 : Range.get_group_range().size(); + auto OutAccsTuple = + createReduOutAccs(NWorkGroups, CGH, ReduTuple, ReduIndices); + auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices); + auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices); + + using Name = typename get_reduction_main_kernel_name_t< + KernelName, KernelType, UniformPow2WG, IsOneWG, + std::tuple>::name; + CGH.parallel_for(Range, [=](nd_item NDIt) { + auto ReduIndices = std::index_sequence_for(); + auto ReducersTuple = + createReducers(IdentitiesTuple, BOPsTuple, ReduIndices); + // The .MValue field of each of the elements in ReducersTuple + // gets initialized in this call. + callReduUserKernelFunc(KernelFunc, NDIt, ReducersTuple, ReduIndices); + + size_t WGSize = NDIt.get_local_range().size(); + size_t LID = NDIt.get_local_linear_id(); + initReduLocalAccs(LID, WGSize, LocalAccsTuple, ReducersTuple, + IdentitiesTuple, ReduIndices); + NDIt.barrier(); + + size_t PrevStep = WGSize; + for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) { + // LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple, + ReduIndices); + } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) { + // LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]); + reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple, + ReduIndices); + } + NDIt.barrier(); + PrevStep = CurStep; + } + + // Compute the partial sum/reduction for the work-group. + if (LID == 0) { + size_t GrID = NDIt.get_group_linear_id(); + // If there is only one work-group, then the original accessors need to be + // updated, i.e. after the work in each work-group is done, the work-group + // result is added to the original value of the read-write accessors or + // USM memory. + std::conditional_t + Predicate; + auto RWReduIndices = + filterSequence(Predicate, ReduIndices); + writeReduSumsToOutAccs( + GrID, WGSize, (std::tuple *)nullptr, OutAccsTuple, + LocalAccsTuple, BOPsTuple, ReduIndices, RWReduIndices); + } + }); +} + +template +void reduCGFunc(handler &CGH, KernelType KernelFunc, + const nd_range &Range, + std::tuple &ReduTuple, + std::index_sequence ReduIndices) { + size_t NWorkItems = Range.get_global_range().size(); + size_t WGSize = Range.get_local_range().size(); + size_t NWorkGroups = Range.get_group_range().size(); + + bool Pow2WG = (WGSize & (WGSize - 1)) == 0; + bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems); + if (NWorkGroups == 1) { + if (HasUniformWG) + reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, + ReduIndices); + else + reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, + ReduIndices); + } else { + if (HasUniformWG) + reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, + ReduIndices); + else + reduCGFuncImpl(CGH, KernelFunc, Range, + ReduTuple, ReduIndices); + } +} + +// The list of reductions may be empty; for such cases there is nothing to do. +// This function is intentionally made template to eliminate the need in holding +// it in sycl library, what would be less efficient and also would create the +// need in keeping it for long due support backward ABI compatibility. +template +std::enable_if_t::value> +associateReduAccsWithHandlerHelper(HandlerT &) {} + +template +void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu) { + Redu.associateWithHandler(CGH); +} + +template 0), int> Z = 0> +void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu, + RestT &...Rest) { + Redu.associateWithHandler(CGH); + associateReduAccsWithHandlerHelper(CGH, Rest...); +} + +template +void associateReduAccsWithHandler(handler &CGH, + std::tuple &ReduTuple, + std::index_sequence) { + associateReduAccsWithHandlerHelper(CGH, std::get(ReduTuple)...); +} + +template +void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, + size_t WGSize, std::tuple &ReduTuple, + std::index_sequence ReduIndices) { + // The last kernel DOES write to user's accessor passed to reduction. + // Associate it with handler manually. + std::conditional_t + Predicate; + auto AccReduIndices = filterSequence(Predicate, ReduIndices); + associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices); + + size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1); + auto LocalAccsTuple = + createReduLocalAccs(LocalAccSize, CGH, ReduIndices); + auto InAccsTuple = + getReadAccsToPreviousPartialReds(CGH, ReduTuple, ReduIndices); + auto OutAccsTuple = + createReduOutAccs(NWorkGroups, CGH, ReduTuple, ReduIndices); + auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices); + auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices); + + using Name = + typename get_reduction_aux_kernel_name_t>::name; + range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize}; + nd_range<1> Range{GlobalRange, range<1>(WGSize)}; + CGH.parallel_for(Range, [=](nd_item<1> NDIt) { + auto ReduIndices = std::index_sequence_for(); + size_t WGSize = NDIt.get_local_range().size(); + size_t LID = NDIt.get_local_linear_id(); + size_t GID = NDIt.get_global_linear_id(); + initReduLocalAccs(LID, GID, NWorkItems, WGSize, + LocalAccsTuple, InAccsTuple, + IdentitiesTuple, ReduIndices); + NDIt.barrier(); + + size_t PrevStep = WGSize; + for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) { + // LocalAcc[LID] = BOp(LocalAcc[LID], LocalAcc[LID + CurStep]); + reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple, + ReduIndices); + } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) { + // LocalAcc[WGSize] = BOp(LocalAcc[WGSize], LocalAcc[PrevStep - 1]); + reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple, + ReduIndices); + } + NDIt.barrier(); + PrevStep = CurStep; + } + + // Compute the partial sum/reduction for the work-group. + if (LID == 0) { + size_t GrID = NDIt.get_group_linear_id(); + // If there is only one work-group, then the original accessors need to be + // updated, i.e. after the work in each work-group is done, the work-group + // result is added to the original value of the read-write accessors or + // USM memory. + std::conditional_t + Predicate; + auto RWReduIndices = + filterSequence(Predicate, ReduIndices); + writeReduSumsToOutAccs( + GrID, WGSize, (std::tuple *)nullptr, OutAccsTuple, + LocalAccsTuple, BOPsTuple, ReduIndices, RWReduIndices); + } + }); +} + +template +size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, + std::tuple &ReduTuple, + std::index_sequence ReduIndices) { + size_t NWorkGroups; + size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups); + + bool Pow2WG = (WGSize & (WGSize - 1)) == 0; + bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems); + if (NWorkGroups == 1) { + if (HasUniformWG) + reduAuxCGFuncImpl( + CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices); + else + reduAuxCGFuncImpl( + CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices); + } else { + if (HasUniformWG) + reduAuxCGFuncImpl( + CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices); + else + reduAuxCGFuncImpl( + CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices); + } + return NWorkGroups; +} + +template size_t reduGetMemPerWorkItemHelper(Reduction &) { + return sizeof(typename Reduction::result_type); +} + +template +size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) { + return sizeof(typename Reduction::result_type) + + reduGetMemPerWorkItemHelper(Rest...); +} + +template +size_t reduGetMemPerWorkItem(std::tuple &ReduTuple, + std::index_sequence) { + return reduGetMemPerWorkItemHelper(std::get(ReduTuple)...); +} + +/// Utility function: for the given tuple \param Tuple the function returns +/// a new tuple consisting of only elements indexed by the index sequence. +template +std::tuple...> +tuple_select_elements(TupleT Tuple, std::index_sequence) { + return {std::get(std::move(Tuple))...}; +} + } // namespace detail /// Creates and returns an object implementing the reduction functionality. diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 24d8ac49ad9c..f33ef1bfe0de 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -30,6 +30,7 @@ #include #include #include +#include #include // SYCL_LANGUAGE_VERSION is 4 digit year followed by 2 digit revision @@ -244,9 +245,33 @@ enable_if_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, Reduction &Redu); +template +void reduCGFunc(handler &CGH, KernelType KernelFunc, + const nd_range &Range, + std::tuple &ReduTuple, + std::index_sequence); + +template +size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, + std::tuple &ReduTuple, + std::index_sequence); + __SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class Queue, size_t LocalMemBytesPerWorkItem); +template +size_t reduGetMemPerWorkItem(std::tuple &ReduTuple, + std::index_sequence); + +template +std::tuple...> +tuple_select_elements(TupleT Tuple, std::index_sequence); + +template +struct are_all_but_last_reductions; + } // namespace detail } // namespace ONEAPI @@ -1184,7 +1209,6 @@ class __SYCL_EXPORT handler { /// globally visible, there is no need for the developer to provide /// a kernel name for it. /// - /// TODO: Need to handle more than 1 reduction in parallel_for(). /// TODO: Support HOST. The kernels called by this parallel_for() may use /// some functionality that is not yet supported on HOST such as: /// barrier(), and ONEAPI::reduce() that also may be used in more @@ -1257,6 +1281,55 @@ class __SYCL_EXPORT handler { } // end while (NWorkItems > 1) } + // This version of parallel_for may handle one or more reductions packed in + // \p Rest argument. Note thought that the last element in \p Rest pack is + // the kernel function. + // TODO: this variant is currently enabled for 2+ reductions only as the + // versions handling 1 reduction variable are more efficient right now. + template + std::enable_if_t< + (sizeof...(RestT) >= 3 && + ONEAPI::detail::are_all_but_last_reductions::value)> + parallel_for(nd_range Range, RestT... Rest) { + std::tuple ArgsTuple(Rest...); + constexpr size_t NumArgs = sizeof...(RestT); + auto KernelFunc = std::get(ArgsTuple); + auto ReduIndices = std::make_index_sequence(); + auto ReduTuple = + ONEAPI::detail::tuple_select_elements(ArgsTuple, ReduIndices); + + size_t LocalMemPerWorkItem = + ONEAPI::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices); + // TODO: currently the maximal work group size is determined for the given + // queue/device, while it is safer to use queries to the kernel compiled + // for the device. + size_t MaxWGSize = + ONEAPI::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem); + if (Range.get_local_range().size() > MaxWGSize) + throw sycl::runtime_error("The implementation handling parallel_for with" + " reduction requires work group size not bigger" + " than " + + std::to_string(MaxWGSize), + PI_INVALID_WORK_GROUP_SIZE); + + ONEAPI::detail::reduCGFunc(*this, KernelFunc, Range, ReduTuple, + ReduIndices); + shared_ptr_class QueueCopy = MQueue; + this->finalize(); + + size_t NWorkItems = Range.get_group_range().size(); + while (NWorkItems > 1) { + handler AuxHandler(QueueCopy, MIsHost); + AuxHandler.saveCodeLoc(MCodeLoc); + + NWorkItems = + ONEAPI::detail::reduAuxCGFunc( + AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices); + MLastEvent = AuxHandler.finalize(); + } // end while (NWorkItems > 1) + } + /// Hierarchical kernel invocation method of a kernel defined as a lambda /// encoding the body of each work-group to launch. /// From b2ba2a4bbd520cad21313078ebe72a8a6862c243 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Mon, 1 Feb 2021 11:28:14 -0800 Subject: [PATCH 2/6] Remove obsolete/debugging code. clang-format. Change LIT test requiring c++11 (changed to c++14) Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/ONEAPI/reduction.hpp | 13 ++----------- ...ycl-include-gnu11.cpp => sycl-include-gnu14.cpp} | 2 +- 2 files changed, 3 insertions(+), 12 deletions(-) rename sycl/test/regression/{sycl-include-gnu11.cpp => sycl-include-gnu14.cpp} (74%) diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index 83b190f95e6e..0480a9a01987 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -74,9 +74,6 @@ using IsReduBitAND = detail::bool_constant< template using IsReduOptForFastAtomicFetch = -#ifdef SYCL_REDUCTION_NO_FAST_OPTS - detail::bool_constant; -#else detail::bool_constant<(is_geninteger32bit::value || is_geninteger64bit::value) && (IsReduPlus::value || @@ -85,20 +82,15 @@ using IsReduOptForFastAtomicFetch = IsReduBitOR::value || IsReduBitXOR::value || IsReduBitAND::value)>; -#endif template using IsReduOptForFastReduce = detail::bool_constant< -#ifdef SYCL_REDUCTION_NO_FAST_OPTS - false>; -#else (is_geninteger32bit::value || is_geninteger64bit::value || std::is_same::value || std::is_same::value || std::is_same::value) && (IsReduPlus::value || IsReduMinimum::value || IsReduMaximum::value)>; -#endif // Identity = 0 template @@ -389,8 +381,7 @@ class reduction_impl : private reduction_impl_base { static constexpr int accessor_dim = Dims; static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims; using local_accessor_type = - accessor; + accessor; static constexpr bool has_fast_atomics = IsReduOptForFastAtomicFetch::value; @@ -1374,7 +1365,7 @@ void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu) { template 0), int> Z = 0> void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu, - RestT &...Rest) { + RestT &... Rest) { Redu.associateWithHandler(CGH); associateReduAccsWithHandlerHelper(CGH, Rest...); } diff --git a/sycl/test/regression/sycl-include-gnu11.cpp b/sycl/test/regression/sycl-include-gnu14.cpp similarity index 74% rename from sycl/test/regression/sycl-include-gnu11.cpp rename to sycl/test/regression/sycl-include-gnu14.cpp index c265b0ff2524..0147acf143de 100644 --- a/sycl/test/regression/sycl-include-gnu11.cpp +++ b/sycl/test/regression/sycl-include-gnu14.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -std=gnu++11 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -std=gnu++14 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %RUN_ON_HOST %t.out // UNSUPPORTED: system-windows From f0d0716cc063bfcdf3ac9a90fd052ce539699a9c Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Mon, 1 Feb 2021 12:28:07 -0800 Subject: [PATCH 3/6] [NFC] Fix build warnings; change 'sycl' with 'SYCL' in comments Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/ONEAPI/reduction.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index 0480a9a01987..c87066bcab54 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -532,14 +532,14 @@ class reduction_impl : private reduction_impl_base { /// Returns user's USM pointer passed to reduction for editing. template std::enable_if_t - getWriteMemForPartialReds(size_t, handler &CGH) { + getWriteMemForPartialReds(size_t, handler &) { return getUSMPointer(); } /// Returns user's accessor passed to reduction for editing. template std::enable_if_t - getWriteMemForPartialReds(size_t, handler &CGH) { + getWriteMemForPartialReds(size_t, handler &) { return *MAcc; } @@ -658,7 +658,7 @@ struct get_reduction_aux_kernel_name_t { /// user's lambda function KernelFunc and also does one iteration of reduction /// of elements computed in user's lambda function. /// This version uses ONEAPI::reduce() algorithm to reduce elements in each -/// of work-groups, then it calls fast sycl atomic operations to update +/// of work-groups, then it calls fast SYCL atomic operations to update /// user's reduction variable. /// /// Briefly: calls user's lambda, ONEAPI::reduce() + atomic, INT + ADD/MIN/MAX. @@ -685,7 +685,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, /// user's lambda function KernelFunc and also does one iteration of reduction /// of elements computed in user's lambda function. /// This version uses tree-reduction algorithm to reduce elements in each -/// of work-groups, then it calls fast sycl atomic operations to update +/// of work-groups, then it calls fast SYCL atomic operations to update /// user's reduction variable. /// /// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR. @@ -1351,7 +1351,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, // The list of reductions may be empty; for such cases there is nothing to do. // This function is intentionally made template to eliminate the need in holding -// it in sycl library, what would be less efficient and also would create the +// it in SYCL library, what would be less efficient and also would create the // need in keeping it for long due support backward ABI compatibility. template std::enable_if_t::value> From 205c1245688d7c97017702cc4fbc2718df81bef2 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Mon, 1 Feb 2021 20:18:35 -0800 Subject: [PATCH 4/6] Fix the problem with lambda functions passed to parallel_for Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/ONEAPI/reduction.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index c87066bcab54..ddb933d53edc 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -1269,9 +1269,10 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices); auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices); - using Name = typename get_reduction_main_kernel_name_t< - KernelName, KernelType, UniformPow2WG, IsOneWG, - std::tuple>::name; + using Name = + typename get_reduction_main_kernel_name_t::name; CGH.parallel_for(Range, [=](nd_item NDIt) { auto ReduIndices = std::index_sequence_for(); auto ReducersTuple = @@ -1403,7 +1404,7 @@ void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, using Name = typename get_reduction_aux_kernel_name_t>::name; + decltype(OutAccsTuple)>::name; range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize}; nd_range<1> Range{GlobalRange, range<1>(WGSize)}; CGH.parallel_for(Range, [=](nd_item<1> NDIt) { From efcd7e1d86b7945e76ea237d65002286d644a2f1 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Wed, 3 Feb 2021 16:46:12 -0800 Subject: [PATCH 5/6] Fix initialization of local accessors (now both opencl:cpu and opencl:gpu) pass testing Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/ONEAPI/reduction.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index 70e4277e7d72..b01e471c01e7 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -1178,6 +1178,9 @@ void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, if (UniformPow2WG || GID < NWorkItems) std::tie(std::get(LocalAccs)[LID]...) = std::make_tuple(std::get(InputAccs)[GID]...); + else + std::tie(std::get(LocalAccs)[LID]...) = + std::make_tuple(std::get(Identities)...); if (!UniformPow2WG) std::tie(std::get(LocalAccs)[WGSize]...) = std::make_tuple(std::get(Identities)...); From 94e1be0649f968184de167880387b0dde63782c5 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Thu, 4 Feb 2021 10:27:41 -0800 Subject: [PATCH 6/6] Address reviewer's comments. Fixes are mostly NFC and do not change code-gen at this moment. Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/ONEAPI/reduction.hpp | 77 ++++++++++++++--------- sycl/include/CL/sycl/handler.hpp | 8 +-- 2 files changed, 49 insertions(+), 36 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index b01e471c01e7..9c3eb5a1b55c 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -392,18 +392,17 @@ class reducer -struct are_all_but_last_reductions { +/// Predicate returning true if all template type parameters except the last one +/// are reductions. +template struct AreAllButLastReductions { static constexpr bool value = std::is_base_of::value && - are_all_but_last_reductions::value; + AreAllButLastReductions::value; }; -/// Helper specialization of are_all_but_last_reductions for one element only. -/// Returns true if the last and only typename is not a reduction. -template struct are_all_but_last_reductions { +/// Helper specialization of AreAllButLastReductions for one element only. +/// Returns true if the template parameter is not a reduction. +template struct AreAllButLastReductions { static constexpr bool value = !std::is_base_of::value; }; @@ -1097,9 +1096,11 @@ reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, /// the reductions for which a local accessors are needed, this function creates /// those local accessors and returns a tuple consisting of them. template -std::tuple -createReduLocalAccs(size_t Size, handler &CGH, std::index_sequence) { - return {Reductions::getReadWriteLocalAcc(Size, CGH)...}; +auto createReduLocalAccs(size_t Size, handler &CGH, + std::index_sequence) { + return std::make_tuple( + std::tuple_element_t>::getReadWriteLocalAcc( + Size, CGH)...); } /// For the given 'Reductions' types pack and indices enumerating them this @@ -1154,7 +1155,7 @@ void callReduUserKernelFunc(KernelType KernelFunc, nd_item NDIt, KernelFunc(NDIt, std::get(Reducers)...); } -template void initReduLocalAccs(size_t LID, size_t WGSize, std::tuple LocalAccs, @@ -1163,7 +1164,11 @@ void initReduLocalAccs(size_t LID, size_t WGSize, std::index_sequence) { std::tie(std::get(LocalAccs)[LID]...) = std::make_tuple(std::get(Reducers).MValue...); - if (!UniformPow2WG) + + // For work-groups, which size is not power of two, local accessors have + // an additional element with index WGSize that is used by the tree-reduction + // algorithm. Initialize those additional elements with identity values here. + if (!Pow2WG) std::tie(std::get(LocalAccs)[WGSize]...) = std::make_tuple(std::get(Identities)...); } @@ -1175,12 +1180,22 @@ void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, std::tuple InputAccs, const std::tuple Identities, std::index_sequence) { + // Normally, the local accessors are initialized with elements from the input + // accessors. The exception is the case when (GID >= NWorkItems), which + // possible only when UniformPow2WG is false. For that case the elements of + // local accessors are initialized with identity value, so they would not + // give any impact into the final partial sums during the tree-reduction + // algorithm work. if (UniformPow2WG || GID < NWorkItems) std::tie(std::get(LocalAccs)[LID]...) = std::make_tuple(std::get(InputAccs)[GID]...); else std::tie(std::get(LocalAccs)[LID]...) = std::make_tuple(std::get(Identities)...); + + // For work-groups, which size is not power of two, local accessors have + // an additional element with index WGSize that is used by the tree-reduction + // algorithm. Initialize those additional elements with identity values here. if (!UniformPow2WG) std::tie(std::get(LocalAccs)[WGSize]...) = std::make_tuple(std::get(Identities)...); @@ -1196,7 +1211,7 @@ void reduceReduLocalAccs(size_t IndexA, size_t IndexB, std::get(LocalAccs)[IndexB]))...); } -template void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize, @@ -1214,11 +1229,16 @@ void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize, std::tuple_element_t>::getOutPointer( std::get(OutAccs))[OutAccIndex])...); - if (UniformPow2WG) { + if (Pow2WG) { + // The partial sums for the work-group are stored in 0-th elements of local + // accessors. Simply write those sums to output accessors. std::tie(std::tuple_element_t>::getOutPointer( std::get(OutAccs))[OutAccIndex]...) = std::make_tuple(std::get(LocalAccs)[0]...); } else { + // Each of local accessors keeps two partial sums: in 0-th and WGsize-th + // elements. Combine them into final partial sums and write to output + // accessors. std::tie(std::tuple_element_t>::getOutPointer( std::get(OutAccs))[OutAccIndex]...) = std::make_tuple(std::get(BOPs)(std::get(LocalAccs)[0], @@ -1300,15 +1320,15 @@ constexpr auto filterSequence(FunctorT F, std::index_sequence Indices) { return filterSequenceHelper(F, Indices); } -template +template void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, std::tuple &ReduTuple, std::index_sequence ReduIndices) { size_t WGSize = Range.get_local_range().size(); - size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1); + size_t LocalAccSize = WGSize + (Pow2WG ? 0 : 1); auto LocalAccsTuple = createReduLocalAccs(LocalAccSize, CGH, ReduIndices); @@ -1318,10 +1338,8 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices); auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices); - using Name = - typename get_reduction_main_kernel_name_t::name; + using Name = typename get_reduction_main_kernel_name_t< + KernelName, KernelType, Pow2WG, IsOneWG, decltype(OutAccsTuple)>::name; CGH.parallel_for(Range, [=](nd_item NDIt) { auto ReduIndices = std::index_sequence_for(); auto ReducersTuple = @@ -1332,8 +1350,8 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, size_t WGSize = NDIt.get_local_range().size(); size_t LID = NDIt.get_local_linear_id(); - initReduLocalAccs(LID, WGSize, LocalAccsTuple, ReducersTuple, - IdentitiesTuple, ReduIndices); + initReduLocalAccs(LID, WGSize, LocalAccsTuple, ReducersTuple, + IdentitiesTuple, ReduIndices); NDIt.barrier(); size_t PrevStep = WGSize; @@ -1342,7 +1360,7 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, // LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple, ReduIndices); - } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) { + } else if (!Pow2WG && LID == CurStep && (PrevStep & 0x1)) { // LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]); reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple, ReduIndices); @@ -1363,7 +1381,7 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc, Predicate; auto RWReduIndices = filterSequence(Predicate, ReduIndices); - writeReduSumsToOutAccs( + writeReduSumsToOutAccs( GrID, WGSize, (std::tuple *)nullptr, OutAccsTuple, LocalAccsTuple, BOPsTuple, ReduIndices, RWReduIndices); } @@ -1376,21 +1394,18 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, std::tuple &ReduTuple, std::index_sequence ReduIndices) { - size_t NWorkItems = Range.get_global_range().size(); size_t WGSize = Range.get_local_range().size(); size_t NWorkGroups = Range.get_group_range().size(); - bool Pow2WG = (WGSize & (WGSize - 1)) == 0; - bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems); if (NWorkGroups == 1) { - if (HasUniformWG) + if (Pow2WG) reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, ReduIndices); else reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, ReduIndices); } else { - if (HasUniformWG) + if (Pow2WG) reduCGFuncImpl(CGH, KernelFunc, Range, ReduTuple, ReduIndices); else diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index f33ef1bfe0de..5fbd1d99be69 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -269,8 +269,7 @@ template std::tuple...> tuple_select_elements(TupleT Tuple, std::index_sequence); -template -struct are_all_but_last_reductions; +template struct AreAllButLastReductions; } // namespace detail } // namespace ONEAPI @@ -1288,9 +1287,8 @@ class __SYCL_EXPORT handler { // versions handling 1 reduction variable are more efficient right now. template - std::enable_if_t< - (sizeof...(RestT) >= 3 && - ONEAPI::detail::are_all_but_last_reductions::value)> + std::enable_if_t<(sizeof...(RestT) >= 3 && + ONEAPI::detail::AreAllButLastReductions::value)> parallel_for(nd_range Range, RestT... Rest) { std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT);