From 82dbc8ac1ba1204702de56e7b98f4c1092aee182 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Tue, 8 Nov 2022 12:39:25 -0800 Subject: [PATCH 1/3] [NFCI][SYCL] Refactor reductions implementations The purpose of this change is 1) Make it clear that sycl::range version always delegates to some sycl::nd_range implementation. 2) Flatten all existing implementations. That required splitting reduction_parallel_for_basic_impl depending on Reduction::has_fast_reduce. However, I also inlined its helper routines (that now had unambigious branches for "if constexpr (Reduction::*)") so the ratio between duplicate/unique code isn't bad at all. 3) Make a unique dispatching entry and have all the implementations provide the same interface. I plan to use it in unit-tests to bypass the dispatch and test all the implementation directly (when applicable based on nd_range/BinOp/HW/etc.). I'd say 90% of the change is straightforward code movement with the exceptions of - Inlining described above - Simplifying __sycl_reduction_kernel helper enabled by this change - Factoring out forward declarations in handler.hpp into a separate reduction_fwd.hpp - Rewriting dispatcher routine - caused by changes in the interfaces, not logic updates. --- sycl/include/sycl/handler.hpp | 35 +- sycl/include/sycl/reduction.hpp | 1516 ++++++++--------- sycl/include/sycl/reduction_fwd.hpp | 67 + .../properties_kernel_device_has.cpp | 24 +- .../properties_kernel_sub_group_size.cpp | 72 +- .../properties_kernel_work_group_size.cpp | 72 +- ...properties_kernel_work_group_size_hint.cpp | 72 +- 7 files changed, 942 insertions(+), 916 deletions(-) create mode 100644 sycl/include/sycl/reduction_fwd.hpp diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 971c91de9c5a8..c03db5fd23e9a 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -268,43 +269,9 @@ class RoundedRangeKernelWithKH { KernelType KernelFunc; }; -template -class reduction_impl_algo; - using sycl::detail::enable_if_t; using sycl::detail::queue_impl; -// Reductions implementation need access to private members of handler. Those -// are limited to those below. -namespace reduction { -inline void finalizeHandler(handler &CGH); -template void withAuxHandler(handler &CGH, FunctorTy Func); -} // namespace reduction - -template -void reduction_parallel_for(handler &CGH, - std::shared_ptr Queue, - range Range, PropertiesT Properties, - Reduction Redu, KernelType KernelFunc); - -template -void reduction_parallel_for(handler &CGH, - std::shared_ptr Queue, - nd_range Range, PropertiesT Properties, - Reduction Redu, KernelType KernelFunc); - -template -void reduction_parallel_for(handler &CGH, - std::shared_ptr Queue, - nd_range Range, PropertiesT Properties, - RestT... Rest); - -template struct IsReduction; -template struct AreAllButLastReductions; } // namespace detail /// Command group handler class. diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 6eeb263aa0ca1..14bd060d6e54b 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -317,7 +318,7 @@ template class combiner { ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_max(Val); }); } }; -} +} // namespace detail /// Specialization of the generic class 'reducer'. It is used for reductions /// of those types and operations for which the identity value is not known. @@ -571,8 +572,7 @@ class reduction_impl_algo : public reduction_impl_common { } } - template - auto &getTempBuffer(size_t Size, handler &CGH) { + template auto &getTempBuffer(size_t Size, handler &CGH) { auto Buffer = std::make_shared>(range<1>(Size)); CGH.addReduction(Buffer); return *Buffer; @@ -830,711 +830,785 @@ reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) { }); } +namespace reduction { +template struct MainKrn; +template struct AuxKrn; +} // namespace reduction + /// A helper to pass undefined (sycl::detail::auto_name) names unmodified. We /// must do that to avoid name collisions. -template