From d4144b4d38412c8e352659b39b4c567c78377a42 Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Tue, 27 Oct 2020 19:57:45 -0700
Subject: [PATCH 1/6] [SYCL] Support parallel_for() accepting many reductions

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp | 543 +++++++++++++++++++++-
 sycl/include/CL/sycl/handler.hpp          |  75 ++-
 2 files changed, 599 insertions(+), 19 deletions(-)
diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index f15a811c2187..83b190f95e6e 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -14,6 +14,8 @@
 #include <CL/sycl/handler.hpp>
 #include <CL/sycl/kernel.hpp>
 
+#include <tuple>
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace ONEAPI {
@@ -72,6 +74,9 @@ using IsReduBitAND = detail::bool_constant<
 
 template <typename T, class BinaryOperation>
 using IsReduOptForFastAtomicFetch =
+#ifdef SYCL_REDUCTION_NO_FAST_OPTS
+    detail::bool_constant<false>;
+#else
     detail::bool_constant<(is_geninteger32bit<T>::value ||
                            is_geninteger64bit<T>::value) &&
                           (IsReduPlus<T, BinaryOperation>::value ||
@@ -80,15 +85,20 @@ using IsReduOptForFastAtomicFetch =
                            IsReduBitOR<T, BinaryOperation>::value ||
                            IsReduBitXOR<T, BinaryOperation>::value ||
                            IsReduBitAND<T, BinaryOperation>::value)>;
+#endif
 
 template <typename T, class BinaryOperation>
 using IsReduOptForFastReduce = detail::bool_constant<
+#ifdef SYCL_REDUCTION_NO_FAST_OPTS
+    false>;
+#else
     (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
      std::is_same<T, half>::value || std::is_same<T, float>::value ||
      std::is_same<T, double>::value) &&
     (IsReduPlus<T, BinaryOperation>::value ||
      IsReduMinimum<T, BinaryOperation>::value ||
      IsReduMaximum<T, BinaryOperation>::value)>;
+#endif
 
 // Identity = 0
 template <typename T, class BinaryOperation>
@@ -340,12 +350,31 @@ class reducer<T, BinaryOperation,
   T MValue;
 };
 
+/// Base non-template class which is a base class for all reduction
+/// implementation classes. It is needed to detect the reduction classes.
+class reduction_impl_base {};
+
+/// Predicate returning true if and only if 'FirstT' is a reduction class and
+/// all types except the last one from 'RestT' are reductions as well.
+template <typename FirstT, typename... RestT>
+struct are_all_but_last_reductions {
+  static constexpr bool value =
+      std::is_base_of<reduction_impl_base, FirstT>::value &&
+      are_all_but_last_reductions<RestT...>::value;
+};
+
+/// Helper specialization of are_all_but_last_reductions for one element only.
+/// Returns true if the last and only typename is not a reduction.
+template <typename T> struct are_all_but_last_reductions<T> {
+  static constexpr bool value = !std::is_base_of<reduction_impl_base, T>::value;
+};
+
 /// This class encapsulates the reduction variable/accessor,
 /// the reduction operator and an optional operator identity.
 template <typename T, class BinaryOperation, int Dims, bool IsUSM,
           access::mode AccMode = access::mode::read_write,
           access::placeholder IsPlaceholder = access::placeholder::false_t>
-class reduction_impl {
+class reduction_impl : private reduction_impl_base {
 public:
   using reducer_type = reducer<T, BinaryOperation>;
   using result_type = T;
@@ -359,6 +388,10 @@ class reduction_impl {
   static constexpr access::mode accessor_mode = AccMode;
   static constexpr int accessor_dim = Dims;
   static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims;
+  using local_accessor_type =
+      accessor<T, buffer_dim, access::mode::discard_read_write,
+               access::target::local>;
+
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
   static constexpr bool has_fast_reduce =
@@ -495,11 +528,8 @@ class reduction_impl {
 #endif
   }
 
-  accessor<T, buffer_dim, access::mode::discard_read_write,
-           access::target::local>
-  getReadWriteLocalAcc(size_t Size, handler &CGH) {
-    return accessor<T, buffer_dim, access::mode::discard_read_write,
-                    access::target::local>(Size, CGH);
+  static local_accessor_type getReadWriteLocalAcc(size_t Size, handler &CGH) {
+    return local_accessor_type(Size, CGH);
   }
 
   accessor<T, buffer_dim, access::mode::read>
@@ -508,6 +538,30 @@ class reduction_impl {
     return accessor<T, buffer_dim, access::mode::read>(*MOutBufPtr, CGH);
   }
 
+  /// Returns user's USM pointer passed to reduction for editing.
+  template <bool IsOneWG, bool _IsUSM = is_usm>
+  std::enable_if_t<IsOneWG && _IsUSM, result_type *>
+  getWriteMemForPartialReds(size_t, handler &CGH) {
+    return getUSMPointer();
+  }
+
+  /// Returns user's accessor passed to reduction for editing.
+  template <bool IsOneWG, bool _IsUSM = is_usm>
+  std::enable_if_t<IsOneWG && !_IsUSM, accessor_type>
+  getWriteMemForPartialReds(size_t, handler &CGH) {
+    return *MAcc;
+  }
+
+  /// Constructs a new temporary buffer to hold partial sums and returns
+  /// the accessor that that buffer.
+  template <bool IsOneWG>
+  std::enable_if_t<!IsOneWG, accessor_type>
+  getWriteMemForPartialReds(size_t Size, handler &CGH) {
+    MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
+    CGH.addReduction(MOutBufPtr);
+    return accessor_type(*MOutBufPtr, CGH);
+  }
+
   template <access::placeholder _IsPlaceholder = IsPlaceholder>
   enable_if_t<_IsPlaceholder == access::placeholder::false_t, accessor_type>
   getWriteAccForPartialReds(size_t Size, handler &CGH) {
@@ -596,19 +650,17 @@ template <typename T1, bool B1, bool B2, typename T2>
 class __sycl_reduction_aux_kernel;
 
 /// Helper structs to get additional kernel name types based on given
-/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then
-/// \c Type becomes the \c Name.
-template <typename Name, typename Type, bool B1, bool B2, typename OutputT>
+/// \c Name and additional template parameters helping to distinguish kernels.
+/// If \c Name is undefined (is \c auto_name) then \c Type becomes the \c Name.
+template <typename Name, typename Type, bool B1, bool B2, typename T3>
 struct get_reduction_main_kernel_name_t {
   using name = __sycl_reduction_main_kernel<
-      typename sycl::detail::get_kernel_name_t<Name, Type>::name, B1, B2,
-      OutputT>;
+      typename sycl::detail::get_kernel_name_t<Name, Type>::name, B1, B2, T3>;
 };
-template <typename Name, typename Type, bool B1, bool B2, typename OutputT>
+template <typename Name, typename Type, bool B1, bool B2, typename T3>
 struct get_reduction_aux_kernel_name_t {
   using name = __sycl_reduction_aux_kernel<
-      typename sycl::detail::get_kernel_name_t<Name, Type>::name, B1, B2,
-      OutputT>;
+      typename sycl::detail::get_kernel_name_t<Name, Type>::name, B1, B2, T3>;
 };
 
 /// Implements a command group function that enqueues a kernel that calls
@@ -650,7 +702,7 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction,
           bool IsPow2WG, typename OutputT>
 enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
-               Reduction &Redu, OutputT Out) {
+               Reduction &, OutputT Out) {
   size_t WGSize = Range.get_local_range().size();
 
   // Use local memory to reduce elements in work-groups into zero-th element.
@@ -658,7 +710,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
   // The additional last element is used to catch reduce elements that could
   // otherwise be lost in the tree-reduction algorithm used in the kernel.
   size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1);
-  auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH);
+  auto LocalReds = Reduction::getReadWriteLocalAcc(NLocalElements, CGH);
 
   using Name = typename get_reduction_main_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
@@ -784,7 +836,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
   // The additional last element is used to catch elements that could
   // otherwise be lost in the tree-reduction algorithm.
   size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1);
-  auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
+  auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH);
   typename Reduction::result_type ReduIdentity = Redu.getIdentity();
   using Name = typename get_reduction_main_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
@@ -913,7 +965,7 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
   // The additional last element is used to catch elements that could
   // otherwise be lost in the tree-reduction algorithm.
   size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
-  auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
+  auto LocalReds = Reduction::getReadWriteLocalAcc(NumLocalElements, CGH);
 
   auto ReduIdentity = Redu.getIdentity();
   auto BOp = Redu.getBinaryOperation();
@@ -1004,6 +1056,461 @@ reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
   return NWorkGroups;
 }
 
+/// For the given 'Reductions' types pack and indices enumerating only
+/// the reductions for which a local accessors are needed, this function creates
+/// those local accessors and returns a tuple consisting of them.
+template <typename... Reductions, size_t... Is>
+std::tuple<typename Reductions::local_accessor_type...>
+createReduLocalAccs(size_t Size, handler &CGH, std::index_sequence<Is...>) {
+  return {Reductions::getReadWriteLocalAcc(Size, CGH)...};
+}
+
+/// For the given 'Reductions' types pack and indices enumerating them this
+/// function either creates new temporary accessors for partial sums (if IsOneWG
+/// is false) or returns user's accessor/USM-pointer if (IsOneWG is true).
+template <bool IsOneWG, typename... Reductions, size_t... Is>
+auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
+                       std::tuple<Reductions...> &ReduTuple,
+                       std::index_sequence<Is...>) {
+  return std::make_tuple(
+      std::get<Is>(ReduTuple).template getWriteMemForPartialReds<IsOneWG>(
+          NWorkGroups, CGH)...);
+}
+
+/// For the given 'Reductions' types pack and indices enumerating them this
+/// function returns accessors to buffers holding partial sums generated in the
+/// previous kernel invocation.
+template <typename... Reductions, size_t... Is>
+auto getReadAccsToPreviousPartialReds(handler &CGH,
+                                      std::tuple<Reductions...> &ReduTuple,
+                                      std::index_sequence<Is...>) {
+  return std::make_tuple(
+      std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
+}
+
+template <typename... Reductions, size_t... Is>
+std::tuple<typename Reductions::result_type...>
+getReduIdentities(std::tuple<Reductions...> &ReduTuple,
+                  std::index_sequence<Is...>) {
+  return {std::get<Is>(ReduTuple).getIdentity()...};
+}
+
+template <typename... Reductions, size_t... Is>
+std::tuple<typename Reductions::binary_operation...>
+getReduBOPs(std::tuple<Reductions...> &ReduTuple, std::index_sequence<Is...>) {
+  return {std::get<Is>(ReduTuple).getBinaryOperation()...};
+}
+
+template <typename... Reductions, size_t... Is>
+std::tuple<typename Reductions::reducer_type...>
+createReducers(std::tuple<typename Reductions::result_type...> Identities,
+               std::tuple<typename Reductions::binary_operation...> BOPsTuple,
+               std::index_sequence<Is...>) {
+  return {typename Reductions::reducer_type{std::get<Is>(Identities),
+                                            std::get<Is>(BOPsTuple)}...};
+}
+
+template <typename KernelType, int Dims, typename... ReducerT, size_t... Is>
+void callReduUserKernelFunc(KernelType KernelFunc, nd_item<Dims> NDIt,
+                            std::tuple<ReducerT...> &Reducers,
+                            std::index_sequence<Is...>) {
+  KernelFunc(NDIt, std::get<Is>(Reducers)...);
+}
+
+template <bool UniformPow2WG, typename... LocalAccT, typename... ReducerT,
+          typename... ResultT, size_t... Is>
+void initReduLocalAccs(size_t LID, size_t WGSize,
+                       std::tuple<LocalAccT...> LocalAccs,
+                       const std::tuple<ReducerT...> &Reducers,
+                       const std::tuple<ResultT...> Identities,
+                       std::index_sequence<Is...>) {
+  std::tie(std::get<Is>(LocalAccs)[LID]...) =
+      std::make_tuple(std::get<Is>(Reducers).MValue...);
+  if (!UniformPow2WG)
+    std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
+        std::make_tuple(std::get<Is>(Identities)...);
+}
+
+template <bool UniformPow2WG, typename... LocalAccT, typename... InputAccT,
+          typename... ResultT, size_t... Is>
+void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize,
+                       std::tuple<InputAccT...> LocalAccs,
+                       std::tuple<LocalAccT...> InputAccs,
+                       const std::tuple<ResultT...> Identities,
+                       std::index_sequence<Is...>) {
+  if (UniformPow2WG || GID < NWorkItems)
+    std::tie(std::get<Is>(LocalAccs)[LID]...) =
+        std::make_tuple(std::get<Is>(InputAccs)[GID]...);
+  if (!UniformPow2WG)
+    std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
+        std::make_tuple(std::get<Is>(Identities)...);
+}
+
+template <typename... LocalAccT, typename... BOPsT, size_t... Is>
+void reduceReduLocalAccs(size_t IndexA, size_t IndexB,
+                         std::tuple<LocalAccT...> LocalAccs,
+                         std::tuple<BOPsT...> BOPs,
+                         std::index_sequence<Is...>) {
+  std::tie(std::get<Is>(LocalAccs)[IndexA]...) =
+      std::make_tuple((std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[IndexA],
+                                          std::get<Is>(LocalAccs)[IndexB]))...);
+}
+
+template <bool UniformPow2WG, typename... Reductions, typename... OutAccT,
+          typename... LocalAccT, typename... BOPsT, size_t... Is,
+          size_t... RWIs>
+void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize,
+                            std::tuple<Reductions...> *,
+                            std::tuple<OutAccT...> OutAccs,
+                            std::tuple<LocalAccT...> LocalAccs,
+                            std::tuple<BOPsT...> BOPs,
+                            std::index_sequence<Is...>,
+                            std::index_sequence<RWIs...>) {
+  // This statement is needed for read_write accessors/USM-memory only.
+  // It adds the initial value of the reduction variable to the final result.
+  std::tie(std::get<RWIs>(LocalAccs)[0]...) =
+      std::make_tuple(std::get<RWIs>(BOPs)(
+          std::get<RWIs>(LocalAccs)[0],
+          std::tuple_element_t<RWIs, std::tuple<Reductions...>>::getOutPointer(
+              std::get<RWIs>(OutAccs))[OutAccIndex])...);
+
+  if (UniformPow2WG) {
+    std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
+        std::get<Is>(OutAccs))[OutAccIndex]...) =
+        std::make_tuple(std::get<Is>(LocalAccs)[0]...);
+  } else {
+    std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
+        std::get<Is>(OutAccs))[OutAccIndex]...) =
+        std::make_tuple(std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[0],
+                                           std::get<Is>(LocalAccs)[WGSize])...);
+  }
+}
+
+// Concatenate an empty sequence.
+constexpr std::index_sequence<> concat_sequences(std::index_sequence<>) {
+  return {};
+}
+
+// Concatenate a sequence consisting of 1 element.
+template <size_t I>
+constexpr std::index_sequence<I> concat_sequences(std::index_sequence<I>) {
+  return {};
+}
+
+// Concatenate two potentially empty sequences.
+template <size_t... Is, size_t... Js>
+constexpr std::index_sequence<Is..., Js...>
+concat_sequences(std::index_sequence<Is...>, std::index_sequence<Js...>) {
+  return {};
+}
+
+// Concatenate more than 2 sequences.
+template <size_t... Is, size_t... Js, class... Rs>
+constexpr auto concat_sequences(std::index_sequence<Is...>,
+                                std::index_sequence<Js...>, Rs...) {
+  return concat_sequences(std::index_sequence<Is..., Js...>{}, Rs{}...);
+}
+
+struct IsRWReductionPredicate {
+  template <typename T> struct Func {
+    static constexpr bool value =
+        std::remove_pointer_t<T>::accessor_mode == access::mode::read_write;
+  };
+};
+
+struct IsNonUsmReductionPredicate {
+  template <typename T> struct Func {
+    static constexpr bool value = !std::remove_pointer_t<T>::is_usm;
+  };
+};
+
+struct EmptyReductionPredicate {
+  template <typename T> struct Func { static constexpr bool value = false; };
+};
+
+template <bool Cond, size_t I> struct FilterElement {
+  using type =
+      std::conditional_t<Cond, std::index_sequence<I>, std::index_sequence<>>;
+};
+
+/// For each index 'I' from the given indices pack 'Is' this function initially
+/// creates a number of short index_sequences, where each of such short
+/// index sequences is either empty (if the given Functor returns false for the
+/// type T[I]) or 1 element 'I' (otherwise). After that this function
+/// concatenates those short sequences into one and returns the result sequence.
+template <typename... T, typename FunctorT, size_t... Is,
+          std::enable_if_t<(sizeof...(Is) > 0), int> Z = 0>
+constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
+  return concat_sequences(
+      typename FilterElement<FunctorT::template Func<std::tuple_element_t<
+                                 Is, std::tuple<T...>>>::value,
+                             Is>::type{}...);
+}
+template <typename... T, typename FunctorT, size_t... Is,
+          std::enable_if_t<(sizeof...(Is) == 0), int> Z = 0>
+constexpr auto filterSequenceHelper(FunctorT, std::index_sequence<Is...>) {
+  return std::index_sequence<>{};
+}
+
+/// For each index 'I' from the given indices pack 'Is' this function returns
+/// an index sequence consisting of only those 'I's for which the 'FunctorT'
+/// applied to 'T[I]' returns true.
+template <typename... T, typename FunctorT, size_t... Is>
+constexpr auto filterSequence(FunctorT F, std::index_sequence<Is...> Indices) {
+  return filterSequenceHelper<T...>(F, Indices);
+}
+
+template <typename KernelName, bool UniformPow2WG, bool IsOneWG,
+          typename KernelType, int Dims, typename... Reductions, size_t... Is>
+void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
+                    const nd_range<Dims> &Range,
+                    std::tuple<Reductions...> &ReduTuple,
+                    std::index_sequence<Is...> ReduIndices) {
+
+  size_t WGSize = Range.get_local_range().size();
+  size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1);
+  auto LocalAccsTuple =
+      createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
+
+  size_t NWorkGroups = IsOneWG ? 1 : Range.get_group_range().size();
+  auto OutAccsTuple =
+      createReduOutAccs<IsOneWG>(NWorkGroups, CGH, ReduTuple, ReduIndices);
+  auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
+  auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
+
+  using Name = typename get_reduction_main_kernel_name_t<
+      KernelName, KernelType, UniformPow2WG, IsOneWG,
+      std::tuple<Reductions...>>::name;
+  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
+    auto ReduIndices = std::index_sequence_for<Reductions...>();
+    auto ReducersTuple =
+        createReducers<Reductions...>(IdentitiesTuple, BOPsTuple, ReduIndices);
+    // The .MValue field of each of the elements in ReducersTuple
+    // gets initialized in this call.
+    callReduUserKernelFunc(KernelFunc, NDIt, ReducersTuple, ReduIndices);
+
+    size_t WGSize = NDIt.get_local_range().size();
+    size_t LID = NDIt.get_local_linear_id();
+    initReduLocalAccs<UniformPow2WG>(LID, WGSize, LocalAccsTuple, ReducersTuple,
+                                     IdentitiesTuple, ReduIndices);
+    NDIt.barrier();
+
+    size_t PrevStep = WGSize;
+    for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+      if (LID < CurStep) {
+        // LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
+                            ReduIndices);
+      } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) {
+        // LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+        reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
+                            ReduIndices);
+      }
+      NDIt.barrier();
+      PrevStep = CurStep;
+    }
+
+    // Compute the partial sum/reduction for the work-group.
+    if (LID == 0) {
+      size_t GrID = NDIt.get_group_linear_id();
+      // If there is only one work-group, then the original accessors need to be
+      // updated, i.e. after the work in each work-group is done, the work-group
+      // result is added to the original value of the read-write accessors or
+      // USM memory.
+      std::conditional_t<IsOneWG, IsRWReductionPredicate,
+                         EmptyReductionPredicate>
+          Predicate;
+      auto RWReduIndices =
+          filterSequence<Reductions...>(Predicate, ReduIndices);
+      writeReduSumsToOutAccs<UniformPow2WG>(
+          GrID, WGSize, (std::tuple<Reductions...> *)nullptr, OutAccsTuple,
+          LocalAccsTuple, BOPsTuple, ReduIndices, RWReduIndices);
+    }
+  });
+}
+
+template <typename KernelName, typename KernelType, int Dims,
+          typename... Reductions, size_t... Is>
+void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                const nd_range<Dims> &Range,
+                std::tuple<Reductions...> &ReduTuple,
+                std::index_sequence<Is...> ReduIndices) {
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
+  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
+  if (NWorkGroups == 1) {
+    if (HasUniformWG)
+      reduCGFuncImpl<KernelName, true, true>(CGH, KernelFunc, Range, ReduTuple,
+                                             ReduIndices);
+    else
+      reduCGFuncImpl<KernelName, false, true>(CGH, KernelFunc, Range, ReduTuple,
+                                              ReduIndices);
+  } else {
+    if (HasUniformWG)
+      reduCGFuncImpl<KernelName, true, false>(CGH, KernelFunc, Range, ReduTuple,
+                                              ReduIndices);
+    else
+      reduCGFuncImpl<KernelName, false, false>(CGH, KernelFunc, Range,
+                                               ReduTuple, ReduIndices);
+  }
+}
+
+// The list of reductions may be empty; for such cases there is nothing to do.
+// This function is intentionally made template to eliminate the need in holding
+// it in sycl library, what would be less efficient and also would create the
+// need in keeping it for long due support backward ABI compatibility.
+template <typename HandlerT>
+std::enable_if_t<std::is_same<HandlerT, handler>::value>
+associateReduAccsWithHandlerHelper(HandlerT &) {}
+
+template <typename ReductionT>
+void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu) {
+  Redu.associateWithHandler(CGH);
+}
+
+template <typename ReductionT, typename... RestT,
+          enable_if_t<(sizeof...(RestT) > 0), int> Z = 0>
+void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu,
+                                        RestT &...Rest) {
+  Redu.associateWithHandler(CGH);
+  associateReduAccsWithHandlerHelper(CGH, Rest...);
+}
+
+template <typename... Reductions, size_t... Is>
+void associateReduAccsWithHandler(handler &CGH,
+                                  std::tuple<Reductions...> &ReduTuple,
+                                  std::index_sequence<Is...>) {
+  associateReduAccsWithHandlerHelper(CGH, std::get<Is>(ReduTuple)...);
+}
+
+template <typename KernelName, typename KernelType, bool UniformPow2WG,
+          bool IsOneWG, typename... Reductions, size_t... Is>
+void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
+                       size_t WGSize, std::tuple<Reductions...> &ReduTuple,
+                       std::index_sequence<Is...> ReduIndices) {
+  // The last kernel DOES write to user's accessor passed to reduction.
+  // Associate it with handler manually.
+  std::conditional_t<IsOneWG, IsNonUsmReductionPredicate,
+                     EmptyReductionPredicate>
+      Predicate;
+  auto AccReduIndices = filterSequence<Reductions...>(Predicate, ReduIndices);
+  associateReduAccsWithHandler(CGH, ReduTuple, AccReduIndices);
+
+  size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1);
+  auto LocalAccsTuple =
+      createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
+  auto InAccsTuple =
+      getReadAccsToPreviousPartialReds(CGH, ReduTuple, ReduIndices);
+  auto OutAccsTuple =
+      createReduOutAccs<IsOneWG>(NWorkGroups, CGH, ReduTuple, ReduIndices);
+  auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
+  auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
+
+  using Name =
+      typename get_reduction_aux_kernel_name_t<KernelName, KernelType,
+                                               UniformPow2WG, IsOneWG,
+                                               std::tuple<Reductions...>>::name;
+  range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
+  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
+  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
+    auto ReduIndices = std::index_sequence_for<Reductions...>();
+    size_t WGSize = NDIt.get_local_range().size();
+    size_t LID = NDIt.get_local_linear_id();
+    size_t GID = NDIt.get_global_linear_id();
+    initReduLocalAccs<UniformPow2WG>(LID, GID, NWorkItems, WGSize,
+                                     LocalAccsTuple, InAccsTuple,
+                                     IdentitiesTuple, ReduIndices);
+    NDIt.barrier();
+
+    size_t PrevStep = WGSize;
+    for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+      if (LID < CurStep) {
+        // LocalAcc[LID] = BOp(LocalAcc[LID], LocalAcc[LID + CurStep]);
+        reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
+                            ReduIndices);
+      } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) {
+        // LocalAcc[WGSize] = BOp(LocalAcc[WGSize], LocalAcc[PrevStep - 1]);
+        reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
+                            ReduIndices);
+      }
+      NDIt.barrier();
+      PrevStep = CurStep;
+    }
+
+    // Compute the partial sum/reduction for the work-group.
+    if (LID == 0) {
+      size_t GrID = NDIt.get_group_linear_id();
+      // If there is only one work-group, then the original accessors need to be
+      // updated, i.e. after the work in each work-group is done, the work-group
+      // result is added to the original value of the read-write accessors or
+      // USM memory.
+      std::conditional_t<IsOneWG, IsRWReductionPredicate,
+                         EmptyReductionPredicate>
+          Predicate;
+      auto RWReduIndices =
+          filterSequence<Reductions...>(Predicate, ReduIndices);
+      writeReduSumsToOutAccs<UniformPow2WG>(
+          GrID, WGSize, (std::tuple<Reductions...> *)nullptr, OutAccsTuple,
+          LocalAccsTuple, BOPsTuple, ReduIndices, RWReduIndices);
+    }
+  });
+}
+
+template <typename KernelName, typename KernelType, typename... Reductions,
+          size_t... Is>
+size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
+                     std::tuple<Reductions...> &ReduTuple,
+                     std::index_sequence<Is...> ReduIndices) {
+  size_t NWorkGroups;
+  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
+
+  bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
+  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
+  if (NWorkGroups == 1) {
+    if (HasUniformWG)
+      reduAuxCGFuncImpl<KernelName, KernelType, true, true>(
+          CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
+    else
+      reduAuxCGFuncImpl<KernelName, KernelType, false, true>(
+          CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
+  } else {
+    if (HasUniformWG)
+      reduAuxCGFuncImpl<KernelName, KernelType, true, false>(
+          CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
+    else
+      reduAuxCGFuncImpl<KernelName, KernelType, false, false>(
+          CGH, NWorkItems, NWorkGroups, WGSize, ReduTuple, ReduIndices);
+  }
+  return NWorkGroups;
+}
+
+template <typename Reduction> size_t reduGetMemPerWorkItemHelper(Reduction &) {
+  return sizeof(typename Reduction::result_type);
+}
+
+template <typename Reduction, typename... RestT>
+size_t reduGetMemPerWorkItemHelper(Reduction &, RestT... Rest) {
+  return sizeof(typename Reduction::result_type) +
+         reduGetMemPerWorkItemHelper(Rest...);
+}
+
+template <typename... ReductionT, size_t... Is>
+size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
+                             std::index_sequence<Is...>) {
+  return reduGetMemPerWorkItemHelper(std::get<Is>(ReduTuple)...);
+}
+
+/// Utility function: for the given tuple \param Tuple the function returns
+/// a new tuple consisting of only elements indexed by the index sequence.
+template <typename TupleT, std::size_t... Is>
+std::tuple<std::tuple_element_t<Is, TupleT>...>
+tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
+  return {std::get<Is>(std::move(Tuple))...};
+}
+
 } // namespace detail
 
 /// Creates and returns an object implementing the reduction functionality.
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 24d8ac49ad9c..f33ef1bfe0de 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -30,6 +30,7 @@
 #include <functional>
 #include <limits>
 #include <memory>
+#include <tuple>
 #include <type_traits>
 
 // SYCL_LANGUAGE_VERSION is 4 digit year followed by 2 digit revision
@@ -244,9 +245,33 @@ enable_if_t<!Reduction::has_fast_atomics, size_t>
 reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
               Reduction &Redu);
 
+template <typename KernelName, typename KernelType, int Dims,
+          typename... Reductions, size_t... Is>
+void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                const nd_range<Dims> &Range,
+                std::tuple<Reductions...> &ReduTuple,
+                std::index_sequence<Is...>);
+
+template <typename KernelName, typename KernelType, typename... Reductions,
+          size_t... Is>
+size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
+                     std::tuple<Reductions...> &ReduTuple,
+                     std::index_sequence<Is...>);
+
 __SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
                                       size_t LocalMemBytesPerWorkItem);
 
+template <typename... ReductionT, size_t... Is>
+size_t reduGetMemPerWorkItem(std::tuple<ReductionT...> &ReduTuple,
+                             std::index_sequence<Is...>);
+
+template <typename TupleT, std::size_t... Is>
+std::tuple<std::tuple_element_t<Is, TupleT>...>
+tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>);
+
+template <typename FirstT, typename... RestT>
+struct are_all_but_last_reductions;
+
 } // namespace detail
 } // namespace ONEAPI
 
@@ -1184,7 +1209,6 @@ class __SYCL_EXPORT handler {
   /// globally visible, there is no need for the developer to provide
   /// a kernel name for it.
   ///
-  /// TODO: Need to handle more than 1 reduction in parallel_for().
   /// TODO: Support HOST. The kernels called by this parallel_for() may use
   /// some functionality that is not yet supported on HOST such as:
   /// barrier(), and ONEAPI::reduce() that also may be used in more
@@ -1257,6 +1281,55 @@ class __SYCL_EXPORT handler {
     } // end while (NWorkItems > 1)
   }
 
+  // This version of parallel_for may handle one or more reductions packed in
+  // \p Rest argument. Note thought that the last element in \p Rest pack is
+  // the kernel function.
+  // TODO: this variant is currently enabled for 2+ reductions only as the
+  // versions handling 1 reduction variable are more efficient right now.
+  template <typename KernelName = detail::auto_name, int Dims,
+            typename... RestT>
+  std::enable_if_t<
+      (sizeof...(RestT) >= 3 &&
+       ONEAPI::detail::are_all_but_last_reductions<RestT...>::value)>
+  parallel_for(nd_range<Dims> Range, RestT... Rest) {
+    std::tuple<RestT...> ArgsTuple(Rest...);
+    constexpr size_t NumArgs = sizeof...(RestT);
+    auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
+    auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
+    auto ReduTuple =
+        ONEAPI::detail::tuple_select_elements(ArgsTuple, ReduIndices);
+
+    size_t LocalMemPerWorkItem =
+        ONEAPI::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
+    // TODO: currently the maximal work group size is determined for the given
+    // queue/device, while it is safer to use queries to the kernel compiled
+    // for the device.
+    size_t MaxWGSize =
+        ONEAPI::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
+    if (Range.get_local_range().size() > MaxWGSize)
+      throw sycl::runtime_error("The implementation handling parallel_for with"
+                                " reduction requires work group size not bigger"
+                                " than " +
+                                    std::to_string(MaxWGSize),
+                                PI_INVALID_WORK_GROUP_SIZE);
+
+    ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, ReduTuple,
+                                           ReduIndices);
+    shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
+    this->finalize();
+
+    size_t NWorkItems = Range.get_group_range().size();
+    while (NWorkItems > 1) {
+      handler AuxHandler(QueueCopy, MIsHost);
+      AuxHandler.saveCodeLoc(MCodeLoc);
+
+      NWorkItems =
+          ONEAPI::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
+              AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
+      MLastEvent = AuxHandler.finalize();
+    } // end while (NWorkItems > 1)
+  }
+
   /// Hierarchical kernel invocation method of a kernel defined as a lambda
   /// encoding the body of each work-group to launch.
   ///

From b2ba2a4bbd520cad21313078ebe72a8a6862c243 Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Mon, 1 Feb 2021 11:28:14 -0800
Subject: [PATCH 2/6] Remove obsolete/debugging code. clang-format. Change LIT
 test requiring c++11 (changed to c++14)

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp           | 13 ++-----------
 ...ycl-include-gnu11.cpp => sycl-include-gnu14.cpp} |  2 +-
 2 files changed, 3 insertions(+), 12 deletions(-)
 rename sycl/test/regression/{sycl-include-gnu11.cpp => sycl-include-gnu14.cpp} (74%)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 83b190f95e6e..0480a9a01987 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -74,9 +74,6 @@ using IsReduBitAND = detail::bool_constant<
 
 template <typename T, class BinaryOperation>
 using IsReduOptForFastAtomicFetch =
-#ifdef SYCL_REDUCTION_NO_FAST_OPTS
-    detail::bool_constant<false>;
-#else
     detail::bool_constant<(is_geninteger32bit<T>::value ||
                            is_geninteger64bit<T>::value) &&
                           (IsReduPlus<T, BinaryOperation>::value ||
@@ -85,20 +82,15 @@ using IsReduOptForFastAtomicFetch =
                            IsReduBitOR<T, BinaryOperation>::value ||
                            IsReduBitXOR<T, BinaryOperation>::value ||
                            IsReduBitAND<T, BinaryOperation>::value)>;
-#endif
 
 template <typename T, class BinaryOperation>
 using IsReduOptForFastReduce = detail::bool_constant<
-#ifdef SYCL_REDUCTION_NO_FAST_OPTS
-    false>;
-#else
     (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
      std::is_same<T, half>::value || std::is_same<T, float>::value ||
      std::is_same<T, double>::value) &&
     (IsReduPlus<T, BinaryOperation>::value ||
      IsReduMinimum<T, BinaryOperation>::value ||
      IsReduMaximum<T, BinaryOperation>::value)>;
-#endif
 
 // Identity = 0
 template <typename T, class BinaryOperation>
@@ -389,8 +381,7 @@ class reduction_impl : private reduction_impl_base {
   static constexpr int accessor_dim = Dims;
   static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims;
   using local_accessor_type =
-      accessor<T, buffer_dim, access::mode::discard_read_write,
-               access::target::local>;
+      accessor<T, buffer_dim, access::mode::read_write, access::target::local>;
 
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
@@ -1374,7 +1365,7 @@ void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu) {
 template <typename ReductionT, typename... RestT,
           enable_if_t<(sizeof...(RestT) > 0), int> Z = 0>
 void associateReduAccsWithHandlerHelper(handler &CGH, ReductionT &Redu,
-                                        RestT &...Rest) {
+                                        RestT &... Rest) {
   Redu.associateWithHandler(CGH);
   associateReduAccsWithHandlerHelper(CGH, Rest...);
 }
diff --git a/sycl/test/regression/sycl-include-gnu11.cpp b/sycl/test/regression/sycl-include-gnu14.cpp
similarity index 74%
rename from sycl/test/regression/sycl-include-gnu11.cpp
rename to sycl/test/regression/sycl-include-gnu14.cpp
index c265b0ff2524..0147acf143de 100644
--- a/sycl/test/regression/sycl-include-gnu11.cpp
+++ b/sycl/test/regression/sycl-include-gnu14.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -std=gnu++11 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -std=gnu++14 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %RUN_ON_HOST %t.out
 
 // UNSUPPORTED: system-windows

From f0d0716cc063bfcdf3ac9a90fd052ce539699a9c Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Mon, 1 Feb 2021 12:28:07 -0800
Subject: [PATCH 3/6] [NFC] Fix build warnings; change 'sycl' with 'SYCL' in
 comments

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 0480a9a01987..c87066bcab54 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -532,14 +532,14 @@ class reduction_impl : private reduction_impl_base {
   /// Returns user's USM pointer passed to reduction for editing.
   template <bool IsOneWG, bool _IsUSM = is_usm>
   std::enable_if_t<IsOneWG && _IsUSM, result_type *>
-  getWriteMemForPartialReds(size_t, handler &CGH) {
+  getWriteMemForPartialReds(size_t, handler &) {
     return getUSMPointer();
   }
 
   /// Returns user's accessor passed to reduction for editing.
   template <bool IsOneWG, bool _IsUSM = is_usm>
   std::enable_if_t<IsOneWG && !_IsUSM, accessor_type>
-  getWriteMemForPartialReds(size_t, handler &CGH) {
+  getWriteMemForPartialReds(size_t, handler &) {
     return *MAcc;
   }
 
@@ -658,7 +658,7 @@ struct get_reduction_aux_kernel_name_t {
 /// user's lambda function KernelFunc and also does one iteration of reduction
 /// of elements computed in user's lambda function.
 /// This version uses ONEAPI::reduce() algorithm to reduce elements in each
-/// of work-groups, then it calls fast sycl atomic operations to update
+/// of work-groups, then it calls fast SYCL atomic operations to update
 /// user's reduction variable.
 ///
 /// Briefly: calls user's lambda, ONEAPI::reduce() + atomic, INT + ADD/MIN/MAX.
@@ -685,7 +685,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 /// user's lambda function KernelFunc and also does one iteration of reduction
 /// of elements computed in user's lambda function.
 /// This version uses tree-reduction algorithm to reduce elements in each
-/// of work-groups, then it calls fast sycl atomic operations to update
+/// of work-groups, then it calls fast SYCL atomic operations to update
 /// user's reduction variable.
 ///
 /// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
@@ -1351,7 +1351,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
 
 // The list of reductions may be empty; for such cases there is nothing to do.
 // This function is intentionally made template to eliminate the need in holding
-// it in sycl library, what would be less efficient and also would create the
+// it in SYCL library, what would be less efficient and also would create the
 // need in keeping it for long due support backward ABI compatibility.
 template <typename HandlerT>
 std::enable_if_t<std::is_same<HandlerT, handler>::value>

From 205c1245688d7c97017702cc4fbc2718df81bef2 Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Mon, 1 Feb 2021 20:18:35 -0800
Subject: [PATCH 4/6] Fix the problem with lambda functions passed to
 parallel_for

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index c87066bcab54..ddb933d53edc 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -1269,9 +1269,10 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
   auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
   auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
 
-  using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, UniformPow2WG, IsOneWG,
-      std::tuple<Reductions...>>::name;
+  using Name =
+      typename get_reduction_main_kernel_name_t<KernelName, KernelType,
+                                                UniformPow2WG, IsOneWG,
+                                                decltype(OutAccsTuple)>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     auto ReduIndices = std::index_sequence_for<Reductions...>();
     auto ReducersTuple =
@@ -1403,7 +1404,7 @@ void reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
   using Name =
       typename get_reduction_aux_kernel_name_t<KernelName, KernelType,
                                                UniformPow2WG, IsOneWG,
-                                               std::tuple<Reductions...>>::name;
+                                               decltype(OutAccsTuple)>::name;
   range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
   nd_range<1> Range{GlobalRange, range<1>(WGSize)};
   CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {

From efcd7e1d86b7945e76ea237d65002286d644a2f1 Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Wed, 3 Feb 2021 16:46:12 -0800
Subject: [PATCH 5/6] Fix initialization of local accessors (now both
 opencl:cpu and opencl:gpu) pass testing

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 70e4277e7d72..b01e471c01e7 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -1178,6 +1178,9 @@ void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize,
   if (UniformPow2WG || GID < NWorkItems)
     std::tie(std::get<Is>(LocalAccs)[LID]...) =
         std::make_tuple(std::get<Is>(InputAccs)[GID]...);
+  else
+    std::tie(std::get<Is>(LocalAccs)[LID]...) =
+        std::make_tuple(std::get<Is>(Identities)...);
   if (!UniformPow2WG)
     std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
         std::make_tuple(std::get<Is>(Identities)...);

From 94e1be0649f968184de167880387b0dde63782c5 Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Thu, 4 Feb 2021 10:27:41 -0800
Subject: [PATCH 6/6] Address reviewer's comments. Fixes are mostly NFC and do
 not change code-gen at this moment.

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp | 77 ++++++++++++++---------
 sycl/include/CL/sycl/handler.hpp          |  8 +--
 2 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index b01e471c01e7..9c3eb5a1b55c 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -392,18 +392,17 @@ class reducer<T, BinaryOperation,
 /// implementation classes. It is needed to detect the reduction classes.
 class reduction_impl_base {};
 
-/// Predicate returning true if and only if 'FirstT' is a reduction class and
-/// all types except the last one from 'RestT' are reductions as well.
-template <typename FirstT, typename... RestT>
-struct are_all_but_last_reductions {
+/// Predicate returning true if all template type parameters except the last one
+/// are reductions.
+template <typename FirstT, typename... RestT> struct AreAllButLastReductions {
   static constexpr bool value =
       std::is_base_of<reduction_impl_base, FirstT>::value &&
-      are_all_but_last_reductions<RestT...>::value;
+      AreAllButLastReductions<RestT...>::value;
 };
 
-/// Helper specialization of are_all_but_last_reductions for one element only.
-/// Returns true if the last and only typename is not a reduction.
-template <typename T> struct are_all_but_last_reductions<T> {
+/// Helper specialization of AreAllButLastReductions for one element only.
+/// Returns true if the template parameter is not a reduction.
+template <typename T> struct AreAllButLastReductions<T> {
   static constexpr bool value = !std::is_base_of<reduction_impl_base, T>::value;
 };
 
@@ -1097,9 +1096,11 @@ reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
 /// the reductions for which a local accessors are needed, this function creates
 /// those local accessors and returns a tuple consisting of them.
 template <typename... Reductions, size_t... Is>
-std::tuple<typename Reductions::local_accessor_type...>
-createReduLocalAccs(size_t Size, handler &CGH, std::index_sequence<Is...>) {
-  return {Reductions::getReadWriteLocalAcc(Size, CGH)...};
+auto createReduLocalAccs(size_t Size, handler &CGH,
+                         std::index_sequence<Is...>) {
+  return std::make_tuple(
+      std::tuple_element_t<Is, std::tuple<Reductions...>>::getReadWriteLocalAcc(
+          Size, CGH)...);
 }
 
 /// For the given 'Reductions' types pack and indices enumerating them this
@@ -1154,7 +1155,7 @@ void callReduUserKernelFunc(KernelType KernelFunc, nd_item<Dims> NDIt,
   KernelFunc(NDIt, std::get<Is>(Reducers)...);
 }
 
-template <bool UniformPow2WG, typename... LocalAccT, typename... ReducerT,
+template <bool Pow2WG, typename... LocalAccT, typename... ReducerT,
           typename... ResultT, size_t... Is>
 void initReduLocalAccs(size_t LID, size_t WGSize,
                        std::tuple<LocalAccT...> LocalAccs,
@@ -1163,7 +1164,11 @@ void initReduLocalAccs(size_t LID, size_t WGSize,
                        std::index_sequence<Is...>) {
   std::tie(std::get<Is>(LocalAccs)[LID]...) =
       std::make_tuple(std::get<Is>(Reducers).MValue...);
-  if (!UniformPow2WG)
+
+  // For work-groups, which size is not power of two, local accessors have
+  // an additional element with index WGSize that is used by the tree-reduction
+  // algorithm. Initialize those additional elements with identity values here.
+  if (!Pow2WG)
     std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
         std::make_tuple(std::get<Is>(Identities)...);
 }
@@ -1175,12 +1180,22 @@ void initReduLocalAccs(size_t LID, size_t GID, size_t NWorkItems, size_t WGSize,
                        std::tuple<LocalAccT...> InputAccs,
                        const std::tuple<ResultT...> Identities,
                        std::index_sequence<Is...>) {
+  // Normally, the local accessors are initialized with elements from the input
+  // accessors. The exception is the case when (GID >= NWorkItems), which
+  // possible only when UniformPow2WG is false. For that case the elements of
+  // local accessors are initialized with identity value, so they would not
+  // give any impact into the final partial sums during the tree-reduction
+  // algorithm work.
   if (UniformPow2WG || GID < NWorkItems)
     std::tie(std::get<Is>(LocalAccs)[LID]...) =
         std::make_tuple(std::get<Is>(InputAccs)[GID]...);
   else
     std::tie(std::get<Is>(LocalAccs)[LID]...) =
         std::make_tuple(std::get<Is>(Identities)...);
+
+  // For work-groups, which size is not power of two, local accessors have
+  // an additional element with index WGSize that is used by the tree-reduction
+  // algorithm. Initialize those additional elements with identity values here.
   if (!UniformPow2WG)
     std::tie(std::get<Is>(LocalAccs)[WGSize]...) =
         std::make_tuple(std::get<Is>(Identities)...);
@@ -1196,7 +1211,7 @@ void reduceReduLocalAccs(size_t IndexA, size_t IndexB,
                                           std::get<Is>(LocalAccs)[IndexB]))...);
 }
 
-template <bool UniformPow2WG, typename... Reductions, typename... OutAccT,
+template <bool Pow2WG, typename... Reductions, typename... OutAccT,
           typename... LocalAccT, typename... BOPsT, size_t... Is,
           size_t... RWIs>
 void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize,
@@ -1214,11 +1229,16 @@ void writeReduSumsToOutAccs(size_t OutAccIndex, size_t WGSize,
           std::tuple_element_t<RWIs, std::tuple<Reductions...>>::getOutPointer(
               std::get<RWIs>(OutAccs))[OutAccIndex])...);
 
-  if (UniformPow2WG) {
+  if (Pow2WG) {
+    // The partial sums for the work-group are stored in 0-th elements of local
+    // accessors. Simply write those sums to output accessors.
     std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
         std::get<Is>(OutAccs))[OutAccIndex]...) =
         std::make_tuple(std::get<Is>(LocalAccs)[0]...);
   } else {
+    // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
+    // elements. Combine them into final partial sums and write to output
+    // accessors.
     std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
         std::get<Is>(OutAccs))[OutAccIndex]...) =
         std::make_tuple(std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[0],
@@ -1300,15 +1320,15 @@ constexpr auto filterSequence(FunctorT F, std::index_sequence<Is...> Indices) {
   return filterSequenceHelper<T...>(F, Indices);
 }
 
-template <typename KernelName, bool UniformPow2WG, bool IsOneWG,
-          typename KernelType, int Dims, typename... Reductions, size_t... Is>
+template <typename KernelName, bool Pow2WG, bool IsOneWG, typename KernelType,
+          int Dims, typename... Reductions, size_t... Is>
 void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
                     const nd_range<Dims> &Range,
                     std::tuple<Reductions...> &ReduTuple,
                     std::index_sequence<Is...> ReduIndices) {
 
   size_t WGSize = Range.get_local_range().size();
-  size_t LocalAccSize = WGSize + (UniformPow2WG ? 0 : 1);
+  size_t LocalAccSize = WGSize + (Pow2WG ? 0 : 1);
   auto LocalAccsTuple =
       createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
 
@@ -1318,10 +1338,8 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
   auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
   auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
 
-  using Name =
-      typename get_reduction_main_kernel_name_t<KernelName, KernelType,
-                                                UniformPow2WG, IsOneWG,
-                                                decltype(OutAccsTuple)>::name;
+  using Name = typename get_reduction_main_kernel_name_t<
+      KernelName, KernelType, Pow2WG, IsOneWG, decltype(OutAccsTuple)>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     auto ReduIndices = std::index_sequence_for<Reductions...>();
     auto ReducersTuple =
@@ -1332,8 +1350,8 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
 
     size_t WGSize = NDIt.get_local_range().size();
     size_t LID = NDIt.get_local_linear_id();
-    initReduLocalAccs<UniformPow2WG>(LID, WGSize, LocalAccsTuple, ReducersTuple,
-                                     IdentitiesTuple, ReduIndices);
+    initReduLocalAccs<Pow2WG>(LID, WGSize, LocalAccsTuple, ReducersTuple,
+                              IdentitiesTuple, ReduIndices);
     NDIt.barrier();
 
     size_t PrevStep = WGSize;
@@ -1342,7 +1360,7 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
         // LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
         reduceReduLocalAccs(LID, LID + CurStep, LocalAccsTuple, BOPsTuple,
                             ReduIndices);
-      } else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1)) {
+      } else if (!Pow2WG && LID == CurStep && (PrevStep & 0x1)) {
         // LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
         reduceReduLocalAccs(WGSize, PrevStep - 1, LocalAccsTuple, BOPsTuple,
                             ReduIndices);
@@ -1363,7 +1381,7 @@ void reduCGFuncImpl(handler &CGH, KernelType KernelFunc,
           Predicate;
       auto RWReduIndices =
           filterSequence<Reductions...>(Predicate, ReduIndices);
-      writeReduSumsToOutAccs<UniformPow2WG>(
+      writeReduSumsToOutAccs<Pow2WG>(
           GrID, WGSize, (std::tuple<Reductions...> *)nullptr, OutAccsTuple,
           LocalAccsTuple, BOPsTuple, ReduIndices, RWReduIndices);
     }
@@ -1376,21 +1394,18 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
                 const nd_range<Dims> &Range,
                 std::tuple<Reductions...> &ReduTuple,
                 std::index_sequence<Is...> ReduIndices) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
-
   bool Pow2WG = (WGSize & (WGSize - 1)) == 0;
-  bool HasUniformWG = Pow2WG && (NWorkGroups * WGSize == NWorkItems);
   if (NWorkGroups == 1) {
-    if (HasUniformWG)
+    if (Pow2WG)
       reduCGFuncImpl<KernelName, true, true>(CGH, KernelFunc, Range, ReduTuple,
                                              ReduIndices);
     else
       reduCGFuncImpl<KernelName, false, true>(CGH, KernelFunc, Range, ReduTuple,
                                               ReduIndices);
   } else {
-    if (HasUniformWG)
+    if (Pow2WG)
       reduCGFuncImpl<KernelName, true, false>(CGH, KernelFunc, Range, ReduTuple,
                                               ReduIndices);
     else
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index f33ef1bfe0de..5fbd1d99be69 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -269,8 +269,7 @@ template <typename TupleT, std::size_t... Is>
 std::tuple<std::tuple_element_t<Is, TupleT>...>
 tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>);
 
-template <typename FirstT, typename... RestT>
-struct are_all_but_last_reductions;
+template <typename FirstT, typename... RestT> struct AreAllButLastReductions;
 
 } // namespace detail
 } // namespace ONEAPI
@@ -1288,9 +1287,8 @@ class __SYCL_EXPORT handler {
   // versions handling 1 reduction variable are more efficient right now.
   template <typename KernelName = detail::auto_name, int Dims,
             typename... RestT>
-  std::enable_if_t<
-      (sizeof...(RestT) >= 3 &&
-       ONEAPI::detail::are_all_but_last_reductions<RestT...>::value)>
+  std::enable_if_t<(sizeof...(RestT) >= 3 &&
+                    ONEAPI::detail::AreAllButLastReductions<RestT...>::value)>
   parallel_for(nd_range<Dims> Range, RestT... Rest) {
     std::tuple<RestT...> ArgsTuple(Rest...);
     constexpr size_t NumArgs = sizeof...(RestT);