@@ -179,33 +179,19 @@ class reduction_impl;
179179
180180using cl::sycl::detail::enable_if_t ;
181181
182- template <typename KernelName, typename KernelType, int Dims, class Reduction >
183- enable_if_t <Reduction::has_fast_reduce && Reduction::has_fast_atomics>
184- reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
185- Reduction &Redu, typename Reduction::rw_accessor_type &Out);
186-
187- template <typename KernelName, typename KernelType, int Dims, class Reduction >
188- enable_if_t <!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
182+ template <typename KernelName, typename KernelType, int Dims, class Reduction ,
183+ typename OutputT>
184+ enable_if_t <Reduction::has_fast_atomics>
189185reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
190- Reduction &Redu, typename Reduction::rw_accessor_type &Out);
191-
192- template <typename KernelName, typename KernelType, int Dims, class Reduction >
193- enable_if_t <Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
194- reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
195- Reduction &Redu);
186+ Reduction &Redu, OutputT Out);
196187
197188template <typename KernelName, typename KernelType, int Dims, class Reduction >
198- enable_if_t <!Reduction::has_fast_reduce && !Reduction:: has_fast_atomics>
189+ enable_if_t <!Reduction::has_fast_atomics>
199190reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
200191 Reduction &Redu);
201192
202193template <typename KernelName, typename KernelType, int Dims, class Reduction >
203- enable_if_t <Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
204- reduAuxCGFunc (handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
205- Reduction &Redu);
206-
207- template <typename KernelName, typename KernelType, int Dims, class Reduction >
208- enable_if_t <!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
194+ enable_if_t <!Reduction::has_fast_atomics>
209195reduAuxCGFunc (handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
210196 Reduction &Redu);
211197} // namespace detail
@@ -958,24 +944,23 @@ class __SYCL_EXPORT handler {
958944 template <typename KernelName = detail::auto_name, typename KernelType,
959945 int Dims, typename Reduction>
960946 detail::enable_if_t <Reduction::accessor_mode == access::mode::read_write &&
961- Reduction::has_fast_atomics>
947+ Reduction::has_fast_atomics && !Reduction::is_usm >
962948 parallel_for (nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
963- if (Reduction::is_usm)
964- Redu.associateWithHandler (*this );
965- shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
966- auto Acc = Redu.getUserAccessor ();
967- intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu, Acc);
968-
969- // Submit non-blocking copy from reduction accessor to user's reduction
970- // variable.
971- if (Reduction::is_usm) {
972- this ->finalize ();
973- handler CopyHandler (QueueCopy, MIsHost);
974- CopyHandler.saveCodeLoc (MCodeLoc);
975- Redu.associateWithHandler (CopyHandler);
976- CopyHandler.copy (Acc, Redu.getUSMPointer ());
977- MLastEvent = CopyHandler.finalize ();
978- }
949+ intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu,
950+ Redu.getUserAccessor ());
951+ }
952+
953+ // / Implements parallel_for() accepting nd_range and 1 reduction variable
954+ // / having 'read_write' access mode.
955+ // / This version uses fast sycl::atomic operations to update user's reduction
956+ // / variable at the end of each work-group work.
957+ template <typename KernelName = detail::auto_name, typename KernelType,
958+ int Dims, typename Reduction>
959+ detail::enable_if_t <Reduction::accessor_mode == access::mode::read_write &&
960+ Reduction::has_fast_atomics && Reduction::is_usm>
961+ parallel_for (nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
962+ intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu,
963+ Redu.getUSMPointer ());
979964 }
980965
981966 // / Implements parallel_for() accepting nd_range and 1 reduction variable
@@ -1045,8 +1030,6 @@ class __SYCL_EXPORT handler {
10451030 // necessary to reduce all partial sums into one final sum.
10461031
10471032 // 1. Call the kernel that includes user's lambda function.
1048- if (Reduction::is_usm && NWorkGroups == 1 )
1049- Redu.associateWithHandler (*this );
10501033 intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu);
10511034 shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
10521035 this ->finalize ();
@@ -1073,26 +1056,16 @@ class __SYCL_EXPORT handler {
10731056 handler AuxHandler (QueueCopy, MIsHost);
10741057 AuxHandler.saveCodeLoc (MCodeLoc);
10751058
1076- // The last kernel DOES write to reduction 's accessor.
1059+ // The last kernel DOES write to user 's accessor passed to reduction .
10771060 // Associate it with handler manually.
1078- if (NWorkGroups == 1 )
1061+ if (NWorkGroups == 1 && !Reduction::is_usm )
10791062 Redu.associateWithHandler (AuxHandler);
10801063 intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
10811064 NWorkItems, Redu);
10821065 MLastEvent = AuxHandler.finalize ();
10831066
10841067 NWorkItems = NWorkGroups;
10851068 } // end while (NWorkItems > 1)
1086-
1087- // Submit non-blocking copy from reduction accessor to user's reduction
1088- // variable.
1089- if (Reduction::is_usm) {
1090- handler CopyHandler (QueueCopy, MIsHost);
1091- CopyHandler.saveCodeLoc (MCodeLoc);
1092- Redu.associateWithHandler (CopyHandler);
1093- CopyHandler.copy (Redu.getUserAccessor (), Redu.getUSMPointer ());
1094- MLastEvent = CopyHandler.finalize ();
1095- }
10961069 }
10971070
10981071 // / Hierarchical kernel invocation method of a kernel defined as a lambda
0 commit comments