Skip to content

Commit 2e73da7

Browse files
authored
[SYCL] Implement true USM reductions (#1888)
This patch replaces the existing implementation of USM reductions with the more reliable and more efficient implementation. The previous implementation created placeholder buffer/accessor for USM pointer passed to reduction. That caused some unnecessary overhead. Even worse, that approach did not work on systems with detached memory, and could cause the need in additional workaround - copy from USM to HOST mem before the main reduction work and copy from HOST to USM memory after reduction work. That would make the implementation even slower. The new approach uses USM memory passed by user to reduction, and it works 2x faster than the previous approach with workaround. In order to avoid code duplication which could be caused by adding true USM implementations, the significant rework in reduCGFunc and reduAuxCGFunc was done. The re-work though did NOT change the actual implementation and the algorithmic parts. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent bf8493c commit 2e73da7

File tree

2 files changed

+367
-487
lines changed

2 files changed

+367
-487
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 24 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -179,33 +179,19 @@ class reduction_impl;
179179

180180
using cl::sycl::detail::enable_if_t;
181181

182-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
183-
enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
184-
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
185-
Reduction &Redu, typename Reduction::rw_accessor_type &Out);
186-
187-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
188-
enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
182+
template <typename KernelName, typename KernelType, int Dims, class Reduction,
183+
typename OutputT>
184+
enable_if_t<Reduction::has_fast_atomics>
189185
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
190-
Reduction &Redu, typename Reduction::rw_accessor_type &Out);
191-
192-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
193-
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
194-
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
195-
Reduction &Redu);
186+
Reduction &Redu, OutputT Out);
196187

197188
template <typename KernelName, typename KernelType, int Dims, class Reduction>
198-
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
189+
enable_if_t<!Reduction::has_fast_atomics>
199190
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
200191
Reduction &Redu);
201192

202193
template <typename KernelName, typename KernelType, int Dims, class Reduction>
203-
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
204-
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
205-
Reduction &Redu);
206-
207-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
208-
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
194+
enable_if_t<!Reduction::has_fast_atomics>
209195
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
210196
Reduction &Redu);
211197
} // namespace detail
@@ -958,24 +944,23 @@ class __SYCL_EXPORT handler {
958944
template <typename KernelName = detail::auto_name, typename KernelType,
959945
int Dims, typename Reduction>
960946
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
961-
Reduction::has_fast_atomics>
947+
Reduction::has_fast_atomics && !Reduction::is_usm>
962948
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
963-
if (Reduction::is_usm)
964-
Redu.associateWithHandler(*this);
965-
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
966-
auto Acc = Redu.getUserAccessor();
967-
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu, Acc);
968-
969-
// Submit non-blocking copy from reduction accessor to user's reduction
970-
// variable.
971-
if (Reduction::is_usm) {
972-
this->finalize();
973-
handler CopyHandler(QueueCopy, MIsHost);
974-
CopyHandler.saveCodeLoc(MCodeLoc);
975-
Redu.associateWithHandler(CopyHandler);
976-
CopyHandler.copy(Acc, Redu.getUSMPointer());
977-
MLastEvent = CopyHandler.finalize();
978-
}
949+
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
950+
Redu.getUserAccessor());
951+
}
952+
953+
/// Implements parallel_for() accepting nd_range and 1 reduction variable
954+
/// having 'read_write' access mode.
955+
/// This version uses fast sycl::atomic operations to update user's reduction
956+
/// variable at the end of each work-group work.
957+
template <typename KernelName = detail::auto_name, typename KernelType,
958+
int Dims, typename Reduction>
959+
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
960+
Reduction::has_fast_atomics && Reduction::is_usm>
961+
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
962+
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
963+
Redu.getUSMPointer());
979964
}
980965

981966
/// Implements parallel_for() accepting nd_range and 1 reduction variable
@@ -1045,8 +1030,6 @@ class __SYCL_EXPORT handler {
10451030
// necessary to reduce all partial sums into one final sum.
10461031

10471032
// 1. Call the kernel that includes user's lambda function.
1048-
if (Reduction::is_usm && NWorkGroups == 1)
1049-
Redu.associateWithHandler(*this);
10501033
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
10511034
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
10521035
this->finalize();
@@ -1073,26 +1056,16 @@ class __SYCL_EXPORT handler {
10731056
handler AuxHandler(QueueCopy, MIsHost);
10741057
AuxHandler.saveCodeLoc(MCodeLoc);
10751058

1076-
// The last kernel DOES write to reduction's accessor.
1059+
// The last kernel DOES write to user's accessor passed to reduction.
10771060
// Associate it with handler manually.
1078-
if (NWorkGroups == 1)
1061+
if (NWorkGroups == 1 && !Reduction::is_usm)
10791062
Redu.associateWithHandler(AuxHandler);
10801063
intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
10811064
NWorkItems, Redu);
10821065
MLastEvent = AuxHandler.finalize();
10831066

10841067
NWorkItems = NWorkGroups;
10851068
} // end while (NWorkItems > 1)
1086-
1087-
// Submit non-blocking copy from reduction accessor to user's reduction
1088-
// variable.
1089-
if (Reduction::is_usm) {
1090-
handler CopyHandler(QueueCopy, MIsHost);
1091-
CopyHandler.saveCodeLoc(MCodeLoc);
1092-
Redu.associateWithHandler(CopyHandler);
1093-
CopyHandler.copy(Redu.getUserAccessor(), Redu.getUSMPointer());
1094-
MLastEvent = CopyHandler.finalize();
1095-
}
10961069
}
10971070

10981071
/// Hierarchical kernel invocation method of a kernel defined as a lambda

0 commit comments

Comments
 (0)