diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 2164fcd45358f..c64dfa71b1066 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -200,8 +200,7 @@ void event_impl::wait( waitInternal(); else if (MCommand) detail::Scheduler::getInstance().waitForEvent(Self); - if (MCommand && !SYCLConfig::get()) - detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self)); + cleanupCommand(std::move(Self)); #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); @@ -222,6 +221,12 @@ void event_impl::wait_and_throw( Cmd->getQueue()->throw_asynchronous(); } +void event_impl::cleanupCommand( + std::shared_ptr Self) const { + if (MCommand && !SYCLConfig::get()) + detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self)); +} + template <> cl_ulong event_impl::get_profiling_info() const { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f32e4111e782a..d2b31fcdfbe69 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -74,6 +74,12 @@ class event_impl { /// \param Self is a pointer to this event. void wait_and_throw(std::shared_ptr Self); + /// Clean up the command associated with the event. Assumes that the task this + /// event is associated with has been completed. + /// + /// \param Self is a pointer to this event. + void cleanupCommand(std::shared_ptr Self) const; + /// Queries this event for profiling information. /// /// If the requested info is not available when this member function is diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 3de5a8e35572b..c92cfa1676918 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -61,7 +61,11 @@ event queue_impl::memset(const std::shared_ptr &Self, return event(); event ResEvent = prepareUSMEvent(Self, NativeEvent); - addSharedEvent(ResEvent); + // Track only if we won't be able to handle it with piQueueFinish. + // FIXME these events are stored for level zero until as a workaround, remove + // once piEventRelease no longer calls wait on the event in the plugin. + if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero) + addSharedEvent(ResEvent); return ResEvent; } @@ -76,7 +80,11 @@ event queue_impl::memcpy(const std::shared_ptr &Self, return event(); event ResEvent = prepareUSMEvent(Self, NativeEvent); - addSharedEvent(ResEvent); + // Track only if we won't be able to handle it with piQueueFinish. + // FIXME these events are stored for level zero until as a workaround, remove + // once piEventRelease no longer calls wait on the event in the plugin. + if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero) + addSharedEvent(ResEvent); return ResEvent; } @@ -92,7 +100,11 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, return event(); event ResEvent = prepareUSMEvent(Self, NativeEvent); - addSharedEvent(ResEvent); + // Track only if we won't be able to handle it with piQueueFinish. + // FIXME these events are stored for level zero until as a workaround, remove + // once piEventRelease no longer calls wait on the event in the plugin. + if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero) + addSharedEvent(ResEvent); return ResEvent; } @@ -101,8 +113,14 @@ void queue_impl::addEvent(const event &Event) { Command *Cmd = (Command *)(Eimpl->getCommand()); if (!Cmd) { // if there is no command on the event, we cannot track it with MEventsWeak - // as that will leave it with no owner. Track in MEventsShared - addSharedEvent(Event); + // as that will leave it with no owner. Track in MEventsShared only if we're + // unable to call piQueueFinish during wait. + // FIXME these events are stored for level zero until as a workaround, + // remove once piEventRelease no longer calls wait on the event in the + // plugin. + if (is_host() || !MSupportOOO || + getPlugin().getBackend() == backend::level_zero) + addSharedEvent(Event); } else { std::weak_ptr EventWeakPtr{Eimpl}; std::lock_guard Lock{MMutex}; @@ -114,6 +132,10 @@ void queue_impl::addEvent(const event &Event) { /// but some events have no other owner. In this case, /// addSharedEvent will have the queue track the events via a shared pointer. void queue_impl::addSharedEvent(const event &Event) { + // FIXME The assertion should be corrected once the Level Zero workaround is + // removed. + assert(is_host() || !MSupportOOO || + getPlugin().getBackend() == backend::level_zero); std::lock_guard Lock(MMutex); // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not @@ -234,21 +256,49 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif - std::vector> Events; - std::vector USMEvents; + std::vector> WeakEvents; + std::vector SharedEvents; { - std::lock_guard Lock(MMutex); - Events.swap(MEventsWeak); - USMEvents.swap(MEventsShared); + std::lock_guard Lock(MMutex); + WeakEvents.swap(MEventsWeak); + SharedEvents.swap(MEventsShared); + } + // If the queue is either a host one or does not support OOO (and we use + // multiple in-order queues as a result of that), wait for each event + // directly. Otherwise, only wait for unenqueued or host task events, starting + // from the latest submitted task in order to minimize total amount of calls, + // then handle the rest with piQueueFinish. + bool SupportsPiFinish = !is_host() && MSupportOOO; + for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); + EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtrIt->lock()) { + // A nullptr PI event indicates that piQueueFinish will not cover it, + // either because it's a host task event or an unenqueued one. + if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) { + EventImplSharedPtr->wait(EventImplSharedPtr); + } + } + } + if (SupportsPiFinish) { + const detail::plugin &Plugin = getPlugin(); + Plugin.call(getHandleRef()); + for (std::weak_ptr &EventImplWeakPtr : WeakEvents) + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtr.lock()) + EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); + // FIXME these events are stored for level zero until as a workaround, + // remove once piEventRelease no longer calls wait on the event in the + // plugin. + if (Plugin.getBackend() == backend::level_zero) { + SharedEvents.clear(); + } + assert(SharedEvents.empty() && "Queues that support calling piQueueFinish " + "shouldn't have shared events"); + } else { + for (event &Event : SharedEvents) + Event.wait(); } - - for (std::weak_ptr &EventImplWeakPtr : Events) - if (std::shared_ptr EventImplPtr = EventImplWeakPtr.lock()) - EventImplPtr->wait(EventImplPtr); - - for (event &Event : USMEvents) - Event.wait(); - #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); #endif diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index e0f28f1b9c2de..8e1b17f5ff920 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -486,10 +486,14 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) { const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); // 1. Async work is not supported for host device. - // 2. The event handle can be null in case of, for example, alloca command, - // which is currently synchronous, so don't generate OpenCL event. - // Though, this event isn't host one as it's context isn't host one. - if (DepEvent->is_host() || DepEvent->getHandleRef() == nullptr) { + // 2. Some types of commands do not produce PI events after they are enqueued + // (e.g. alloca). Note that we can't check the pi event to make that + // distinction since the command might still be unenqueued at this point. + bool PiEventExpected = !DepEvent->is_host(); + if (auto *DepCmd = static_cast(DepEvent->getCommand())) + PiEventExpected &= DepCmd->producesPiEvent(); + + if (!PiEventExpected) { // call to waitInternal() is in waitForPreparedHostEvents() as it's called // from enqueue process functions MPreparedHostDepsEvents.push_back(DepEvent); @@ -520,6 +524,8 @@ const ContextImplPtr &Command::getWorkerContext() const { const QueueImplPtr &Command::getWorkerQueue() const { return MQueue; } +bool Command::producesPiEvent() const { return true; } + Command *Command::addDep(DepDesc NewDep) { Command *ConnectionCmd = nullptr; @@ -731,6 +737,8 @@ void AllocaCommandBase::emitInstrumentationData() { #endif } +bool AllocaCommandBase::producesPiEvent() const { return false; } + AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData, AllocaCommandBase *LinkedAllocaCmd) @@ -998,6 +1006,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { } } +bool ReleaseCommand::producesPiEvent() const { return false; } + MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode) @@ -1392,6 +1402,8 @@ void EmptyCommand::printDot(std::ostream &Stream) const { } } +bool EmptyCommand::producesPiEvent() const { return false; } + void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; @@ -2193,6 +2205,10 @@ cl_int ExecCGCommand::enqueueImp() { return PI_INVALID_OPERATION; } +bool ExecCGCommand::producesPiEvent() const { + return MCommandGroup->getType() != CG::CGTYPE::CODEPLAY_HOST_TASK; +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 9f85146eaf347..d976577821ebb 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -189,6 +189,9 @@ class Command { /// for memory copy commands. virtual const QueueImplPtr &getWorkerQueue() const; + /// Returns true iff the command produces a PI event on non-host devices. + virtual bool producesPiEvent() const; + protected: EventImplPtr MEvent; QueueImplPtr MQueue; @@ -306,6 +309,8 @@ class EmptyCommand : public Command { void emitInstrumentationData() override; + bool producesPiEvent() const final; + private: cl_int enqueueImp() final; @@ -323,6 +328,7 @@ class ReleaseCommand : public Command { void printDot(std::ostream &Stream) const final; void emitInstrumentationData() override; + bool producesPiEvent() const final; private: cl_int enqueueImp() final; @@ -347,6 +353,8 @@ class AllocaCommandBase : public Command { void emitInstrumentationData() override; + bool producesPiEvent() const final; + void *MMemAllocation = nullptr; /// Alloca command linked with current command. @@ -518,6 +526,8 @@ class ExecCGCommand : public Command { MCommandGroup.release(); } + bool producesPiEvent() const final; + private: cl_int enqueueImp() final; diff --git a/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp b/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp index d98140ac4b1ff..167df039b4474 100644 --- a/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp +++ b/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp @@ -25,10 +25,10 @@ // CHECK: ZE ---> zeCommandListClose // CHECK: ZE ---> zeCommandQueueExecuteCommandLists // CHECK: ---> piEventGetInfo -// CHECK-NOT: piEventsWait +// CHECK-NOT: piQueueFinish // CHECK: ---> piEnqueueKernelLaunch // CHECK: ZE ---> zeCommandListAppendLaunchKernel -// CHECK: ---> piEventsWait +// CHECK: ---> piQueueFinish // Look for close and Execute after piEventsWait // CHECK: ZE ---> zeCommandListClose // CHECK: ZE ---> zeCommandQueueExecuteCommandLists diff --git a/sycl/test/on-device/plugins/level_zero_batch_test.cpp b/sycl/test/on-device/plugins/level_zero_batch_test.cpp index f834e2bfab121..43426e6fb6dd1 100644 --- a/sycl/test/on-device/plugins/level_zero_batch_test.cpp +++ b/sycl/test/on-device/plugins/level_zero_batch_test.cpp @@ -86,7 +86,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piEventsWait( +// CKALL: ---> piQueueFinish( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( @@ -142,7 +142,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piEventsWait( +// CKALL: ---> piQueueFinish( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( @@ -198,7 +198,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piEventsWait( +// CKALL: ---> piQueueFinish( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( diff --git a/sycl/unittests/queue/CMakeLists.txt b/sycl/unittests/queue/CMakeLists.txt index 732e5de87e87d..9aa147ed10e0c 100644 --- a/sycl/unittests/queue/CMakeLists.txt +++ b/sycl/unittests/queue/CMakeLists.txt @@ -1,3 +1,4 @@ add_sycl_unittest(QueueTests OBJECT EventClear.cpp + Wait.cpp ) diff --git a/sycl/unittests/queue/EventClear.cpp b/sycl/unittests/queue/EventClear.cpp index d5b42a187b292..24bf30fa58b2d 100644 --- a/sycl/unittests/queue/EventClear.cpp +++ b/sycl/unittests/queue/EventClear.cpp @@ -25,6 +25,19 @@ std::unique_ptr TestContext; const int ExpectedEventThreshold = 128; +pi_result redefinedQueueCreate(pi_context context, pi_device device, + pi_queue_properties properties, + pi_queue *queue) { + // Use in-order queues to force storing events for calling wait on them, + // rather than calling piQueueFinish. + if (properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + return PI_INVALID_QUEUE_PROPERTIES; + } + return PI_SUCCESS; +} + +pi_result redefinedQueueRelease(pi_queue Queue) { return PI_SUCCESS; } + pi_result redefinedUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, @@ -83,6 +96,8 @@ bool preparePiMock(platform &Plt) { } unittest::PiMock Mock{Plt}; + Mock.redefine(redefinedQueueCreate); + Mock.redefine(redefinedQueueRelease); Mock.redefine( redefinedUSMEnqueueMemset); Mock.redefine(redefinedEventsWait); diff --git a/sycl/unittests/queue/Wait.cpp b/sycl/unittests/queue/Wait.cpp new file mode 100644 index 0000000000000..417b2c5078c94 --- /dev/null +++ b/sycl/unittests/queue/Wait.cpp @@ -0,0 +1,198 @@ +//==--------------------- Wait.cpp --- queue unit tests --------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +namespace { +using namespace cl::sycl; + +struct TestCtx { + bool SupportOOO = true; + bool PiQueueFinishCalled = false; + int NEventsWaitedFor = 0; + int EventReferenceCount = 0; +}; +static TestCtx TestContext; + +pi_result redefinedQueueCreate(pi_context context, pi_device device, + pi_queue_properties properties, + pi_queue *queue) { + if (!TestContext.SupportOOO && + properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + return PI_INVALID_QUEUE_PROPERTIES; + } + return PI_SUCCESS; +} + +pi_result redefinedQueueRelease(pi_queue Queue) { return PI_SUCCESS; } + +pi_result redefinedUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, + size_t Count, + pi_uint32 Num_events_in_waitlist, + const pi_event *Events_waitlist, + pi_event *Event) { + // Provide a dummy non-nullptr value + TestContext.EventReferenceCount = 1; + *Event = reinterpret_cast(1); + return PI_SUCCESS; +} +pi_result redefinedEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, + const void *Pattern, size_t PatternSize, + size_t Offset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + // Provide a dummy non-nullptr value + TestContext.EventReferenceCount = 1; + *Event = reinterpret_cast(1); + return PI_SUCCESS; +} + +pi_result redefinedQueueFinish(pi_queue Queue) { + TestContext.PiQueueFinishCalled = true; + return PI_SUCCESS; +} +pi_result redefinedEventsWait(pi_uint32 num_events, + const pi_event *event_list) { + ++TestContext.NEventsWaitedFor; + return PI_SUCCESS; +} + +pi_result redefinedEventGetInfo(pi_event event, pi_event_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return PI_SUCCESS; +} + +pi_result redefinedEventRetain(pi_event event) { + ++TestContext.EventReferenceCount; + return PI_SUCCESS; +} + +pi_result redefinedEventRelease(pi_event event) { + --TestContext.EventReferenceCount; + return PI_SUCCESS; +} + +bool preparePiMock(platform &Plt) { + if (Plt.is_host()) { + std::cout << "Not run on host - no PI events created in that case" + << std::endl; + return false; + } + + unittest::PiMock Mock{Plt}; + Mock.redefine(redefinedQueueCreate); + Mock.redefine(redefinedQueueRelease); + Mock.redefine(redefinedQueueFinish); + Mock.redefine( + redefinedUSMEnqueueMemset); + Mock.redefine(redefinedEventsWait); + Mock.redefine( + redefinedEnqueueMemBufferFill); + Mock.redefine(redefinedEventGetInfo); + Mock.redefine(redefinedEventRetain); + Mock.redefine(redefinedEventRelease); + return true; +} + +TEST(QueueWait, QueueWaitTest) { + platform Plt{default_selector()}; + if (!preparePiMock(Plt)) + return; + context Ctx{Plt}; + queue Q{Ctx, default_selector()}; + + unsigned char *HostAlloc = (unsigned char *)malloc_host(1, Ctx); + + // USM API event + TestContext = {}; + Q.memset(HostAlloc, 42, 1); + // No need to keep the event since we'll use piQueueFinish. + // FIXME ... unless the plugin is Level Zero, where there's a workaround that + // releases events later. + if (detail::getSyclObjImpl(Plt)->getPlugin().getBackend() != + backend::level_zero) { + ASSERT_EQ(TestContext.EventReferenceCount, 0); + } + Q.wait(); + ASSERT_EQ(TestContext.NEventsWaitedFor, 0); + ASSERT_TRUE(TestContext.PiQueueFinishCalled); + + // Events with temporary ownership + { + TestContext = {}; + buffer buf{range<1>(1)}; + Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.fill(acc, 42); + }); + Q.wait(); + // Still owned by the execution graph + ASSERT_EQ(TestContext.EventReferenceCount, 1); + ASSERT_EQ(TestContext.NEventsWaitedFor, 0); + ASSERT_TRUE(TestContext.PiQueueFinishCalled); + } + + // Blocked commands + { + TestContext = {}; + buffer buf{range<1>(1)}; + event HostTaskEvent = Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.host_task([=]() { (void)acc; }); + }); + std::shared_ptr HostTaskEventImpl = + detail::getSyclObjImpl(HostTaskEvent); + auto *Cmd = static_cast(HostTaskEventImpl->getCommand()); + detail::Command *EmptyTask = *Cmd->MUsers.begin(); + ASSERT_EQ(EmptyTask->getType(), detail::Command::EMPTY_TASK); + HostTaskEvent.wait(); + // Use the empty task produced by the host task to block the next commands + while (EmptyTask->MEnqueueStatus != + detail::EnqueueResultT::SyclEnqueueSuccess) + continue; + EmptyTask->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueBlocked; + Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.fill(acc, 42); + }); + Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.fill(acc, 42); + }); + // Unblock the empty task to allow the submitted events to complete once + // enqueued. + EmptyTask->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueSuccess; + Q.wait(); + // Only a single event (the last one) should be waited for here. + ASSERT_EQ(TestContext.NEventsWaitedFor, 1); + ASSERT_TRUE(TestContext.PiQueueFinishCalled); + } + + // Test behaviour for emulating an OOO queue with multiple in-order ones. + TestContext = {}; + TestContext.SupportOOO = false; + Q = {Ctx, default_selector()}; + Q.memset(HostAlloc, 42, 1); + // The event is kept alive in this case to call wait. + ASSERT_EQ(TestContext.EventReferenceCount, 1); + Q.wait(); + ASSERT_EQ(TestContext.EventReferenceCount, 0); + ASSERT_EQ(TestContext.NEventsWaitedFor, 1); + ASSERT_FALSE(TestContext.PiQueueFinishCalled); +} + +} // namespace