From dc32a6673a497ef09c342f39702bc57456403505 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 22 Jun 2021 14:28:36 +0300 Subject: [PATCH 1/7] [SYCL] Lower queue::wait() to piQueueFinish when possible This patch changes the logic of queue::wait() from waiting on each individual event in order of submission of their tasks to checking if each event's task has been enqueued, waiting for those that haven't been and calling piQueueFinish to take care of the rest. Notable exceptions to this new behaviour are host queues, queues that emulate out-of-order execution by creating multiple queues underneath and host task events, which are run on host regardless of the queue they are bound to. --- sycl/source/detail/event_impl.cpp | 9 +++- sycl/source/detail/event_impl.hpp | 6 +++ sycl/source/detail/queue_impl.cpp | 65 +++++++++++++++++++++-------- sycl/unittests/queue/EventClear.cpp | 15 +++++++ 4 files changed, 75 insertions(+), 20 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 2164fcd45358f..c64dfa71b1066 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -200,8 +200,7 @@ void event_impl::wait( waitInternal(); else if (MCommand) detail::Scheduler::getInstance().waitForEvent(Self); - if (MCommand && !SYCLConfig::get()) - detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self)); + cleanupCommand(std::move(Self)); #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); @@ -222,6 +221,12 @@ void event_impl::wait_and_throw( Cmd->getQueue()->throw_asynchronous(); } +void event_impl::cleanupCommand( + std::shared_ptr Self) const { + if (MCommand && !SYCLConfig::get()) + detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self)); +} + template <> cl_ulong event_impl::get_profiling_info() const { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f32e4111e782a..d2b31fcdfbe69 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -74,6 +74,12 @@ class event_impl { /// \param Self is a pointer to this event. void wait_and_throw(std::shared_ptr Self); + /// Clean up the command associated with the event. Assumes that the task this + /// event is associated with has been completed. + /// + /// \param Self is a pointer to this event. + void cleanupCommand(std::shared_ptr Self) const; + /// Queries this event for profiling information. /// /// If the requested info is not available when this member function is diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 3de5a8e35572b..0e63afdc4dfe4 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -61,7 +61,9 @@ event queue_impl::memset(const std::shared_ptr &Self, return event(); event ResEvent = prepareUSMEvent(Self, NativeEvent); - addSharedEvent(ResEvent); + // Track only if we won't be able to handle it with piQueueFinish. + if (!MSupportOOO) + addSharedEvent(ResEvent); return ResEvent; } @@ -76,7 +78,9 @@ event queue_impl::memcpy(const std::shared_ptr &Self, return event(); event ResEvent = prepareUSMEvent(Self, NativeEvent); - addSharedEvent(ResEvent); + // Track only if we won't be able to handle it with piQueueFinish. + if (!MSupportOOO) + addSharedEvent(ResEvent); return ResEvent; } @@ -92,7 +96,9 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, return event(); event ResEvent = prepareUSMEvent(Self, NativeEvent); - addSharedEvent(ResEvent); + // Track only if we won't be able to handle it with piQueueFinish. + if (!MSupportOOO) + addSharedEvent(ResEvent); return ResEvent; } @@ -101,8 +107,10 @@ void queue_impl::addEvent(const event &Event) { Command *Cmd = (Command *)(Eimpl->getCommand()); if (!Cmd) { // if there is no command on the event, we cannot track it with MEventsWeak - // as that will leave it with no owner. Track in MEventsShared - addSharedEvent(Event); + // as that will leave it with no owner. Track in MEventsShared only if we're + // unable to call piQueueFinish during wait. + if (is_host() || !MSupportOOO) + addSharedEvent(Event); } else { std::weak_ptr EventWeakPtr{Eimpl}; std::lock_guard Lock{MMutex}; @@ -234,21 +242,42 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif - std::vector> Events; - std::vector USMEvents; + std::vector> WeakEvents; + std::vector SharedEvents; { - std::lock_guard Lock(MMutex); - Events.swap(MEventsWeak); - USMEvents.swap(MEventsShared); + std::lock_guard Lock(MMutex); + WeakEvents.swap(MEventsWeak); + SharedEvents.swap(MEventsShared); + } + // If the queue is either a host one or does not support OOO (and we use + // multiple in-order queues as a result of that), wait for each event + // directly. Otherwise, only wait for unenqueued or host task events, starting + // from the latest submitted task in order to minimize total amount of calls, + // then handle the rest with piQueueFinish. + for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); + EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtrIt->lock()) { + // A nullptr PI event indicates that piQueueFinish will not cover it, + // either because it's a host task event or an unenqueued one. + if (is_host() || !MSupportOOO || + nullptr == EventImplSharedPtr->getHandleRef()) { + EventImplSharedPtr->wait(EventImplSharedPtr); + } + } + } + if (!is_host() && MSupportOOO) { + const detail::plugin &Plugin = getPlugin(); + Plugin.call(getHandleRef()); + for (std::weak_ptr &EventImplWeakPtr : WeakEvents) + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtr.lock()) + EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); + assert(SharedEvents.empty()); + } else { + for (event &Event : SharedEvents) + Event.wait(); } - - for (std::weak_ptr &EventImplWeakPtr : Events) - if (std::shared_ptr EventImplPtr = EventImplWeakPtr.lock()) - EventImplPtr->wait(EventImplPtr); - - for (event &Event : USMEvents) - Event.wait(); - #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); #endif diff --git a/sycl/unittests/queue/EventClear.cpp b/sycl/unittests/queue/EventClear.cpp index d5b42a187b292..24bf30fa58b2d 100644 --- a/sycl/unittests/queue/EventClear.cpp +++ b/sycl/unittests/queue/EventClear.cpp @@ -25,6 +25,19 @@ std::unique_ptr TestContext; const int ExpectedEventThreshold = 128; +pi_result redefinedQueueCreate(pi_context context, pi_device device, + pi_queue_properties properties, + pi_queue *queue) { + // Use in-order queues to force storing events for calling wait on them, + // rather than calling piQueueFinish. + if (properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + return PI_INVALID_QUEUE_PROPERTIES; + } + return PI_SUCCESS; +} + +pi_result redefinedQueueRelease(pi_queue Queue) { return PI_SUCCESS; } + pi_result redefinedUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, @@ -83,6 +96,8 @@ bool preparePiMock(platform &Plt) { } unittest::PiMock Mock{Plt}; + Mock.redefine(redefinedQueueCreate); + Mock.redefine(redefinedQueueRelease); Mock.redefine( redefinedUSMEnqueueMemset); Mock.redefine(redefinedEventsWait); From eaabc1f0ab2cca15955883fc494a15251a1ae7d3 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 6 Jul 2021 14:27:50 +0300 Subject: [PATCH 2/7] Further reduce the number of wait calls for blocked commands Any blocked command was being added to MPreparedHostDepsEvents which led to waiting for the dependency event before enqueueing the dependant command rather than passing the dependency to PI. --- sycl/source/detail/scheduler/commands.cpp | 24 +++++++++++++++++++---- sycl/source/detail/scheduler/commands.hpp | 10 ++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index e0f28f1b9c2de..8e1b17f5ff920 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -486,10 +486,14 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) { const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); // 1. Async work is not supported for host device. - // 2. The event handle can be null in case of, for example, alloca command, - // which is currently synchronous, so don't generate OpenCL event. - // Though, this event isn't host one as it's context isn't host one. - if (DepEvent->is_host() || DepEvent->getHandleRef() == nullptr) { + // 2. Some types of commands do not produce PI events after they are enqueued + // (e.g. alloca). Note that we can't check the pi event to make that + // distinction since the command might still be unenqueued at this point. + bool PiEventExpected = !DepEvent->is_host(); + if (auto *DepCmd = static_cast(DepEvent->getCommand())) + PiEventExpected &= DepCmd->producesPiEvent(); + + if (!PiEventExpected) { // call to waitInternal() is in waitForPreparedHostEvents() as it's called // from enqueue process functions MPreparedHostDepsEvents.push_back(DepEvent); @@ -520,6 +524,8 @@ const ContextImplPtr &Command::getWorkerContext() const { const QueueImplPtr &Command::getWorkerQueue() const { return MQueue; } +bool Command::producesPiEvent() const { return true; } + Command *Command::addDep(DepDesc NewDep) { Command *ConnectionCmd = nullptr; @@ -731,6 +737,8 @@ void AllocaCommandBase::emitInstrumentationData() { #endif } +bool AllocaCommandBase::producesPiEvent() const { return false; } + AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData, AllocaCommandBase *LinkedAllocaCmd) @@ -998,6 +1006,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { } } +bool ReleaseCommand::producesPiEvent() const { return false; } + MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode) @@ -1392,6 +1402,8 @@ void EmptyCommand::printDot(std::ostream &Stream) const { } } +bool EmptyCommand::producesPiEvent() const { return false; } + void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; @@ -2193,6 +2205,10 @@ cl_int ExecCGCommand::enqueueImp() { return PI_INVALID_OPERATION; } +bool ExecCGCommand::producesPiEvent() const { + return MCommandGroup->getType() != CG::CGTYPE::CODEPLAY_HOST_TASK; +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 9f85146eaf347..d976577821ebb 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -189,6 +189,9 @@ class Command { /// for memory copy commands. virtual const QueueImplPtr &getWorkerQueue() const; + /// Returns true iff the command produces a PI event on non-host devices. + virtual bool producesPiEvent() const; + protected: EventImplPtr MEvent; QueueImplPtr MQueue; @@ -306,6 +309,8 @@ class EmptyCommand : public Command { void emitInstrumentationData() override; + bool producesPiEvent() const final; + private: cl_int enqueueImp() final; @@ -323,6 +328,7 @@ class ReleaseCommand : public Command { void printDot(std::ostream &Stream) const final; void emitInstrumentationData() override; + bool producesPiEvent() const final; private: cl_int enqueueImp() final; @@ -347,6 +353,8 @@ class AllocaCommandBase : public Command { void emitInstrumentationData() override; + bool producesPiEvent() const final; + void *MMemAllocation = nullptr; /// Alloca command linked with current command. @@ -518,6 +526,8 @@ class ExecCGCommand : public Command { MCommandGroup.release(); } + bool producesPiEvent() const final; + private: cl_int enqueueImp() final; From 0ab7cd2b5d7281aba2ef1b353fc583dc77564b9a Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 6 Jul 2021 17:23:37 +0300 Subject: [PATCH 3/7] Add unit tests --- sycl/unittests/queue/CMakeLists.txt | 1 + sycl/unittests/queue/Wait.cpp | 210 ++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 sycl/unittests/queue/Wait.cpp diff --git a/sycl/unittests/queue/CMakeLists.txt b/sycl/unittests/queue/CMakeLists.txt index 732e5de87e87d..9aa147ed10e0c 100644 --- a/sycl/unittests/queue/CMakeLists.txt +++ b/sycl/unittests/queue/CMakeLists.txt @@ -1,3 +1,4 @@ add_sycl_unittest(QueueTests OBJECT EventClear.cpp + Wait.cpp ) diff --git a/sycl/unittests/queue/Wait.cpp b/sycl/unittests/queue/Wait.cpp new file mode 100644 index 0000000000000..066ab51f22066 --- /dev/null +++ b/sycl/unittests/queue/Wait.cpp @@ -0,0 +1,210 @@ +//==--------------------- Wait.cpp --- queue unit tests --------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +namespace { +using namespace cl::sycl; + +struct TestCtx { + bool SupportOOO = true; + bool PiQueueFinishCalled = false; + int NEventsWaitedFor = 0; + int EventReferenceCount = 0; +}; +static TestCtx TestContext; + +pi_result redefinedQueueCreate(pi_context context, pi_device device, + pi_queue_properties properties, + pi_queue *queue) { + if (!TestContext.SupportOOO && + properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + return PI_INVALID_QUEUE_PROPERTIES; + } + return PI_SUCCESS; +} + +pi_result redefinedQueueRelease(pi_queue Queue) { return PI_SUCCESS; } + +pi_result redefinedUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, + size_t Count, + pi_uint32 Num_events_in_waitlist, + const pi_event *Events_waitlist, + pi_event *Event) { + // Provide a dummy non-nullptr value + TestContext.EventReferenceCount = 1; + *Event = reinterpret_cast(1); + return PI_SUCCESS; +} +pi_result redefinedEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, + const void *Pattern, size_t PatternSize, + size_t Offset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + // Provide a dummy non-nullptr value + TestContext.EventReferenceCount = 1; + *Event = reinterpret_cast(1); + return PI_SUCCESS; +} + +pi_result redefinedQueueFinish(pi_queue Queue) { + TestContext.PiQueueFinishCalled = true; + return PI_SUCCESS; +} +pi_result redefinedEventsWait(pi_uint32 num_events, + const pi_event *event_list) { + ++TestContext.NEventsWaitedFor; + return PI_SUCCESS; +} + +pi_result redefinedEventGetInfo(pi_event event, pi_event_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { +#if 0 + EXPECT_EQ(param_name, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) + << "Unexpected event info requested"; + // Report first half of events as complete. + // Report second half of events as running. + // This is important, because removal algorithm assumes that + // events are likely to be removed oldest first, and stops removing + // at the first non-completed event. + static int Counter = 0; + auto *Result = reinterpret_cast(param_value); + *Result = (Counter < (ExpectedEventThreshold / 2)) ? PI_EVENT_COMPLETE + : PI_EVENT_RUNNING; + Counter++; +#endif + return PI_SUCCESS; +} + +pi_result redefinedEventRetain(pi_event event) { + ++TestContext.EventReferenceCount; + return PI_SUCCESS; +} + +pi_result redefinedEventRelease(pi_event event) { + --TestContext.EventReferenceCount; + return PI_SUCCESS; +} + +bool preparePiMock(platform &Plt) { + if (Plt.is_host()) { + std::cout << "Not run on host - no PI events created in that case" + << std::endl; + return false; + } + // TODO: Skip tests for CUDA temporarily + if (detail::getSyclObjImpl(Plt)->getPlugin().getBackend() == backend::cuda) { + std::cout << "Not run on CUDA - usm is not supported for CUDA backend yet" + << std::endl; + return false; + } + + unittest::PiMock Mock{Plt}; + Mock.redefine(redefinedQueueCreate); + Mock.redefine(redefinedQueueRelease); + Mock.redefine(redefinedQueueFinish); + Mock.redefine( + redefinedUSMEnqueueMemset); + Mock.redefine(redefinedEventsWait); + Mock.redefine( + redefinedEnqueueMemBufferFill); + Mock.redefine(redefinedEventGetInfo); + Mock.redefine(redefinedEventRetain); + Mock.redefine(redefinedEventRelease); + return true; +} + +TEST(QueueWait, Finish) { + platform Plt{default_selector()}; + if (!preparePiMock(Plt)) + return; + context Ctx{Plt}; + queue Q{Ctx, default_selector()}; + + unsigned char *HostAlloc = (unsigned char *)malloc_host(1, Ctx); + + // USM API event + TestContext = {}; + Q.memset(HostAlloc, 42, 1); + // No need to keep the event since we'll use piQueueFinish. + ASSERT_EQ(TestContext.EventReferenceCount, 0); + Q.wait(); + ASSERT_EQ(TestContext.NEventsWaitedFor, 0); + ASSERT_TRUE(TestContext.PiQueueFinishCalled); + + // Events with temporary ownership + { + TestContext = {}; + buffer buf{range<1>(1)}; + Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.fill(acc, 42); + }); + Q.wait(); + // Still owned by the execution graph + ASSERT_EQ(TestContext.EventReferenceCount, 1); + ASSERT_EQ(TestContext.NEventsWaitedFor, 0); + ASSERT_TRUE(TestContext.PiQueueFinishCalled); + } + + // Blocked commands + TestContext = {}; + buffer buf{range<1>(1)}; + event HostTaskEvent = Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.host_task([=]() { (void)acc; }); + }); + std::shared_ptr HostTaskEventImpl = + detail::getSyclObjImpl(HostTaskEvent); + auto *Cmd = static_cast(HostTaskEventImpl->getCommand()); + detail::Command *EmptyTask = *Cmd->MUsers.begin(); + ASSERT_EQ(EmptyTask->getType(), detail::Command::EMPTY_TASK); + HostTaskEvent.wait(); + // Use the empty task produced by the host task to block the next commands + while (EmptyTask->MEnqueueStatus != + detail::EnqueueResultT::SyclEnqueueSuccess) + continue; + EmptyTask->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueBlocked; + Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.fill(acc, 42); + }); + Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.fill(acc, 42); + }); + // Unblock the empty task to allow the submitted events to complete once + // enqueued. + EmptyTask->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueSuccess; + Q.wait(); + // Only a single event (the last one) should be waited for here. + ASSERT_EQ(TestContext.NEventsWaitedFor, 1); + ASSERT_TRUE(TestContext.PiQueueFinishCalled); + + // Test behaviour for emulating an OOO queue with multiple in-order ones. + TestContext = {}; + TestContext.SupportOOO = false; + Q = {Ctx, default_selector()}; + Q.memset(HostAlloc, 42, 1); + // The event is kept alive in this case to call wait. + ASSERT_EQ(TestContext.EventReferenceCount, 1); + Q.wait(); + ASSERT_EQ(TestContext.EventReferenceCount, 0); + ASSERT_EQ(TestContext.NEventsWaitedFor, 1); + ASSERT_FALSE(TestContext.PiQueueFinishCalled); +} + +} // namespace From 70eb1aacb8c34c9c8444d64e48f1c4da4c941676 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 6 Jul 2021 17:39:13 +0300 Subject: [PATCH 4/7] Remove leftover code --- sycl/unittests/queue/Wait.cpp | 82 +++++++++++++++-------------------- 1 file changed, 35 insertions(+), 47 deletions(-) diff --git a/sycl/unittests/queue/Wait.cpp b/sycl/unittests/queue/Wait.cpp index 066ab51f22066..678ca99b0b09d 100644 --- a/sycl/unittests/queue/Wait.cpp +++ b/sycl/unittests/queue/Wait.cpp @@ -72,20 +72,6 @@ pi_result redefinedEventsWait(pi_uint32 num_events, pi_result redefinedEventGetInfo(pi_event event, pi_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { -#if 0 - EXPECT_EQ(param_name, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) - << "Unexpected event info requested"; - // Report first half of events as complete. - // Report second half of events as running. - // This is important, because removal algorithm assumes that - // events are likely to be removed oldest first, and stops removing - // at the first non-completed event. - static int Counter = 0; - auto *Result = reinterpret_cast(param_value); - *Result = (Counter < (ExpectedEventThreshold / 2)) ? PI_EVENT_COMPLETE - : PI_EVENT_RUNNING; - Counter++; -#endif return PI_SUCCESS; } @@ -127,7 +113,7 @@ bool preparePiMock(platform &Plt) { return true; } -TEST(QueueWait, Finish) { +TEST(QueueWait, QueueWaitTest) { platform Plt{default_selector()}; if (!preparePiMock(Plt)) return; @@ -161,38 +147,40 @@ TEST(QueueWait, Finish) { } // Blocked commands - TestContext = {}; - buffer buf{range<1>(1)}; - event HostTaskEvent = Q.submit([&](handler &Cgh) { - auto acc = buf.template get_access(Cgh); - Cgh.host_task([=]() { (void)acc; }); - }); - std::shared_ptr HostTaskEventImpl = - detail::getSyclObjImpl(HostTaskEvent); - auto *Cmd = static_cast(HostTaskEventImpl->getCommand()); - detail::Command *EmptyTask = *Cmd->MUsers.begin(); - ASSERT_EQ(EmptyTask->getType(), detail::Command::EMPTY_TASK); - HostTaskEvent.wait(); - // Use the empty task produced by the host task to block the next commands - while (EmptyTask->MEnqueueStatus != - detail::EnqueueResultT::SyclEnqueueSuccess) - continue; - EmptyTask->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueBlocked; - Q.submit([&](handler &Cgh) { - auto acc = buf.template get_access(Cgh); - Cgh.fill(acc, 42); - }); - Q.submit([&](handler &Cgh) { - auto acc = buf.template get_access(Cgh); - Cgh.fill(acc, 42); - }); - // Unblock the empty task to allow the submitted events to complete once - // enqueued. - EmptyTask->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueSuccess; - Q.wait(); - // Only a single event (the last one) should be waited for here. - ASSERT_EQ(TestContext.NEventsWaitedFor, 1); - ASSERT_TRUE(TestContext.PiQueueFinishCalled); + { + TestContext = {}; + buffer buf{range<1>(1)}; + event HostTaskEvent = Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.host_task([=]() { (void)acc; }); + }); + std::shared_ptr HostTaskEventImpl = + detail::getSyclObjImpl(HostTaskEvent); + auto *Cmd = static_cast(HostTaskEventImpl->getCommand()); + detail::Command *EmptyTask = *Cmd->MUsers.begin(); + ASSERT_EQ(EmptyTask->getType(), detail::Command::EMPTY_TASK); + HostTaskEvent.wait(); + // Use the empty task produced by the host task to block the next commands + while (EmptyTask->MEnqueueStatus != + detail::EnqueueResultT::SyclEnqueueSuccess) + continue; + EmptyTask->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueBlocked; + Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.fill(acc, 42); + }); + Q.submit([&](handler &Cgh) { + auto acc = buf.template get_access(Cgh); + Cgh.fill(acc, 42); + }); + // Unblock the empty task to allow the submitted events to complete once + // enqueued. + EmptyTask->MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueSuccess; + Q.wait(); + // Only a single event (the last one) should be waited for here. + ASSERT_EQ(TestContext.NEventsWaitedFor, 1); + ASSERT_TRUE(TestContext.PiQueueFinishCalled); + } // Test behaviour for emulating an OOO queue with multiple in-order ones. TestContext = {}; From 97d76dca52b6acb203a8ba76c2b4c3901c4cd8ad Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 6 Jul 2021 18:43:40 +0300 Subject: [PATCH 5/7] Apply comments --- sycl/source/detail/queue_impl.cpp | 10 ++++++---- sycl/unittests/queue/Wait.cpp | 6 ------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 0e63afdc4dfe4..34ef35c56d671 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -122,6 +122,7 @@ void queue_impl::addEvent(const event &Event) { /// but some events have no other owner. In this case, /// addSharedEvent will have the queue track the events via a shared pointer. void queue_impl::addSharedEvent(const event &Event) { + assert(is_host() || !MSupportOOO); std::lock_guard Lock(MMutex); // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not @@ -254,26 +255,27 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { // directly. Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. + bool SupportsPiFinish = !is_host() && MSupportOOO; for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtrIt->lock()) { // A nullptr PI event indicates that piQueueFinish will not cover it, // either because it's a host task event or an unenqueued one. - if (is_host() || !MSupportOOO || - nullptr == EventImplSharedPtr->getHandleRef()) { + if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) { EventImplSharedPtr->wait(EventImplSharedPtr); } } } - if (!is_host() && MSupportOOO) { + if (SupportsPiFinish) { const detail::plugin &Plugin = getPlugin(); Plugin.call(getHandleRef()); for (std::weak_ptr &EventImplWeakPtr : WeakEvents) if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtr.lock()) EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); - assert(SharedEvents.empty()); + assert(SharedEvents.empty() && "Queues that support calling piQueueFinish " + "shouldn't have shared events"); } else { for (event &Event : SharedEvents) Event.wait(); diff --git a/sycl/unittests/queue/Wait.cpp b/sycl/unittests/queue/Wait.cpp index 678ca99b0b09d..860563f78dfcb 100644 --- a/sycl/unittests/queue/Wait.cpp +++ b/sycl/unittests/queue/Wait.cpp @@ -91,12 +91,6 @@ bool preparePiMock(platform &Plt) { << std::endl; return false; } - // TODO: Skip tests for CUDA temporarily - if (detail::getSyclObjImpl(Plt)->getPlugin().getBackend() == backend::cuda) { - std::cout << "Not run on CUDA - usm is not supported for CUDA backend yet" - << std::endl; - return false; - } unittest::PiMock Mock{Plt}; Mock.redefine(redefinedQueueCreate); From 88ac4a6ae89ab23092abf9cc8ee1fffe6fdd4948 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 7 Jul 2021 13:08:46 +0300 Subject: [PATCH 6/7] Add a workaround to avoid waiting for events in L0 plugin during submit Level Zero plugin has recently started to call wait on events during piEventRelease. With the queue::wait change we started to release some events during submit, prior to their completion. This introduces implicit wait on every submit that would produce such an event, which also interferes with level zero batching. Work around this by storing such events for level zero only to release them later. --- sycl/source/detail/queue_impl.cpp | 29 ++++++++++++++++++++++++----- sycl/unittests/queue/Wait.cpp | 8 +++++++- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 34ef35c56d671..c92cfa1676918 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -62,7 +62,9 @@ event queue_impl::memset(const std::shared_ptr &Self, event ResEvent = prepareUSMEvent(Self, NativeEvent); // Track only if we won't be able to handle it with piQueueFinish. - if (!MSupportOOO) + // FIXME these events are stored for level zero until as a workaround, remove + // once piEventRelease no longer calls wait on the event in the plugin. + if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero) addSharedEvent(ResEvent); return ResEvent; } @@ -79,7 +81,9 @@ event queue_impl::memcpy(const std::shared_ptr &Self, event ResEvent = prepareUSMEvent(Self, NativeEvent); // Track only if we won't be able to handle it with piQueueFinish. - if (!MSupportOOO) + // FIXME these events are stored for level zero until as a workaround, remove + // once piEventRelease no longer calls wait on the event in the plugin. + if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero) addSharedEvent(ResEvent); return ResEvent; } @@ -97,7 +101,9 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, event ResEvent = prepareUSMEvent(Self, NativeEvent); // Track only if we won't be able to handle it with piQueueFinish. - if (!MSupportOOO) + // FIXME these events are stored for level zero until as a workaround, remove + // once piEventRelease no longer calls wait on the event in the plugin. + if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero) addSharedEvent(ResEvent); return ResEvent; } @@ -109,7 +115,11 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're // unable to call piQueueFinish during wait. - if (is_host() || !MSupportOOO) + // FIXME these events are stored for level zero until as a workaround, + // remove once piEventRelease no longer calls wait on the event in the + // plugin. + if (is_host() || !MSupportOOO || + getPlugin().getBackend() == backend::level_zero) addSharedEvent(Event); } else { std::weak_ptr EventWeakPtr{Eimpl}; @@ -122,7 +132,10 @@ void queue_impl::addEvent(const event &Event) { /// but some events have no other owner. In this case, /// addSharedEvent will have the queue track the events via a shared pointer. void queue_impl::addSharedEvent(const event &Event) { - assert(is_host() || !MSupportOOO); + // FIXME The assertion should be corrected once the Level Zero workaround is + // removed. + assert(is_host() || !MSupportOOO || + getPlugin().getBackend() == backend::level_zero); std::lock_guard Lock(MMutex); // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not @@ -274,6 +287,12 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtr.lock()) EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); + // FIXME these events are stored for level zero until as a workaround, + // remove once piEventRelease no longer calls wait on the event in the + // plugin. + if (Plugin.getBackend() == backend::level_zero) { + SharedEvents.clear(); + } assert(SharedEvents.empty() && "Queues that support calling piQueueFinish " "shouldn't have shared events"); } else { diff --git a/sycl/unittests/queue/Wait.cpp b/sycl/unittests/queue/Wait.cpp index 860563f78dfcb..417b2c5078c94 100644 --- a/sycl/unittests/queue/Wait.cpp +++ b/sycl/unittests/queue/Wait.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -120,7 +121,12 @@ TEST(QueueWait, QueueWaitTest) { TestContext = {}; Q.memset(HostAlloc, 42, 1); // No need to keep the event since we'll use piQueueFinish. - ASSERT_EQ(TestContext.EventReferenceCount, 0); + // FIXME ... unless the plugin is Level Zero, where there's a workaround that + // releases events later. + if (detail::getSyclObjImpl(Plt)->getPlugin().getBackend() != + backend::level_zero) { + ASSERT_EQ(TestContext.EventReferenceCount, 0); + } Q.wait(); ASSERT_EQ(TestContext.NEventsWaitedFor, 0); ASSERT_TRUE(TestContext.PiQueueFinishCalled); From e534ae210930d6a86dbcbe5dea2043c1f9386a51 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 7 Jul 2021 16:00:28 +0300 Subject: [PATCH 7/7] Adjust level zero tests to the changes --- .../on-device/plugins/level_zero_batch_event_status.cpp | 4 ++-- sycl/test/on-device/plugins/level_zero_batch_test.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp b/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp index d98140ac4b1ff..167df039b4474 100644 --- a/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp +++ b/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp @@ -25,10 +25,10 @@ // CHECK: ZE ---> zeCommandListClose // CHECK: ZE ---> zeCommandQueueExecuteCommandLists // CHECK: ---> piEventGetInfo -// CHECK-NOT: piEventsWait +// CHECK-NOT: piQueueFinish // CHECK: ---> piEnqueueKernelLaunch // CHECK: ZE ---> zeCommandListAppendLaunchKernel -// CHECK: ---> piEventsWait +// CHECK: ---> piQueueFinish // Look for close and Execute after piEventsWait // CHECK: ZE ---> zeCommandListClose // CHECK: ZE ---> zeCommandQueueExecuteCommandLists diff --git a/sycl/test/on-device/plugins/level_zero_batch_test.cpp b/sycl/test/on-device/plugins/level_zero_batch_test.cpp index f834e2bfab121..43426e6fb6dd1 100644 --- a/sycl/test/on-device/plugins/level_zero_batch_test.cpp +++ b/sycl/test/on-device/plugins/level_zero_batch_test.cpp @@ -86,7 +86,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piEventsWait( +// CKALL: ---> piQueueFinish( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( @@ -142,7 +142,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piEventsWait( +// CKALL: ---> piQueueFinish( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( @@ -198,7 +198,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piEventsWait( +// CKALL: ---> piQueueFinish( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose(