From 8a377c77b983421f759f361802f46abdb6006095 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 20 Oct 2022 20:19:26 +0000 Subject: [PATCH 01/33] [SYCL] Reuse discarded L0 events in scope of command list Patch implements reset and reuse of Level Zero events in scope of cmd list. Same level zero events are reused inside different command lists in scope of the queue, we need new event only to switch between command lists. The scheme in scope of command list looks like this: Operation1 = zeCommandListAppendMemoryCopy (signal event1) zeCommandListAppendBarrier(wait for event1) zeCommandListAppendEventReset(event1) Operation2 = zeCommandListAppendMemoryCopy (signal event2) zeCommandListAppendBarrier(wait for event2) zeCommandListAppendEventReset(event2) Operation3 = zeCommandListAppendMemoryCopy (signal event1) If we switch to a different command list then we signal new event and insert a barrier into new command list waiting for that event. CmdList1: Operation1 = zeCommandListAppendMemoryCopy (signal event1) zeCommandListAppendBarrier(wait for event1) zeCommandListAppendEventReset(event1) zeCommandListAppendSignalEvent(NewEvent) CmdList2: zeCommandListAppendBarrier(wait for NewEvent) --- sycl/plugins/level_zero/pi_level_zero.cpp | 545 ++++++++++++++++------ sycl/plugins/level_zero/pi_level_zero.hpp | 70 ++- 2 files changed, 477 insertions(+), 138 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index def60bc219aca..605b120662b6a 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -96,6 +96,16 @@ static const bool DisableEventsCaching = [] { return std::stoi(DisableEventsCachingFlag) != 0; }(); +// This is an experimental option that allows reset and reuse of uncompleted +// events in the in-order queue with discard_events property. 
+static const bool ReuseDiscardedEvents = [] { + const char *ReuseDiscardedEventsFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); + if (!ReuseDiscardedEventsFlag) + return true; + return std::stoi(ReuseDiscardedEventsFlag) > 0; +}(); + // This class encapsulates actions taken along with a call to Level Zero API. class ZeCall { private: @@ -513,27 +523,29 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool, return PI_SUCCESS; } -pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) { - std::shared_lock EventLock(Event->Mutex, std::defer_lock); - std::scoped_lock> LockAll( - ZeEventPoolCacheMutex, EventLock); - if (!Event->ZeEventPool) { +pi_result _pi_context::decrementUnreleasedEventsInPool( + pi_queue Queue, ze_event_pool_handle_t ZeEventPool, bool HostVisible) { + std::lock_guard LockAll(ZeEventPoolCacheMutex); + if (!ZeEventPool) { // This must be an interop event created on a users's pool. // Do nothing. return PI_SUCCESS; } + bool ProfilingEnabled = + !Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0; + std::list *ZePoolCache = - getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled()); + getZeEventPoolCache(HostVisible, ProfilingEnabled); // Put the empty pool to the cache of the pools. 
- if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) + if (NumEventsUnreleasedInEventPool[ZeEventPool] == 0) die("Invalid event release: event pool doesn't have unreleased events"); - if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) { - if (ZePoolCache->front() != Event->ZeEventPool) { - ZePoolCache->push_back(Event->ZeEventPool); + if (--NumEventsUnreleasedInEventPool[ZeEventPool] == 0) { + if (ZePoolCache->front() != ZeEventPool) { + ZePoolCache->push_back(ZeEventPool); } - NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool; + NumEventsAvailableInEventPool[ZeEventPool] = MaxNumEventsPerPool; } return PI_SUCCESS; @@ -649,6 +661,19 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, if (!(condition)) \ return error; +pi_result +_pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { + if (DiscardedLastCommandEvent.Handle && CommandList != CommandListMap.end()) { + ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, nullptr, 1, + &(DiscardedLastCommandEvent.Handle))); + ZE_CALL(zeCommandListAppendEventReset, + (CommandList->first, DiscardedLastCommandEvent.Handle)); + addEventToCache(DiscardedLastCommandEvent); + DiscardedLastCommandEvent = {nullptr, nullptr, false}; + } + return PI_SUCCESS; +} + // This helper function creates a pi_event and associate a pi_queue. // Note that the caller of this function must have acquired lock on the Queue // that is passed in. @@ -664,6 +689,10 @@ inline static pi_result createEventAndAssociateQueue( pi_queue Queue, pi_event *Event, pi_command_type CommandType, pi_command_list_ptr_t CommandList, bool IsInternal = false, bool ForceHostVisible = false) { + if (Queue->isInOrderQueue() && Queue->isDiscardEvents()) + // We are going to get/create event for the next command so it is time to + // reset the last disarded event (if any) and put it to the cache. 
+ Queue->resetLastDiscardedEvent(CommandList); if (!ForceHostVisible) ForceHostVisible = DeviceEventsSetting == AllHostVisible; @@ -699,6 +728,107 @@ inline static pi_result createEventAndAssociateQueue( return PI_SUCCESS; } +pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { + pi_event SpecialEvent; + PI_CALL(createEventAndAssociateQueue( + this, &SpecialEvent, PI_COMMAND_TYPE_USER, CommandList, + /* IsDiscarded */ false, /* ForceHostVisible */ false)); + + // We want a barrier in the beginning of a next command list waiting for this + // special event. + ActiveBarriers.push_back(SpecialEvent); + + // We don't need additional dependency through LastCommandEvent. + LastCommandEvent = nullptr; + + ZE_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, SpecialEvent->ZeEvent)); + return PI_SUCCESS; +} + +pi_result _pi_queue::getEventFromCache(bool HostVisible, + ze_event_handle_t *Event) { + auto Cache = getEventCache(HostVisible); + + if (Cache->empty()) { + *Event = nullptr; + return PI_SUCCESS; + } + + auto It = Cache->begin(); + *Event = (*It).Handle; + Cache->erase(It); + + return PI_SUCCESS; +} + +void _pi_queue::addEventToCache(ze_event Event) { + auto Cache = getEventCache(Event.HostVisible); + Cache->emplace_back(Event); +} + +// This function is used to get or create discarded event. When event is +// discarded we don't create pi_event object and reuse events from the queue +// cache if any. 
+inline static pi_result +getOrCreateDiscardedEvent(pi_queue Queue, ze_event_handle_t *ZeEvent, + pi_command_list_ptr_t CommandList, + _pi_ze_event_list_t WaitList, + bool HostVisible = false) { + if (!HostVisible) + HostVisible = DeviceEventsSetting == AllHostVisible; + bool ProfilingEnabled = + !Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0; + + Queue->getEventFromCache(HostVisible, ZeEvent); + + ze_event_pool_handle_t ZeEventPool = {}; + if (*ZeEvent == nullptr) { + size_t Index = 0; + + if (auto Res = Queue->Context->getFreeSlotInExistingOrNewPool( + ZeEventPool, Index, HostVisible, ProfilingEnabled)) + return Res; + + ZeStruct ZeEventDesc; + ZeEventDesc.index = Index; + ZeEventDesc.wait = 0; + + if (HostVisible) { + ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + } else { + // + // Set the scope to "device" for every event. This is sufficient for + // global device access and peer device access. If needed to be seen on + // the host we are doing special handling, see EventsScope options. + // + // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be + // used in some circumstances. + // + ZeEventDesc.signal = 0; + } + + ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, ZeEvent)); + } + + // We don't have pi_event object for discarded event, so store waitlist in the + // command list. Events from the waitlist will be released after command list + // is finished. + CommandList->second.WaitLists.push_back(WaitList); + + // We've got event for the next command above, it is time to reset the last + // disarded event and put it to the cache. + Queue->resetLastDiscardedEvent(CommandList); + Queue->DiscardedLastCommandEvent = {*ZeEvent, ZeEventPool, HostVisible}; + Queue->LastCommandEvent = nullptr; + + // We need to keep track of number of discarded events for batching purposes. + // Otherwise discarded events won't be taken into account which will affect + // batching. 
+ CommandList->second.NumDiscardedEvents++; + return PI_SUCCESS; +} + pi_result _pi_device::initialize(int SubSubDeviceOrdinal, int SubSubDeviceIndex) { uint32_t numQueueGroups = 0; @@ -1011,6 +1141,33 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, std::vector &EventListToCleanup) { bool UseCopyEngine = CommandList->second.isCopy(this); + if (isInOrderQueue() && isDiscardEvents()) { + // If there were discarded events in the command list then we have to + // release kernels associated with them. + for (auto Kernel : CommandList->second.Kernels) { + piKernelRelease(Kernel); + } + CommandList->second.Kernels.clear(); + + // If there were discarded events in the command list then we have to + // release events from wait lists associated with them. + for (auto WaitList : CommandList->second.WaitLists) { + std::list EventsToBeReleased; + WaitList.collectEventsForReleaseAndDestroyPiZeEventList( + EventsToBeReleased); + + // Event may be in the wait list of more than one event. But we have to + // cleanup it only once, that's why use unordered_set to make it happen. + std::unordered_set Events; + std::copy(EventsToBeReleased.begin(), EventsToBeReleased.end(), + std::inserter(Events, Events.begin())); + + for (auto Event : Events) + EventListToCleanup.push_back(Event); + } + CommandList->second.WaitLists.clear(); + } + // Immediate commandlists do not have an associated fence. if (CommandList->second.ZeFence != nullptr) { // Fence had been signalled meaning the associated command-list completed. 
@@ -1019,6 +1176,7 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, ZE_CALL(zeFenceReset, (CommandList->second.ZeFence)); ZE_CALL(zeCommandListReset, (CommandList->first)); CommandList->second.ZeFenceInUse = false; + CommandList->second.NumDiscardedEvents = 0; } auto &EventList = CommandList->second.EventList; @@ -1305,6 +1463,7 @@ pi_result _pi_context::getAvailableCommandList( // Immediate commandlists have been pre-allocated and are always available. if (Queue->Device->useImmediateCommandLists()) { CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); + if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) return Res; return PI_SUCCESS; @@ -1414,6 +1573,8 @@ pi_result _pi_context::getAvailableCommandList( true /* QueueLocked */); CommandList = it; CommandList->second.ZeFenceInUse = true; + if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) + return Res; return PI_SUCCESS; } } @@ -1554,13 +1715,17 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // traces incurs much different timings than real execution // ansyway, and many regression tests use it. // - bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr; + bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr && + this->DiscardedLastCommandEvent.Handle == nullptr; // The list can be empty if command-list only contains signals of proxy // events. - if (!CommandList->second.EventList.empty()) + if (!CommandList->second.EventList.empty() && + !this->DiscardedLastCommandEvent.Handle) this->LastCommandEvent = CommandList->second.EventList.back(); + this->LastCommandList = CommandList; + if (!Device->useImmediateCommandLists()) { // Batch if allowed to, but don't batch if we know there are no kernels // from this queue that are currently executing. 
This is intended to get @@ -1588,8 +1753,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, } adjustBatchSizeForFullBatch(UseCopyEngine); - CommandBatch.OpenCommandList = CommandListMap.end(); } + CommandBatch.OpenCommandList = CommandListMap.end(); } auto &ZeCommandQueue = CommandList->second.ZeQueue; @@ -1669,16 +1834,25 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // Indicate no cleanup is needed for this PI event as it is special. HostVisibleEvent->CleanedUp = true; + this->resetLastDiscardedEvent(CommandList); + // Finally set to signal the host-visible event at the end of the // command-list after a barrier that waits for all commands // completion. ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); + } else if (this->DiscardedLastCommandEvent.Handle) { + this->resetLastDiscardedEvent(CommandList); + this->signalEvent(CommandList); } + } else if (this->DiscardedLastCommandEvent.Handle) { + this->resetLastDiscardedEvent(CommandList); + this->signalEvent(CommandList); } // Close the command list and have it ready for dispatch. ZE_CALL(zeCommandListClose, (CommandList->first)); + this->LastCommandList = CommandListMap.end(); // Offload command list to the GPU for asynchronous execution auto ZeCommandList = CommandList->first; auto ZeResult = ZE_CALL_NOCHECK( @@ -1884,16 +2058,27 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, for (pi_event &BarrierEvent : ActiveBarriers) PI_CALL(piEventReleaseInternal(BarrierEvent)); ActiveBarriers.clear(); - ActiveBarriers.insert( - ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, - ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); + + // For in-order queue every command depends on the previous one so we don't + // need to insert active barriers for every next command list. 
+ if (!isInOrderQueue()) + ActiveBarriers.insert( + ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, + ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); // If there are more active barriers, insert a barrier on the command-list. We // do not need an event for finishing so we pass nullptr. - if (!ActiveBarriers.empty()) + if (ActiveBarriersWaitList.Length) ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, ActiveBarriersWaitList.Length, ActiveBarriersWaitList.ZeEventList)); + + // Active barriers are released at queue synchronization, but for in-order + // queue we don't keep them till that point so store them in the command list, + // they will be released on completion of command list. + if (isInOrderQueue()) + CmdList->second.WaitLists.push_back(ActiveBarriersWaitList); + return PI_SUCCESS; } @@ -1924,6 +2109,37 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( this->ZeEventList = nullptr; this->PiEventList = nullptr; + if (CurQueue->isInOrderQueue()) { + if (CurQueue->Device->useImmediateCommandLists()) { + if (CurQueue->isDiscardEvents()) { + // If we have an in-order queue where some events are discarded and if + // new command list is different from the last used then signal new + // event from the last immediate command list. It is going to be waited + // in the new immediate command list. + auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); + auto NextImmCmdList = QueueGroup.ImmCmdLists[QueueGroup.NextIndex]; + if (CurQueue->LastCommandEvent == nullptr && + CurQueue->DiscardedLastCommandEvent.Handle != nullptr && + CurQueue->LastCommandList != CurQueue->CommandListMap.end() && + CurQueue->LastCommandList != NextImmCmdList) { + CurQueue->resetLastDiscardedEvent(CurQueue->LastCommandList); + CurQueue->signalEvent(CurQueue->LastCommandList); + } + } + } else { + // Close open command list if command is going to be submitted to a + // different command list. 
+ if ((CurQueue->LastCommandEvent != nullptr || + CurQueue->DiscardedLastCommandEvent.Handle != nullptr) && + CurQueue->LastCommandList != CurQueue->CommandListMap.end() && + CurQueue->LastCommandList->second.isCopy(CurQueue) != UseCopyEngine) { + if (auto Res = CurQueue->executeOpenCommandList( + CurQueue->LastCommandList->second.isCopy(CurQueue))) + return Res; + } + } + } + try { if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; @@ -2018,18 +2234,6 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // previous command has finished. The event associated with the last // enqueued command is added into the waitlist to ensure in-order semantics. if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { - - // Ensure LastCommandEvent's batch is submitted if it is differrent - // from the one this command is going to. - const auto &OpenCommandList = - CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent); - if (OpenCommandList != CurQueue->CommandListMap.end() && - OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) { - - if (auto Res = CurQueue->executeOpenCommandList( - OpenCommandList->second.isCopy(CurQueue))) - return Res; - } std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); this->ZeEventList[TmpListLength] = CurQueue->LastCommandEvent->ZeEvent; this->PiEventList[TmpListLength] = CurQueue->LastCommandEvent; @@ -3692,6 +3896,15 @@ static pi_result piQueueReleaseInternal(pi_queue Queue) { if (!Queue->RefCount.decrementAndTest()) return PI_SUCCESS; + for (auto Cache : Queue->EventCaches) { + for (auto Event : Cache) { + ZE_CALL(zeEventDestroy, (Event.Handle)); + if (auto Res = Queue->Context->decrementUnreleasedEventsInPool( + Queue, Event.Pool, Event.HostVisible)) + return Res; + } + } + if (Queue->OwnZeCommandQueue) { for (auto &ZeQueue : Queue->ComputeQueueGroup.ZeQueues) { if (ZeQueue) @@ -5386,19 +5599,29 @@ 
piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, return Res; ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - pi_result Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - // Save the kernel in the event, so that when the event is signalled - // the code can do a piKernelRelease on this kernel. - (*Event)->CommandData = (void *)Kernel; + if (OutEvent || !ReuseDiscardedEvents) { + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + pi_result Res = createEventAndAssociateQueue( + Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + // Save the kernel in the event, so that when the event is signalled + // the code can do a piKernelRelease on this kernel. + (*Event)->CommandData = (void *)Kernel; + } else { + pi_result Res = + getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); + if (Res != PI_SUCCESS) + return Res; + + CommandList->second.Kernels.push_back(Kernel); + } // Increment the reference count of the Kernel and indicate that the Kernel is // in use. Once the event has been signalled, the code in @@ -5428,8 +5651,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Add the command to the command list, which implies submission. 
ZE_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*Event)->WaitList.Length, - (*Event)->WaitList.ZeEventList)); + ZeEvent, TmpWaitList.Length, TmpWaitList.ZeEventList)); } else { // Add the command to the command list for later submission. // No lock is needed here, unlike the immediate commandlist case above, @@ -5437,14 +5659,13 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // submitted only when the comamndlist is closed. Then, a lock is held. ZE_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*Event)->WaitList.Length, - (*Event)->WaitList.ZeEventList)); + ZeEvent, TmpWaitList.Length, TmpWaitList.ZeEventList)); } zePrint("calling zeCommandListAppendLaunchKernel() with" " ZeEvent %#lx\n", pi_cast(ZeEvent)); - printZeEventList((*Event)->WaitList); + printZeEventList(TmpWaitList); // Execute command list asynchronously, as the event will be used // to track down its completion. @@ -6013,7 +6234,8 @@ static pi_result piEventReleaseInternal(pi_event Event) { if (DisableEventsCaching) { ZE_CALL(zeEventDestroy, (Event->ZeEvent)); auto Context = Event->Context; - if (auto Res = Context->decrementUnreleasedEventsInPool(Event)) + if (auto Res = Context->decrementUnreleasedEventsInPool( + Event->Queue, Event->ZeEventPool, Event->isHostVisible())) return Res; } } @@ -6294,21 +6516,27 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, return Res; ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; + if (OutEvent || !ReuseDiscardedEvents) { + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue( + Queue, Event, PI_COMMAND_TYPE_USER, CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } else { + pi_result Res = + getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); + if (Res != PI_SUCCESS) + return Res; + } - const auto &WaitList = (*Event)->WaitList; auto ZeCommandList = CommandList->first; ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); @@ -6385,7 +6613,12 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, // We use the same approach if // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a // positive value. - if (NumEventsInWaitList || !UseMultipleCmdlistBarriers) { + // We also use the same approach if we have in-order queue because inserted + // barrier will depend on last command event and every next command will + // depend on event signalled by barrier, so no need to populate ActiveBarriers + // in this case as well. + if (NumEventsInWaitList || !UseMultipleCmdlistBarriers || + Queue->isInOrderQueue()) { // Retain the events as they will be owned by the result event. 
_pi_ze_event_list_t TmpWaitList; if (auto Res = TmpWaitList.createAndRetainPiZeEventList( @@ -6408,8 +6641,10 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch)) return Res; - if (UseMultipleCmdlistBarriers) { + if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { // Retain and save the resulting event for future commands. + // This is redundant for in-order queues because we separately handle + // dependency chain between commands in in-order queue. (*Event)->RefCount.increment(); Queue->ActiveBarriers.push_back(*Event); } @@ -6647,28 +6882,35 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, return Res; ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; + + if (OutEvent || !ReuseDiscardedEvents) { + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } else { + pi_result Res = + getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); + if (Res != PI_SUCCESS) + return Res; + } const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { + if (TmpWaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); } zePrint("calling zeCommandListAppendMemoryCopy() with\n" " ZeEvent %#lx\n", pi_cast(ZeEvent)); - printZeEventList(WaitList); + printZeEventList(TmpWaitList); ZE_CALL(zeCommandListAppendMemoryCopy, (ZeCommandList, Dst, Src, Size, ZeEvent, 0, nullptr)); @@ -6710,27 +6952,33 @@ static pi_result enqueueMemCopyRectHelper( return Res; ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; + if (OutEvent || !ReuseDiscardedEvents) { + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } else { + pi_result Res = + getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); + if (Res != PI_SUCCESS) + return Res; + } const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { + if (TmpWaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); } zePrint("calling zeCommandListAppendMemoryCopy() with\n" " ZeEvent %#lx\n", pi_cast(ZeEvent)); - printZeEventList(WaitList); + printZeEventList(TmpWaitList); uint32_t SrcOriginX = pi_cast(SrcOrigin->x_bytes); uint32_t SrcOriginY = pi_cast(SrcOrigin->y_scalar); @@ -6966,23 +7214,29 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, return Res; ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; + if (OutEvent || !ReuseDiscardedEvents) { + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } else { + pi_result Res = + getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); + if (Res != PI_SUCCESS) + return Res; + } const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { + if (TmpWaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); } ZE_CALL( @@ -6992,7 +7246,7 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, zePrint("calling zeCommandListAppendMemoryFill() with\n" " ZeEvent %#lx\n", pi_cast(ZeEvent)); - printZeEventList(WaitList); + printZeEventList(TmpWaitList); // Execute command list asynchronously, as the event will be used // to track down its completion. @@ -7390,22 +7644,28 @@ static pi_result enqueueMemImageCommandHelper( return Res; ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; + if (OutEvent || !ReuseDiscardedEvents) { + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } else { + pi_result Res = + getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); + if (Res != PI_SUCCESS) + return Res; + } const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { + if (TmpWaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); } if (CommandType == PI_COMMAND_TYPE_IMAGE_READ) { pi_mem SrcMem = pi_cast(const_cast(Src)); @@ -8389,21 +8649,28 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - const auto &WaitList = (*Event)->WaitList; + if (OutEvent || !ReuseDiscardedEvents) { + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } else { + pi_result Res = + getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); + if (Res != PI_SUCCESS) + return Res; + } + const auto &ZeCommandList = CommandList->first; - if (WaitList.Length) { + if (TmpWaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); } // TODO: figure out how to translate "flags" ZE_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Ptr, Size)); @@ -8454,22 +8721,28 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; + if (OutEvent || !ReuseDiscardedEvents) { + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } else { + pi_result Res = + getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); + if (Res != PI_SUCCESS) + return Res; + } const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - if (WaitList.Length) { + if (TmpWaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); } ZE_CALL(zeCommandListAppendMemAdvise, diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 802d1a5e57717..e7307d7c4ba7b 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -581,6 +581,8 @@ struct _pi_device : _pi_object { ZeCache> ZeDeviceCacheProperties; }; +struct _pi_ze_event_list_t; + // Structure describing the specific use of a command-list in a queue. // This is because command-lists are re-used across multiple queues // in the same context. @@ -609,7 +611,17 @@ struct pi_command_list_info_t { // TODO: use this for optimizing events in the same command-list, e.g. // only have last one visible to the host. std::vector EventList{}; - size_t size() const { return EventList.size(); } + + // List of kernels in this command list associated with discarded events. + std::list Kernels; + + // List of dependent events associated with discarded events in this command + // list. 
+ std::list<_pi_ze_event_list_t> WaitLists; + + size_t NumDiscardedEvents = 0; + + size_t size() const { return EventList.size() + NumDiscardedEvents; } void append(pi_event Event) { EventList.push_back(Event); } }; @@ -736,7 +748,9 @@ struct _pi_context : _pi_object { // Decrement number of events living in the pool upon event destroy // and return the pool to the cache if there are no unreleased events. - pi_result decrementUnreleasedEventsInPool(pi_event Event); + pi_result decrementUnreleasedEventsInPool(pi_queue Queue, + ze_event_pool_handle_t ZeEventPool, + bool HostVisible); // Store USM allocator context(internal allocator structures) // for USM shared and device allocations. There is 1 allocator context @@ -830,6 +844,16 @@ struct _pi_context : _pi_object { } }; +struct ze_event { + // Level Zero event handle. + ze_event_handle_t Handle; + + // Level Zero event pool handle. + ze_event_pool_handle_t Pool; + + bool HostVisible = false; +}; + struct _pi_queue : _pi_object { _pi_queue(std::vector &ComputeQueues, std::vector &CopyQueues, @@ -917,6 +941,40 @@ struct _pi_queue : _pi_object { // command is enqueued. pi_event LastCommandEvent = nullptr; + // Keep track of the last command list used by in-order queue. + // This is needed because we need to handle the change of the command list in + // a specific way. + pi_command_list_ptr_t LastCommandList = CommandListMap.end(); + + // Caches of events for reuse. + std::vector> EventCaches{2}; + auto getEventCache(bool HostVisible) { + return HostVisible ? &EventCaches[0] : &EventCaches[1]; + } + + // Get event from the queue's cache. + pi_result getEventFromCache(bool HostVisible, ze_event_handle_t *Event); + + // Add event to the queue's cache. + void addEventToCache(ze_event Event); + + // Append command to provided command list to reset the last discarded event. + // If we have in-order and discard_events mode we reset and reuse events in + // scope of the same command lists. 
This method allows to wait for the last + // discarded event, reset it and put to the cache for future reuse. + pi_result resetLastDiscardedEvent(pi_command_list_ptr_t); + + // Append command to provided command list to signal new event. + // While we submit commands in scope of the same command list we can reuse + // events but when we switch to a different command list we currently use a + // new event. This method is used to signal new event from the last used + // command list. This new event will be waited in new command list. + pi_result signalEvent(pi_command_list_ptr_t); + + // We store the last discarded event here. We also store additional + // information with it: host visibility and event pool where it was created. + ze_event DiscardedLastCommandEvent{nullptr, nullptr, false}; + // Kernel is not necessarily submitted for execution during // piEnqueueKernelLaunch, it may be batched. That's why we need to save the // list of kernels which is going to be submitted but have not been submitted @@ -1310,6 +1368,14 @@ struct _pi_ze_event_list_t { this->Length = other.Length; return *this; } + + _pi_ze_event_list_t(const _pi_ze_event_list_t &other) { + this->ZeEventList = other.ZeEventList; + this->PiEventList = other.PiEventList; + this->Length = other.Length; + } + + _pi_ze_event_list_t() {} }; struct _pi_event : _pi_object { From 3135fd30b66d3020acdc3c9ae003dec75a5a7b79 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 2 Nov 2022 15:16:00 -0700 Subject: [PATCH 02/33] Don't use separate structure to handle discarded events, use pi_event instead --- sycl/plugins/level_zero/pi_level_zero.cpp | 102 ++++++++-------------- sycl/plugins/level_zero/pi_level_zero.hpp | 25 ++---- 2 files changed, 42 insertions(+), 85 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 605b120662b6a..11bb94e029ab5 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ 
b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -523,29 +523,27 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool, return PI_SUCCESS; } -pi_result _pi_context::decrementUnreleasedEventsInPool( - pi_queue Queue, ze_event_pool_handle_t ZeEventPool, bool HostVisible) { - std::lock_guard LockAll(ZeEventPoolCacheMutex); - if (!ZeEventPool) { +pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) { + std::shared_lock EventLock(Event->Mutex, std::defer_lock); + std::scoped_lock> LockAll( + ZeEventPoolCacheMutex, EventLock); + if (!Event->ZeEventPool) { // This must be an interop event created on a users's pool. // Do nothing. return PI_SUCCESS; } - bool ProfilingEnabled = - !Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0; - std::list *ZePoolCache = - getZeEventPoolCache(HostVisible, ProfilingEnabled); + getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled()); // Put the empty pool to the cache of the pools. - if (NumEventsUnreleasedInEventPool[ZeEventPool] == 0) + if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) die("Invalid event release: event pool doesn't have unreleased events"); - if (--NumEventsUnreleasedInEventPool[ZeEventPool] == 0) { - if (ZePoolCache->front() != ZeEventPool) { - ZePoolCache->push_back(ZeEventPool); + if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) { + if (ZePoolCache->front() != Event->ZeEventPool) { + ZePoolCache->push_back(Event->ZeEventPool); } - NumEventsAvailableInEventPool[ZeEventPool] = MaxNumEventsPerPool; + NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool; } return PI_SUCCESS; @@ -663,13 +661,14 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, pi_result _pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { - if (DiscardedLastCommandEvent.Handle && CommandList != CommandListMap.end()) { - ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, nullptr, 1, - 
&(DiscardedLastCommandEvent.Handle))); + if (DiscardedLastCommandEvent && CommandList != CommandListMap.end()) { + ZE_CALL(zeCommandListAppendBarrier, + (CommandList->first, nullptr, 1, + &(DiscardedLastCommandEvent->ZeEvent))); ZE_CALL(zeCommandListAppendEventReset, - (CommandList->first, DiscardedLastCommandEvent.Handle)); + (CommandList->first, DiscardedLastCommandEvent->ZeEvent)); addEventToCache(DiscardedLastCommandEvent); - DiscardedLastCommandEvent = {nullptr, nullptr, false}; + DiscardedLastCommandEvent = nullptr; } return PI_SUCCESS; } @@ -746,8 +745,7 @@ pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { return PI_SUCCESS; } -pi_result _pi_queue::getEventFromCache(bool HostVisible, - ze_event_handle_t *Event) { +pi_result _pi_queue::getEventFromCache(bool HostVisible, pi_event *Event) { auto Cache = getEventCache(HostVisible); if (Cache->empty()) { @@ -756,14 +754,14 @@ pi_result _pi_queue::getEventFromCache(bool HostVisible, } auto It = Cache->begin(); - *Event = (*It).Handle; + *Event = *It; Cache->erase(It); return PI_SUCCESS; } -void _pi_queue::addEventToCache(ze_event Event) { - auto Cache = getEventCache(Event.HostVisible); +void _pi_queue::addEventToCache(pi_event Event) { + auto Cache = getEventCache(Event->isHostVisible()); Cache->emplace_back(Event); } @@ -777,39 +775,12 @@ getOrCreateDiscardedEvent(pi_queue Queue, ze_event_handle_t *ZeEvent, bool HostVisible = false) { if (!HostVisible) HostVisible = DeviceEventsSetting == AllHostVisible; - bool ProfilingEnabled = - !Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0; - Queue->getEventFromCache(HostVisible, ZeEvent); + pi_event Event = nullptr; + Queue->getEventFromCache(HostVisible, &Event); - ze_event_pool_handle_t ZeEventPool = {}; - if (*ZeEvent == nullptr) { - size_t Index = 0; - - if (auto Res = Queue->Context->getFreeSlotInExistingOrNewPool( - ZeEventPool, Index, HostVisible, ProfilingEnabled)) - return Res; - - ZeStruct ZeEventDesc; - ZeEventDesc.index = 
Index; - ZeEventDesc.wait = 0; - - if (HostVisible) { - ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; - } else { - // - // Set the scope to "device" for every event. This is sufficient for - // global device access and peer device access. If needed to be seen on - // the host we are doing special handling, see EventsScope options. - // - // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be - // used in some circumstances. - // - ZeEventDesc.signal = 0; - } - - ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, ZeEvent)); - } + if (!Event) + PI_CALL(EventCreate(Queue->Context, Queue, HostVisible, &Event)); // We don't have pi_event object for discarded event, so store waitlist in the // command list. Events from the waitlist will be released after command list @@ -819,8 +790,9 @@ getOrCreateDiscardedEvent(pi_queue Queue, ze_event_handle_t *ZeEvent, // We've got event for the next command above, it is time to reset the last // disarded event and put it to the cache. Queue->resetLastDiscardedEvent(CommandList); - Queue->DiscardedLastCommandEvent = {*ZeEvent, ZeEventPool, HostVisible}; + Queue->DiscardedLastCommandEvent = Event; Queue->LastCommandEvent = nullptr; + *ZeEvent = Event->ZeEvent; // We need to keep track of number of discarded events for batching purposes. // Otherwise discarded events won't be taken into account which will affect @@ -1716,12 +1688,12 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // ansyway, and many regression tests use it. // bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr && - this->DiscardedLastCommandEvent.Handle == nullptr; + this->DiscardedLastCommandEvent == nullptr; // The list can be empty if command-list only contains signals of proxy // events. 
if (!CommandList->second.EventList.empty() && - !this->DiscardedLastCommandEvent.Handle) + !this->DiscardedLastCommandEvent) this->LastCommandEvent = CommandList->second.EventList.back(); this->LastCommandList = CommandList; @@ -1841,11 +1813,11 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // completion. ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); - } else if (this->DiscardedLastCommandEvent.Handle) { + } else if (this->DiscardedLastCommandEvent) { this->resetLastDiscardedEvent(CommandList); this->signalEvent(CommandList); } - } else if (this->DiscardedLastCommandEvent.Handle) { + } else if (this->DiscardedLastCommandEvent) { this->resetLastDiscardedEvent(CommandList); this->signalEvent(CommandList); } @@ -2119,7 +2091,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); auto NextImmCmdList = QueueGroup.ImmCmdLists[QueueGroup.NextIndex]; if (CurQueue->LastCommandEvent == nullptr && - CurQueue->DiscardedLastCommandEvent.Handle != nullptr && + CurQueue->DiscardedLastCommandEvent != nullptr && CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList != NextImmCmdList) { CurQueue->resetLastDiscardedEvent(CurQueue->LastCommandList); @@ -2130,7 +2102,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // Close open command list if command is going to be submitted to a // different command list. 
if ((CurQueue->LastCommandEvent != nullptr || - CurQueue->DiscardedLastCommandEvent.Handle != nullptr) && + CurQueue->DiscardedLastCommandEvent != nullptr) && CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList->second.isCopy(CurQueue) != UseCopyEngine) { if (auto Res = CurQueue->executeOpenCommandList( @@ -3898,10 +3870,9 @@ static pi_result piQueueReleaseInternal(pi_queue Queue) { for (auto Cache : Queue->EventCaches) { for (auto Event : Cache) { - ZE_CALL(zeEventDestroy, (Event.Handle)); - if (auto Res = Queue->Context->decrementUnreleasedEventsInPool( - Queue, Event.Pool, Event.HostVisible)) + if (auto Res = Queue->Context->decrementUnreleasedEventsInPool(Event)) return Res; + PI_CALL(piEventRelease(Event)); } } @@ -6234,8 +6205,7 @@ static pi_result piEventReleaseInternal(pi_event Event) { if (DisableEventsCaching) { ZE_CALL(zeEventDestroy, (Event->ZeEvent)); auto Context = Event->Context; - if (auto Res = Context->decrementUnreleasedEventsInPool( - Event->Queue, Event->ZeEventPool, Event->isHostVisible())) + if (auto Res = Context->decrementUnreleasedEventsInPool(Event)) return Res; } } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index e7307d7c4ba7b..3192576298a81 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -748,9 +748,7 @@ struct _pi_context : _pi_object { // Decrement number of events living in the pool upon event destroy // and return the pool to the cache if there are no unreleased events. - pi_result decrementUnreleasedEventsInPool(pi_queue Queue, - ze_event_pool_handle_t ZeEventPool, - bool HostVisible); + pi_result decrementUnreleasedEventsInPool(pi_event Event); // Store USM allocator context(internal allocator structures) // for USM shared and device allocations. There is 1 allocator context @@ -844,16 +842,6 @@ struct _pi_context : _pi_object { } }; -struct ze_event { - // Level Zero event handle. 
- ze_event_handle_t Handle; - - // Level Zero event pool handle. - ze_event_pool_handle_t Pool; - - bool HostVisible = false; -}; - struct _pi_queue : _pi_object { _pi_queue(std::vector &ComputeQueues, std::vector &CopyQueues, @@ -947,16 +935,16 @@ struct _pi_queue : _pi_object { pi_command_list_ptr_t LastCommandList = CommandListMap.end(); // Caches of events for reuse. - std::vector> EventCaches{2}; + std::vector> EventCaches{2}; auto getEventCache(bool HostVisible) { return HostVisible ? &EventCaches[0] : &EventCaches[1]; } // Get event from the queue's cache. - pi_result getEventFromCache(bool HostVisible, ze_event_handle_t *Event); + pi_result getEventFromCache(bool HostVisible, pi_event *Event); // Add event to the queue's cache. - void addEventToCache(ze_event Event); + void addEventToCache(pi_event Event); // Append command to provided command list to reset the last discarded event. // If we have in-order and discard_events mode we reset and reuse events in @@ -971,9 +959,8 @@ struct _pi_queue : _pi_object { // command list. This new event will be waited in new command list. pi_result signalEvent(pi_command_list_ptr_t); - // We store the last discarded event here. We also store additional - // information with it: host visibility and event pool where it was created. - ze_event DiscardedLastCommandEvent{nullptr, nullptr, false}; + // We store the last discarded event here. + pi_event DiscardedLastCommandEvent = nullptr; // Kernel is not necessarily submitted for execution during // piEnqueueKernelLaunch, it may be batched. 
That's why we need to save the From 81c7a1116db17009cccbc69c768fde3a7d109d43 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 3 Nov 2022 02:51:49 +0000 Subject: [PATCH 03/33] Reuse only ze_event handles, still create pi_event objects --- sycl/plugins/level_zero/pi_level_zero.cpp | 342 ++++++++-------------- sycl/plugins/level_zero/pi_level_zero.hpp | 20 +- 2 files changed, 137 insertions(+), 225 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 11bb94e029ab5..2cd4157afdacd 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -661,14 +661,14 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, pi_result _pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { - if (DiscardedLastCommandEvent && CommandList != CommandListMap.end()) { + if (LastCommandEvent && LastCommandEvent->IsDiscarded && + CommandList != CommandListMap.end()) { ZE_CALL(zeCommandListAppendBarrier, - (CommandList->first, nullptr, 1, - &(DiscardedLastCommandEvent->ZeEvent))); + (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); ZE_CALL(zeCommandListAppendEventReset, - (CommandList->first, DiscardedLastCommandEvent->ZeEvent)); - addEventToCache(DiscardedLastCommandEvent); - DiscardedLastCommandEvent = nullptr; + (CommandList->first, LastCommandEvent->ZeEvent)); + PI_CALL(addEventToCache(LastCommandEvent)); + LastCommandEvent = nullptr; } return PI_SUCCESS; } @@ -688,17 +688,27 @@ inline static pi_result createEventAndAssociateQueue( pi_queue Queue, pi_event *Event, pi_command_type CommandType, pi_command_list_ptr_t CommandList, bool IsInternal = false, bool ForceHostVisible = false) { + if (!ForceHostVisible) + ForceHostVisible = DeviceEventsSetting == AllHostVisible; + + // If event is discarded then try to get event from the queue cache. + *Event = IsInternal ? 
Queue->getEventFromCache(ForceHostVisible) : nullptr; + if (Queue->isInOrderQueue() && Queue->isDiscardEvents()) - // We are going to get/create event for the next command so it is time to - // reset the last disarded event (if any) and put it to the cache. + // We've possibly got discarded event above so it is time to reset the last + // disarded event (if any) and put it to the cache. Queue->resetLastDiscardedEvent(CommandList); - if (!ForceHostVisible) - ForceHostVisible = DeviceEventsSetting == AllHostVisible; - PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible, Event)); + if (*Event == nullptr) + PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible, Event)); (*Event)->Queue = Queue; (*Event)->CommandType = CommandType; + (*Event)->IsDiscarded = IsInternal; + // Discarded event doesn't own ze_event, it is used by multiple pi_event + // objects. + if (IsInternal) + (*Event)->OwnZeEvent = false; // Append this Event to the CommandList, if any if (CommandList != Queue->CommandListMap.end()) { @@ -745,59 +755,34 @@ pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { return PI_SUCCESS; } -pi_result _pi_queue::getEventFromCache(bool HostVisible, pi_event *Event) { +pi_event _pi_queue::getEventFromCache(bool HostVisible) { auto Cache = getEventCache(HostVisible); - if (Cache->empty()) { - *Event = nullptr; - return PI_SUCCESS; - } + if (Cache->empty()) + return nullptr; auto It = Cache->begin(); - *Event = *It; + pi_event RetEvent = *It; Cache->erase(It); - - return PI_SUCCESS; + return RetEvent; } -void _pi_queue::addEventToCache(pi_event Event) { - auto Cache = getEventCache(Event->isHostVisible()); - Cache->emplace_back(Event); -} +pi_result _pi_queue::addEventToCache(pi_event Event) { + pi_event CacheEvent; + try { + CacheEvent = new _pi_event(Event->ZeEvent, Event->ZeEventPool, Context, + PI_COMMAND_TYPE_USER, true); + } catch (const std::bad_alloc &) { + return PI_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + + if (Event->isHostVisible()) + CacheEvent->HostVisibleEvent = CacheEvent; -// This function is used to get or create discarded event. When event is -// discarded we don't create pi_event object and reuse events from the queue -// cache if any. -inline static pi_result -getOrCreateDiscardedEvent(pi_queue Queue, ze_event_handle_t *ZeEvent, - pi_command_list_ptr_t CommandList, - _pi_ze_event_list_t WaitList, - bool HostVisible = false) { - if (!HostVisible) - HostVisible = DeviceEventsSetting == AllHostVisible; - - pi_event Event = nullptr; - Queue->getEventFromCache(HostVisible, &Event); - - if (!Event) - PI_CALL(EventCreate(Queue->Context, Queue, HostVisible, &Event)); - - // We don't have pi_event object for discarded event, so store waitlist in the - // command list. Events from the waitlist will be released after command list - // is finished. - CommandList->second.WaitLists.push_back(WaitList); - - // We've got event for the next command above, it is time to reset the last - // disarded event and put it to the cache. - Queue->resetLastDiscardedEvent(CommandList); - Queue->DiscardedLastCommandEvent = Event; - Queue->LastCommandEvent = nullptr; - *ZeEvent = Event->ZeEvent; - - // We need to keep track of number of discarded events for batching purposes. - // Otherwise discarded events won't be taken into account which will affect - // batching. - CommandList->second.NumDiscardedEvents++; + auto Cache = getEventCache(CacheEvent->isHostVisible()); + Cache->emplace_back(CacheEvent); return PI_SUCCESS; } @@ -1114,13 +1099,6 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, bool UseCopyEngine = CommandList->second.isCopy(this); if (isInOrderQueue() && isDiscardEvents()) { - // If there were discarded events in the command list then we have to - // release kernels associated with them. 
- for (auto Kernel : CommandList->second.Kernels) { - piKernelRelease(Kernel); - } - CommandList->second.Kernels.clear(); - // If there were discarded events in the command list then we have to // release events from wait lists associated with them. for (auto WaitList : CommandList->second.WaitLists) { @@ -1148,7 +1126,6 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, ZE_CALL(zeFenceReset, (CommandList->second.ZeFence)); ZE_CALL(zeCommandListReset, (CommandList->first)); CommandList->second.ZeFenceInUse = false; - CommandList->second.NumDiscardedEvents = 0; } auto &EventList = CommandList->second.EventList; @@ -1687,13 +1664,11 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // traces incurs much different timings than real execution // ansyway, and many regression tests use it. // - bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr && - this->DiscardedLastCommandEvent == nullptr; + bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr; // The list can be empty if command-list only contains signals of proxy // events. - if (!CommandList->second.EventList.empty() && - !this->DiscardedLastCommandEvent) + if (!CommandList->second.EventList.empty()) this->LastCommandEvent = CommandList->second.EventList.back(); this->LastCommandList = CommandList; @@ -1813,11 +1788,12 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // completion. 
ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); - } else if (this->DiscardedLastCommandEvent) { + } else if (this->LastCommandEvent && + this->LastCommandEvent->IsDiscarded) { this->resetLastDiscardedEvent(CommandList); this->signalEvent(CommandList); } - } else if (this->DiscardedLastCommandEvent) { + } else if (this->LastCommandEvent && this->LastCommandEvent->IsDiscarded) { this->resetLastDiscardedEvent(CommandList); this->signalEvent(CommandList); } @@ -2090,8 +2066,8 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // in the new immediate command list. auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); auto NextImmCmdList = QueueGroup.ImmCmdLists[QueueGroup.NextIndex]; - if (CurQueue->LastCommandEvent == nullptr && - CurQueue->DiscardedLastCommandEvent != nullptr && + if (CurQueue->LastCommandEvent != nullptr && + CurQueue->LastCommandEvent->IsDiscarded && CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList != NextImmCmdList) { CurQueue->resetLastDiscardedEvent(CurQueue->LastCommandList); @@ -2101,8 +2077,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( } else { // Close open command list if command is going to be submitted to a // different command list. 
- if ((CurQueue->LastCommandEvent != nullptr || - CurQueue->DiscardedLastCommandEvent != nullptr) && + if (CurQueue->LastCommandEvent != nullptr && CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList->second.isCopy(CurQueue) != UseCopyEngine) { if (auto Res = CurQueue->executeOpenCommandList( @@ -2113,7 +2088,8 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( } try { - if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { + if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr && + !CurQueue->LastCommandEvent->IsDiscarded) { this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; this->PiEventList = new pi_event[EventListLength + 1]; } else if (EventListLength > 0) { @@ -2205,7 +2181,8 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // For in-order queues, every command should be executed only after the // previous command has finished. The event associated with the last // enqueued command is added into the waitlist to ensure in-order semantics. - if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { + if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr && + !CurQueue->LastCommandEvent->IsDiscarded) { std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); this->ZeEventList[TmpListLength] = CurQueue->LastCommandEvent->ZeEvent; this->PiEventList[TmpListLength] = CurQueue->LastCommandEvent; @@ -3872,7 +3849,7 @@ static pi_result piQueueReleaseInternal(pi_queue Queue) { for (auto Event : Cache) { if (auto Res = Queue->Context->decrementUnreleasedEventsInPool(Event)) return Res; - PI_CALL(piEventRelease(Event)); + PI_CALL(piEventReleaseInternal(Event)); } } @@ -5570,29 +5547,19 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, return Res; ze_event_handle_t ZeEvent = nullptr; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + pi_result Res = createEventAndAssociateQueue( + Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; - if (OutEvent || !ReuseDiscardedEvents) { - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - pi_result Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - // Save the kernel in the event, so that when the event is signalled - // the code can do a piKernelRelease on this kernel. - (*Event)->CommandData = (void *)Kernel; - } else { - pi_result Res = - getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); - if (Res != PI_SUCCESS) - return Res; - - CommandList->second.Kernels.push_back(Kernel); - } + // Save the kernel in the event, so that when the event is signalled + // the code can do a piKernelRelease on this kernel. + (*Event)->CommandData = (void *)Kernel; // Increment the reference count of the Kernel and indicate that the Kernel is // in use. Once the event has been signalled, the code in @@ -6041,6 +6008,7 @@ static pi_result CleanupCompletedEvent(pi_event Event, bool QueueLocked) { // a dangling pointer to this event. It could also cause unneeded // already finished events to show up in the wait list. if (AssociatedQueue->LastCommandEvent == Event) { + assert(!AssociatedQueue->LastCommandEvent->IsDiscarded); AssociatedQueue->LastCommandEvent = nullptr; } } @@ -6486,23 +6454,16 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, return Res; ze_event_handle_t ZeEvent = nullptr; - if (OutEvent || !ReuseDiscardedEvents) { - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_USER, CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } else { - pi_result Res = - getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); - if (Res != PI_SUCCESS) - return Res; - } + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; auto ZeCommandList = CommandList->first; ZE_CALL(zeCommandListAppendWaitOnEvents, @@ -6853,22 +6814,15 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, ze_event_handle_t ZeEvent = nullptr; - if (OutEvent || !ReuseDiscardedEvents) { - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } else { - pi_result Res = - getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); - if (Res != PI_SUCCESS) - return Res; - } + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; if (TmpWaitList.Length) { @@ -6922,22 +6876,15 @@ static pi_result enqueueMemCopyRectHelper( return Res; ze_event_handle_t ZeEvent = nullptr; - if (OutEvent || !ReuseDiscardedEvents) { - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } else { - pi_result Res = - getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); - if (Res != PI_SUCCESS) - return Res; - } + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -7184,23 +7131,16 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, return Res; ze_event_handle_t ZeEvent = nullptr; - if (OutEvent || !ReuseDiscardedEvents) { - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } else { - pi_result Res = - getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); - if (Res != PI_SUCCESS) - return Res; - } + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -7614,22 +7554,15 @@ static pi_result enqueueMemImageCommandHelper( return Res; ze_event_handle_t ZeEvent = nullptr; - if (OutEvent || !ReuseDiscardedEvents) { - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } else { - pi_result Res = - getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); - if (Res != PI_SUCCESS) - return Res; - } + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -8619,23 +8552,15 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; - - if (OutEvent || !ReuseDiscardedEvents) { - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } else { - pi_result Res = - getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); - if (Res != PI_SUCCESS) - return Res; - } + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; if (TmpWaitList.Length) { @@ -8691,22 +8616,15 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; - if (OutEvent || !ReuseDiscardedEvents) { - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } else { - pi_result Res = - getOrCreateDiscardedEvent(Queue, &ZeEvent, CommandList, TmpWaitList); - if (Res != PI_SUCCESS) - return Res; - } + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, + CommandList, IsInternal); + if (Res != PI_SUCCESS) + return Res; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 3192576298a81..09e95c94feba2 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -612,16 +612,9 @@ struct pi_command_list_info_t { // only have last one visible to the host. std::vector EventList{}; - // List of kernels in this command list associated with discarded events. - std::list Kernels; - - // List of dependent events associated with discarded events in this command - // list. std::list<_pi_ze_event_list_t> WaitLists; - size_t NumDiscardedEvents = 0; - - size_t size() const { return EventList.size() + NumDiscardedEvents; } + size_t size() const { return EventList.size(); } void append(pi_event Event) { EventList.push_back(Event); } }; @@ -941,10 +934,10 @@ struct _pi_queue : _pi_object { } // Get event from the queue's cache. - pi_result getEventFromCache(bool HostVisible, pi_event *Event); + pi_event getEventFromCache(bool HostVisible); // Add event to the queue's cache. - void addEventToCache(pi_event Event); + pi_result addEventToCache(pi_event Event); // Append command to provided command list to reset the last discarded event. // If we have in-order and discard_events mode we reset and reuse events in @@ -959,9 +952,6 @@ struct _pi_queue : _pi_object { // command list. This new event will be waited in new command list. pi_result signalEvent(pi_command_list_ptr_t); - // We store the last discarded event here. - pi_event DiscardedLastCommandEvent = nullptr; - // Kernel is not necessarily submitted for execution during // piEnqueueKernelLaunch, it may be batched. 
That's why we need to save the // list of kernels which is going to be submitted but have not been submitted @@ -1433,6 +1423,10 @@ struct _pi_event : _pi_object { // being visible to the host at all. bool Completed = {false}; + // Indicates that this event is discarded, i.e. it is not visible outside of + // plugin. + bool IsDiscarded = {false}; + // Besides each PI object keeping a total reference count in // _pi_object::RefCount we keep special track of the event *external* // references. This way we are able to tell when the event is not referenced From 0448cb8c0df9a74e7395fd21e67cd3579cf72605 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 3 Nov 2022 06:15:11 +0000 Subject: [PATCH 04/33] Simplify barrier insertion in the beginning of command list --- sycl/plugins/level_zero/pi_level_zero.cpp | 58 +++++++++-------------- sycl/plugins/level_zero/pi_level_zero.hpp | 2 +- 2 files changed, 23 insertions(+), 37 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 2cd4157afdacd..c81d965305176 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -743,12 +743,8 @@ pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { this, &SpecialEvent, PI_COMMAND_TYPE_USER, CommandList, /* IsDiscarded */ false, /* ForceHostVisible */ false)); - // We want a barrier in the beginning of a next command list waiting for this - // special event. - ActiveBarriers.push_back(SpecialEvent); - - // We don't need additional dependency through LastCommandEvent. 
- LastCommandEvent = nullptr; + PI_CALL(piEventRelease(SpecialEvent)); + LastCommandEvent = SpecialEvent; ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, SpecialEvent->ZeEvent)); @@ -1098,24 +1094,9 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, std::vector &EventListToCleanup) { bool UseCopyEngine = CommandList->second.isCopy(this); - if (isInOrderQueue() && isDiscardEvents()) { - // If there were discarded events in the command list then we have to - // release events from wait lists associated with them. - for (auto WaitList : CommandList->second.WaitLists) { - std::list EventsToBeReleased; - WaitList.collectEventsForReleaseAndDestroyPiZeEventList( - EventsToBeReleased); - - // Event may be in the wait list of more than one event. But we have to - // cleanup it only once, that's why use unordered_set to make it happen. - std::unordered_set Events; - std::copy(EventsToBeReleased.begin(), EventsToBeReleased.end(), - std::inserter(Events, Events.begin())); - - for (auto Event : Events) - EventListToCleanup.push_back(Event); - } - CommandList->second.WaitLists.clear(); + if (CommandList->second.SpecialEvent) { + EventListToCleanup.push_back(CommandList->second.SpecialEvent); + CommandList->second.SpecialEvent = nullptr; } // Immediate commandlists do not have an associated fence. @@ -1995,6 +1976,20 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, if (ActiveBarriers.empty()) return PI_SUCCESS; + // For in-order queue every command depends on the previous one so we don't + // need to insert active barriers for every next command list. + // But we have to handle LastCommandEvent as an active barrier if we have + // discard_events mode. 
+ if (isInOrderQueue() && isDiscardEvents() && LastCommandEvent) { + ZE_CALL(zeCommandListAppendBarrier, + (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); + CmdList->second.SpecialEvent = LastCommandEvent; + // This event will be released on command list completion. + PI_CALL(piEventRetain(CmdList->second.SpecialEvent)); + LastCommandEvent = nullptr; + return PI_SUCCESS; + } + // Create a wait-list and retain events. This will filter out finished events. _pi_ze_event_list_t ActiveBarriersWaitList; if (auto Res = ActiveBarriersWaitList.createAndRetainPiZeEventList( @@ -2007,12 +2002,9 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, PI_CALL(piEventReleaseInternal(BarrierEvent)); ActiveBarriers.clear(); - // For in-order queue every command depends on the previous one so we don't - // need to insert active barriers for every next command list. - if (!isInOrderQueue()) - ActiveBarriers.insert( - ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, - ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); + ActiveBarriers.insert( + ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, + ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); // If there are more active barriers, insert a barrier on the command-list. We // do not need an event for finishing so we pass nullptr. @@ -2021,12 +2013,6 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, (CmdList->first, nullptr, ActiveBarriersWaitList.Length, ActiveBarriersWaitList.ZeEventList)); - // Active barriers are released at queue synchronization, but for in-order - // queue we don't keep them till that point so store them in the command list, - // they will be released on completion of command list. 
- if (isInOrderQueue()) - CmdList->second.WaitLists.push_back(ActiveBarriersWaitList); - return PI_SUCCESS; } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 09e95c94feba2..c35960942f220 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -612,7 +612,7 @@ struct pi_command_list_info_t { // only have last one visible to the host. std::vector EventList{}; - std::list<_pi_ze_event_list_t> WaitLists; + pi_event SpecialEvent = nullptr; size_t size() const { return EventList.size(); } void append(pi_event Event) { EventList.push_back(Event); } From 6541fb342d8662c24fb836f3f8f8bc227997beae Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 3 Nov 2022 17:51:24 +0000 Subject: [PATCH 05/33] Enforce round robin behavior when reusing discarded events --- sycl/plugins/level_zero/pi_level_zero.cpp | 82 +++++++++++++---------- sycl/plugins/level_zero/pi_level_zero.hpp | 4 ++ 2 files changed, 51 insertions(+), 35 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c81d965305176..18ec4f22f1f69 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -659,6 +659,26 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, if (!(condition)) \ return error; + +pi_result +_pi_queue::setLastDiscardedEvent(pi_event Event) { + try { + // We expect previous event to be in the cache. + assert(LastDiscardedEvent == nullptr); + LastDiscardedEvent = new _pi_event(Event->ZeEvent, Event->ZeEventPool, Context, + PI_COMMAND_TYPE_USER, true); + } catch (const std::bad_alloc &) { + return PI_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + + if (Event->isHostVisible()) + LastDiscardedEvent->HostVisibleEvent = LastDiscardedEvent; + + return PI_SUCCESS; +} + pi_result _pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded && @@ -667,8 +687,10 @@ _pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); ZE_CALL(zeCommandListAppendEventReset, (CommandList->first, LastCommandEvent->ZeEvent)); - PI_CALL(addEventToCache(LastCommandEvent)); - LastCommandEvent = nullptr; + + // Remember last discarded event. Can't put it to the cache right now to avoid taking it as the next discarded event which will cause using same event two times in a row. + // We need to round robin between two events. + setLastDiscardedEvent(LastCommandEvent); } return PI_SUCCESS; } @@ -694,10 +716,12 @@ inline static pi_result createEventAndAssociateQueue( // If event is discarded then try to get event from the queue cache. *Event = IsInternal ? Queue->getEventFromCache(ForceHostVisible) : nullptr; - if (Queue->isInOrderQueue() && Queue->isDiscardEvents()) + if (IsInternal && Queue->LastDiscardedEvent) { // We've possibly got discarded event above so it is time to reset the last // disarded event (if any) and put it to the cache. - Queue->resetLastDiscardedEvent(CommandList); + PI_CALL(Queue->addEventToCache(Queue->LastDiscardedEvent)); + Queue->LastDiscardedEvent = nullptr; + } if (*Event == nullptr) PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible, Event)); @@ -764,21 +788,8 @@ pi_event _pi_queue::getEventFromCache(bool HostVisible) { } pi_result _pi_queue::addEventToCache(pi_event Event) { - pi_event CacheEvent; - try { - CacheEvent = new _pi_event(Event->ZeEvent, Event->ZeEventPool, Context, - PI_COMMAND_TYPE_USER, true); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - if (Event->isHostVisible()) - CacheEvent->HostVisibleEvent = CacheEvent; - - auto Cache = getEventCache(CacheEvent->isHostVisible()); - Cache->emplace_back(CacheEvent); + auto Cache = getEventCache(Event->isHostVisible()); + Cache->emplace_back(Event); return PI_SUCCESS; } @@ -1045,10 +1056,16 @@ pi_result _pi_context::finalize() { } } { + std::unordered_set Pools; std::scoped_lock Lock(ZeEventPoolCacheMutex); for (auto &ZePoolCache : ZeEventPoolCache) { - for (auto &ZePool : ZePoolCache) + for (auto &ZePool : ZePoolCache) { + if (Pools.find(ZePool) != Pools.end()) + std::cout << "Removing two times" << std::endl; + ZE_CALL(zeEventPoolDestroy, (ZePool)); + Pools.insert(ZePool); + } ZePoolCache.clear(); } } @@ -1649,8 +1666,11 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // The list can be empty if command-list only contains signals of proxy // events. - if (!CommandList->second.EventList.empty()) + if (!CommandList->second.EventList.empty() && this->LastCommandEvent != CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); + if (this->LastCommandEvent->IsDiscarded) + PI_CALL(resetLastDiscardedEvent(CommandList)); + } this->LastCommandList = CommandList; @@ -1762,8 +1782,6 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // Indicate no cleanup is needed for this PI event as it is special. HostVisibleEvent->CleanedUp = true; - this->resetLastDiscardedEvent(CommandList); - // Finally set to signal the host-visible event at the end of the // command-list after a barrier that waits for all commands // completion. 
@@ -1771,11 +1789,9 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); } else if (this->LastCommandEvent && this->LastCommandEvent->IsDiscarded) { - this->resetLastDiscardedEvent(CommandList); this->signalEvent(CommandList); } } else if (this->LastCommandEvent && this->LastCommandEvent->IsDiscarded) { - this->resetLastDiscardedEvent(CommandList); this->signalEvent(CommandList); } @@ -2056,7 +2072,6 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( CurQueue->LastCommandEvent->IsDiscarded && CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList != NextImmCmdList) { - CurQueue->resetLastDiscardedEvent(CurQueue->LastCommandList); CurQueue->signalEvent(CurQueue->LastCommandList); } } @@ -3830,14 +3845,13 @@ static pi_result piQueueReleaseInternal(pi_queue Queue) { if (!Queue->RefCount.decrementAndTest()) return PI_SUCCESS; + + if (Queue->LastDiscardedEvent) + PI_CALL(Queue->addEventToCache(Queue->LastDiscardedEvent)); - for (auto Cache : Queue->EventCaches) { - for (auto Event : Cache) { - if (auto Res = Queue->Context->decrementUnreleasedEventsInPool(Event)) - return Res; + for (auto Cache : Queue->EventCaches) + for (auto Event : Cache) PI_CALL(piEventReleaseInternal(Event)); - } - } if (Queue->OwnZeCommandQueue) { for (auto &ZeQueue : Queue->ComputeQueueGroup.ZeQueues) { @@ -5993,10 +6007,8 @@ static pi_result CleanupCompletedEvent(pi_event Event, bool QueueLocked) { // If we don't do this, the event can get released and freed leaving // a dangling pointer to this event. It could also cause unneeded // already finished events to show up in the wait list. 
- if (AssociatedQueue->LastCommandEvent == Event) { - assert(!AssociatedQueue->LastCommandEvent->IsDiscarded); + if (AssociatedQueue->LastCommandEvent == Event) AssociatedQueue->LastCommandEvent = nullptr; - } } // Release this event since we explicitly retained it on creation and diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index c35960942f220..2ffd445ad058b 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -922,6 +922,10 @@ struct _pi_queue : _pi_object { // command is enqueued. pi_event LastCommandEvent = nullptr; + pi_event LastDiscardedEvent = nullptr; + + pi_result setLastDiscardedEvent(pi_event Event); + // Keep track of the last command list used by in-order queue. // This is needed because we need to handle the change of the command list in // a specific way. From 9603869d04ebc5480aa30bda9b3d4a01bd68380d Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 3 Nov 2022 19:06:18 +0000 Subject: [PATCH 06/33] Handle LastCommandInBatchHostVisible mode properly in discard_events mode --- sycl/plugins/level_zero/pi_level_zero.cpp | 39 +++++++++++++++-------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 18ec4f22f1f69..59b344b74be20 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -659,14 +659,12 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, if (!(condition)) \ return error; - -pi_result -_pi_queue::setLastDiscardedEvent(pi_event Event) { +pi_result _pi_queue::setLastDiscardedEvent(pi_event Event) { try { // We expect previous event to be in the cache. 
assert(LastDiscardedEvent == nullptr); - LastDiscardedEvent = new _pi_event(Event->ZeEvent, Event->ZeEventPool, Context, - PI_COMMAND_TYPE_USER, true); + LastDiscardedEvent = new _pi_event(Event->ZeEvent, Event->ZeEventPool, + Context, PI_COMMAND_TYPE_USER, true); } catch (const std::bad_alloc &) { return PI_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -688,8 +686,9 @@ _pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { ZE_CALL(zeCommandListAppendEventReset, (CommandList->first, LastCommandEvent->ZeEvent)); - // Remember last discarded event. Can't put it to the cache right now to avoid taking it as the next discarded event which will cause using same event two times in a row. - // We need to round robin between two events. + // Remember last discarded event. Can't put it to the cache right now to + // avoid taking it as the next discarded event which will cause using same + // event two times in a row. We need to round robin between two events. setLastDiscardedEvent(LastCommandEvent); } return PI_SUCCESS; @@ -1666,7 +1665,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // The list can be empty if command-list only contains signals of proxy // events. - if (!CommandList->second.EventList.empty() && this->LastCommandEvent != CommandList->second.EventList.back()) { + if (!CommandList->second.EventList.empty() && + this->LastCommandEvent != CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); if (this->LastCommandEvent->IsDiscarded) PI_CALL(resetLastDiscardedEvent(CommandList)); @@ -1777,16 +1777,27 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // after createEventAndAssociateQueue ref count is 2 and then +1 for // each event in the EventList. PI_CALL(piEventReleaseInternal(HostVisibleEvent)); - PI_CALL(piEventReleaseInternal(HostVisibleEvent)); - // Indicate no cleanup is needed for this PI event as it is special. 
- HostVisibleEvent->CleanedUp = true; + if (isInOrderQueue() && isDiscardEvents()) { + // If we have in-order queue with discarded events then we want to treat this event as regular event and use it as a dependency for the next command. + LastCommandEvent = HostVisibleEvent; + } else { + // For all other queues treat this as a special event and indicate no cleanup is needed. + PI_CALL(piEventReleaseInternal(HostVisibleEvent)); + HostVisibleEvent->CleanedUp = true; + } // Finally set to signal the host-visible event at the end of the // command-list after a barrier that waits for all commands // completion. - ZE_CALL(zeCommandListAppendBarrier, - (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); + if (LastCommandEvent && LastCommandEvent->IsDiscarded) { + // If we the last event is discarded then we already have a barrier inserted, so just signal event. + ZE_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, HostVisibleEvent->ZeEvent)); + } else { + ZE_CALL(zeCommandListAppendBarrier, + (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); + } } else if (this->LastCommandEvent && this->LastCommandEvent->IsDiscarded) { this->signalEvent(CommandList); @@ -3845,7 +3856,7 @@ static pi_result piQueueReleaseInternal(pi_queue Queue) { if (!Queue->RefCount.decrementAndTest()) return PI_SUCCESS; - + if (Queue->LastDiscardedEvent) PI_CALL(Queue->addEventToCache(Queue->LastDiscardedEvent)); From 3e72dd4767d3d0768174820453bf354083e3d2cd Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 3 Nov 2022 19:19:15 +0000 Subject: [PATCH 07/33] Remove redundant changes --- sycl/plugins/level_zero/pi_level_zero.hpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 2ffd445ad058b..9beb285bd94a8 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -946,7 +946,7 @@ struct _pi_queue : 
_pi_object { // Append command to provided command list to reset the last discarded event. // If we have in-order and discard_events mode we reset and reuse events in // scope of the same command lists. This method allows to wait for the last - // discarded event, reset it and put to the cache for future reuse. + // discarded event, reset it. pi_result resetLastDiscardedEvent(pi_command_list_ptr_t); // Append command to provided command list to signal new event. @@ -1349,14 +1349,6 @@ struct _pi_ze_event_list_t { this->Length = other.Length; return *this; } - - _pi_ze_event_list_t(const _pi_ze_event_list_t &other) { - this->ZeEventList = other.ZeEventList; - this->PiEventList = other.PiEventList; - this->Length = other.Length; - } - - _pi_ze_event_list_t() {} }; struct _pi_event : _pi_object { From 94461406a3ec8290e311288a0cf88c058d3413ce Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 3 Nov 2022 19:50:37 +0000 Subject: [PATCH 08/33] Remove redundant changes --- sycl/plugins/level_zero/pi_level_zero.cpp | 54 +++++++++++++---------- sycl/plugins/level_zero/pi_level_zero.hpp | 2 - 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 59b344b74be20..e5c4cc67b9ad0 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2028,7 +2028,6 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, for (pi_event &BarrierEvent : ActiveBarriers) PI_CALL(piEventReleaseInternal(BarrierEvent)); ActiveBarriers.clear(); - ActiveBarriers.insert( ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); @@ -2039,7 +2038,6 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, ActiveBarriersWaitList.Length, ActiveBarriersWaitList.ZeEventList)); - return 
PI_SUCCESS; } @@ -5600,7 +5598,8 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Add the command to the command list, which implies submission. ZE_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, TmpWaitList.Length, TmpWaitList.ZeEventList)); + ZeEvent, (*Event)->WaitList.Length, + (*Event)->WaitList.ZeEventList)); } else { // Add the command to the command list for later submission. // No lock is needed here, unlike the immediate commandlist case above, @@ -5608,13 +5607,14 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // submitted only when the comamndlist is closed. Then, a lock is held. ZE_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, TmpWaitList.Length, TmpWaitList.ZeEventList)); + ZeEvent, (*Event)->WaitList.Length, + (*Event)->WaitList.ZeEventList)); } zePrint("calling zeCommandListAppendLaunchKernel() with" " ZeEvent %#lx\n", pi_cast(ZeEvent)); - printZeEventList(TmpWaitList); + printZeEventList((*Event)->WaitList); // Execute command list asynchronously, as the event will be used // to track down its completion. @@ -6018,8 +6018,9 @@ static pi_result CleanupCompletedEvent(pi_event Event, bool QueueLocked) { // If we don't do this, the event can get released and freed leaving // a dangling pointer to this event. It could also cause unneeded // already finished events to show up in the wait list. 
- if (AssociatedQueue->LastCommandEvent == Event) + if (AssociatedQueue->LastCommandEvent == Event) { AssociatedQueue->LastCommandEvent = nullptr; + } } // Release this event since we explicitly retained it on creation and @@ -6474,9 +6475,10 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; + const auto &WaitList = (*Event)->WaitList; auto ZeCommandList = CommandList->first; ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); @@ -6822,7 +6824,6 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, return Res; ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; bool IsInternal = OutEvent == nullptr; pi_event *Event = OutEvent ? OutEvent : &InternalEvent; @@ -6834,16 +6835,16 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; - if (TmpWaitList.Length) { - + const auto &WaitList = (*Event)->WaitList; + if (WaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } zePrint("calling zeCommandListAppendMemoryCopy() with\n" " ZeEvent %#lx\n", pi_cast(ZeEvent)); - printZeEventList(TmpWaitList); + printZeEventList(WaitList); ZE_CALL(zeCommandListAppendMemoryCopy, (ZeCommandList, Dst, Src, Size, ZeEvent, 0, nullptr)); @@ -6896,15 +6897,16 @@ static pi_result enqueueMemCopyRectHelper( (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; - if (TmpWaitList.Length) { + if (WaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, 
TmpWaitList.Length, TmpWaitList.ZeEventList)); + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } zePrint("calling zeCommandListAppendMemoryCopy() with\n" " ZeEvent %#lx\n", pi_cast(ZeEvent)); - printZeEventList(TmpWaitList); + printZeEventList(WaitList); uint32_t SrcOriginX = pi_cast(SrcOrigin->x_bytes); uint32_t SrcOriginY = pi_cast(SrcOrigin->y_scalar); @@ -7152,10 +7154,11 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; - if (TmpWaitList.Length) { + if (WaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } ZE_CALL( @@ -7165,7 +7168,7 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, zePrint("calling zeCommandListAppendMemoryFill() with\n" " ZeEvent %#lx\n", pi_cast(ZeEvent)); - printZeEventList(TmpWaitList); + printZeEventList(WaitList); // Execute command list asynchronously, as the event will be used // to track down its completion. 
@@ -7574,10 +7577,11 @@ static pi_result enqueueMemImageCommandHelper( (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; - if (TmpWaitList.Length) { + if (WaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } if (CommandType == PI_COMMAND_TYPE_IMAGE_READ) { pi_mem SrcMem = pi_cast(const_cast(Src)); @@ -8571,10 +8575,11 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; + const auto &WaitList = (*Event)->WaitList; const auto &ZeCommandList = CommandList->first; - if (TmpWaitList.Length) { + if (WaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } // TODO: figure out how to translate "flags" ZE_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Ptr, Size)); @@ -8636,10 +8641,11 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; - if (TmpWaitList.Length) { + if (WaitList.Length) { ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, TmpWaitList.Length, TmpWaitList.ZeEventList)); + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } ZE_CALL(zeCommandListAppendMemAdvise, diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 9beb285bd94a8..efb93faef6312 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -581,8 +581,6 @@ struct _pi_device : _pi_object { ZeCache> ZeDeviceCacheProperties; }; -struct _pi_ze_event_list_t; - // Structure describing the specific use of a command-list 
in a queue. // This is because command-lists are re-used across multiple queues // in the same context. From c8ceca3f73b9d594225d20c4af2346a7cd3954e1 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 3 Nov 2022 21:31:58 +0000 Subject: [PATCH 09/33] Remove redundant code --- sycl/plugins/level_zero/pi_level_zero.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index e5c4cc67b9ad0..850788648f23c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1055,16 +1055,10 @@ pi_result _pi_context::finalize() { } } { - std::unordered_set Pools; std::scoped_lock Lock(ZeEventPoolCacheMutex); for (auto &ZePoolCache : ZeEventPoolCache) { - for (auto &ZePool : ZePoolCache) { - if (Pools.find(ZePool) != Pools.end()) - std::cout << "Removing two times" << std::endl; - + for (auto &ZePool : ZePoolCache) ZE_CALL(zeEventPoolDestroy, (ZePool)); - Pools.insert(ZePool); - } ZePoolCache.clear(); } } @@ -1409,7 +1403,6 @@ pi_result _pi_context::getAvailableCommandList( // Immediate commandlists have been pre-allocated and are always available. 
if (Queue->Device->useImmediateCommandLists()) { CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); - if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) return Res; return PI_SUCCESS; @@ -6837,6 +6830,7 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, const auto &ZeCommandList = CommandList->first; const auto &WaitList = (*Event)->WaitList; if (WaitList.Length) { + ZE_CALL(zeCommandListAppendWaitOnEvents, (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } From d649e28111802503533843b101d03a416ac0592b Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 4 Nov 2022 04:20:24 +0000 Subject: [PATCH 10/33] Revert "Simplify barrier insertion in the beginning of command list" This reverts commit 0448cb8c0df9a74e7395fd21e67cd3579cf72605. --- sycl/plugins/level_zero/pi_level_zero.cpp | 60 ++++++++++++++--------- sycl/plugins/level_zero/pi_level_zero.hpp | 12 ++++- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 850788648f23c..c7c39422df8ed 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -766,8 +766,12 @@ pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { this, &SpecialEvent, PI_COMMAND_TYPE_USER, CommandList, /* IsDiscarded */ false, /* ForceHostVisible */ false)); - PI_CALL(piEventRelease(SpecialEvent)); - LastCommandEvent = SpecialEvent; + // We want a barrier in the beginning of a next command list waiting for this + // special event. + ActiveBarriers.push_back(SpecialEvent); + + // We don't need additional dependency through LastCommandEvent. 
+ LastCommandEvent = nullptr; ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, SpecialEvent->ZeEvent)); @@ -1104,9 +1108,24 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, std::vector &EventListToCleanup) { bool UseCopyEngine = CommandList->second.isCopy(this); - if (CommandList->second.SpecialEvent) { - EventListToCleanup.push_back(CommandList->second.SpecialEvent); - CommandList->second.SpecialEvent = nullptr; + if (isInOrderQueue() && isDiscardEvents()) { + // If there were discarded events in the command list then we have to + // release events from wait lists associated with them. + for (auto WaitList : CommandList->second.WaitLists) { + std::list EventsToBeReleased; + WaitList.collectEventsForReleaseAndDestroyPiZeEventList( + EventsToBeReleased); + + // Event may be in the wait list of more than one event. But we have to + // cleanup it only once, that's why use unordered_set to make it happen. + std::unordered_set Events; + std::copy(EventsToBeReleased.begin(), EventsToBeReleased.end(), + std::inserter(Events, Events.begin())); + + for (auto Event : Events) + EventListToCleanup.push_back(Event); + } + CommandList->second.WaitLists.clear(); } // Immediate commandlists do not have an associated fence. @@ -1996,20 +2015,6 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, if (ActiveBarriers.empty()) return PI_SUCCESS; - // For in-order queue every command depends on the previous one so we don't - // need to insert active barriers for every next command list. - // But we have to handle LastCommandEvent as an active barrier if we have - // discard_events mode. - if (isInOrderQueue() && isDiscardEvents() && LastCommandEvent) { - ZE_CALL(zeCommandListAppendBarrier, - (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); - CmdList->second.SpecialEvent = LastCommandEvent; - // This event will be released on command list completion. 
- PI_CALL(piEventRetain(CmdList->second.SpecialEvent)); - LastCommandEvent = nullptr; - return PI_SUCCESS; - } - // Create a wait-list and retain events. This will filter out finished events. _pi_ze_event_list_t ActiveBarriersWaitList; if (auto Res = ActiveBarriersWaitList.createAndRetainPiZeEventList( @@ -2021,9 +2026,13 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, for (pi_event &BarrierEvent : ActiveBarriers) PI_CALL(piEventReleaseInternal(BarrierEvent)); ActiveBarriers.clear(); - ActiveBarriers.insert( - ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, - ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); + + // For in-order queue every command depends on the previous one so we don't + // need to insert active barriers for every next command list. + if (!isInOrderQueue()) + ActiveBarriers.insert( + ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, + ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); // If there are more active barriers, insert a barrier on the command-list. We // do not need an event for finishing so we pass nullptr. @@ -2031,6 +2040,13 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, ActiveBarriersWaitList.Length, ActiveBarriersWaitList.ZeEventList)); + + // Active barriers are released at queue synchronization, but for in-order + // queue we don't keep them till that point so store them in the command list, + // they will be released on completion of command list. 
+ if (isInOrderQueue()) + CmdList->second.WaitLists.push_back(ActiveBarriersWaitList); + return PI_SUCCESS; } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index efb93faef6312..0febbe3a3a8a2 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -581,6 +581,8 @@ struct _pi_device : _pi_object { ZeCache> ZeDeviceCacheProperties; }; +struct _pi_ze_event_list_t; + // Structure describing the specific use of a command-list in a queue. // This is because command-lists are re-used across multiple queues // in the same context. @@ -610,7 +612,7 @@ struct pi_command_list_info_t { // only have last one visible to the host. std::vector EventList{}; - pi_event SpecialEvent = nullptr; + std::list<_pi_ze_event_list_t> WaitLists; size_t size() const { return EventList.size(); } void append(pi_event Event) { EventList.push_back(Event); } @@ -1347,6 +1349,14 @@ struct _pi_ze_event_list_t { this->Length = other.Length; return *this; } + + _pi_ze_event_list_t(const _pi_ze_event_list_t &other) { + this->ZeEventList = other.ZeEventList; + this->PiEventList = other.PiEventList; + this->Length = other.Length; + } + + _pi_ze_event_list_t() {} }; struct _pi_event : _pi_object { From eb5b2cb8650f6913896b64871d43b3562a8ad6c6 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 4 Nov 2022 12:16:57 -0700 Subject: [PATCH 11/33] Always insert a barrier in the beginning of command list --- sycl/plugins/level_zero/pi_level_zero.cpp | 64 ++++++++--------------- sycl/plugins/level_zero/pi_level_zero.hpp | 14 +---- 2 files changed, 24 insertions(+), 54 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c7c39422df8ed..819eb85222a49 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -766,12 +766,8 @@ pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { this, 
&SpecialEvent, PI_COMMAND_TYPE_USER, CommandList, /* IsDiscarded */ false, /* ForceHostVisible */ false)); - // We want a barrier in the beginning of a next command list waiting for this - // special event. - ActiveBarriers.push_back(SpecialEvent); - - // We don't need additional dependency through LastCommandEvent. - LastCommandEvent = nullptr; + PI_CALL(piEventRelease(SpecialEvent)); + LastCommandEvent = SpecialEvent; ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, SpecialEvent->ZeEvent)); @@ -1108,26 +1104,6 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, std::vector &EventListToCleanup) { bool UseCopyEngine = CommandList->second.isCopy(this); - if (isInOrderQueue() && isDiscardEvents()) { - // If there were discarded events in the command list then we have to - // release events from wait lists associated with them. - for (auto WaitList : CommandList->second.WaitLists) { - std::list EventsToBeReleased; - WaitList.collectEventsForReleaseAndDestroyPiZeEventList( - EventsToBeReleased); - - // Event may be in the wait list of more than one event. But we have to - // cleanup it only once, that's why use unordered_set to make it happen. - std::unordered_set Events; - std::copy(EventsToBeReleased.begin(), EventsToBeReleased.end(), - std::inserter(Events, Events.begin())); - - for (auto Event : Events) - EventListToCleanup.push_back(Event); - } - CommandList->second.WaitLists.clear(); - } - // Immediate commandlists do not have an associated fence. if (CommandList->second.ZeFence != nullptr) { // Fence had been signalled meaning the associated command-list completed. 
@@ -1437,6 +1413,7 @@ pi_result _pi_context::getAvailableCommandList( (!ForcedCmdQueue || *ForcedCmdQueue == CommandBatch.OpenCommandList->second.ZeQueue)) { CommandList = CommandBatch.OpenCommandList; + PI_CALL(Queue->insertLastCommandEventBarrier(CommandList)); return PI_SUCCESS; } // If this command isn't allowed to be batched or doesn't match the forced @@ -1504,6 +1481,8 @@ pi_result _pi_context::getAvailableCommandList( .first; } ZeCommandListCache.erase(ZeCommandListIt); + if (auto Res = Queue->insertLastCommandEventBarrier(CommandList)) + return Res; if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) return Res; return PI_SUCCESS; @@ -1531,7 +1510,7 @@ pi_result _pi_context::getAvailableCommandList( true /* QueueLocked */); CommandList = it; CommandList->second.ZeFenceInUse = true; - if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) + if (auto Res = Queue->insertLastCommandEventBarrier(CommandList)) return Res; return PI_SUCCESS; } @@ -1575,6 +1554,7 @@ _pi_queue::createCommandList(bool UseCopyEngine, std::pair( ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal})); + PI_CALL(insertLastCommandEventBarrier(CommandList)); PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); return PI_SUCCESS; } @@ -2009,6 +1989,16 @@ pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) { return CommandListMap.end(); } +pi_result _pi_queue::insertLastCommandEventBarrier(pi_command_list_ptr_t &CmdList) { + if (CmdList != LastCommandList && LastCommandEvent) { + CmdList->second.append(LastCommandEvent); + LastCommandEvent->RefCount.increment(); + ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); + LastCommandEvent = nullptr; + } + return PI_SUCCESS; +} + pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, bool UseCopyEngine) { // Early exit if there are no active barriers. 
@@ -2026,13 +2016,9 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, for (pi_event &BarrierEvent : ActiveBarriers) PI_CALL(piEventReleaseInternal(BarrierEvent)); ActiveBarriers.clear(); - - // For in-order queue every command depends on the previous one so we don't - // need to insert active barriers for every next command list. - if (!isInOrderQueue()) - ActiveBarriers.insert( - ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, - ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); + ActiveBarriers.insert( + ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, + ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length); // If there are more active barriers, insert a barrier on the command-list. We // do not need an event for finishing so we pass nullptr. @@ -2041,12 +2027,6 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, (CmdList->first, nullptr, ActiveBarriersWaitList.Length, ActiveBarriersWaitList.ZeEventList)); - // Active barriers are released at queue synchronization, but for in-order - // queue we don't keep them till that point so store them in the command list, - // they will be released on completion of command list. - if (isInOrderQueue()) - CmdList->second.WaitLists.push_back(ActiveBarriersWaitList); - return PI_SUCCESS; } @@ -2108,7 +2088,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( try { if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr && - !CurQueue->LastCommandEvent->IsDiscarded) { + !CurQueue->LastCommandEvent->IsDiscarded && CurQueue->LastCommandList != CurQueue->CommandListMap.end()) { this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; this->PiEventList = new pi_event[EventListLength + 1]; } else if (EventListLength > 0) { @@ -2201,7 +2181,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // previous command has finished. 
The event associated with the last // enqueued command is added into the waitlist to ensure in-order semantics. if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr && - !CurQueue->LastCommandEvent->IsDiscarded) { + !CurQueue->LastCommandEvent->IsDiscarded && CurQueue->LastCommandList != CurQueue->CommandListMap.end()) { std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); this->ZeEventList[TmpListLength] = CurQueue->LastCommandEvent->ZeEvent; this->PiEventList[TmpListLength] = CurQueue->LastCommandEvent; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 0febbe3a3a8a2..d15583405e5f7 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -581,8 +581,6 @@ struct _pi_device : _pi_object { ZeCache> ZeDeviceCacheProperties; }; -struct _pi_ze_event_list_t; - // Structure describing the specific use of a command-list in a queue. // This is because command-lists are re-used across multiple queues // in the same context. @@ -612,8 +610,6 @@ struct pi_command_list_info_t { // only have last one visible to the host. std::vector EventList{}; - std::list<_pi_ze_event_list_t> WaitLists; - size_t size() const { return EventList.size(); } void append(pi_event Event) { EventList.push_back(Event); } }; @@ -1093,6 +1089,8 @@ struct _pi_queue : _pi_object { pi_result insertActiveBarriers(pi_command_list_ptr_t &CmdList, bool UseCopyEngine); + pi_result insertLastCommandEventBarrier(pi_command_list_ptr_t &CmdList); + // A collection of currently active barriers. // These should be inserted into a command list whenever an available command // list is needed for a command. 
@@ -1349,14 +1347,6 @@ struct _pi_ze_event_list_t { this->Length = other.Length; return *this; } - - _pi_ze_event_list_t(const _pi_ze_event_list_t &other) { - this->ZeEventList = other.ZeEventList; - this->PiEventList = other.PiEventList; - this->Length = other.Length; - } - - _pi_ze_event_list_t() {} }; struct _pi_event : _pi_object { From 118132428d2fc7608e78e06c06ddb24eb17ebb50 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 4 Nov 2022 14:17:55 -0700 Subject: [PATCH 12/33] Fix mistake --- sycl/plugins/level_zero/pi_level_zero.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 819eb85222a49..566bee9efff3f 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1398,6 +1398,7 @@ pi_result _pi_context::getAvailableCommandList( // Immediate commandlists have been pre-allocated and are always available. if (Queue->Device->useImmediateCommandLists()) { CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); + PI_CALL(Queue->insertLastCommandEventBarrier(CommandList)); if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) return Res; return PI_SUCCESS; From 563b05f60d82e08ff78917bc8c8985cf287c054d Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 4 Nov 2022 14:25:57 -0700 Subject: [PATCH 13/33] Fix mistake --- sycl/plugins/level_zero/pi_level_zero.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 566bee9efff3f..cbbccdfb60d08 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2072,6 +2072,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList != NextImmCmdList) { CurQueue->signalEvent(CurQueue->LastCommandList); + 
CurQueue->LastCommandList = CurQueue->CommandListMap.end(); } } } else { From 235e039165e794eba24b9660e2298dc9e1ba697c Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 4 Nov 2022 15:00:06 -0700 Subject: [PATCH 14/33] Add to cache in executeCommandList --- sycl/plugins/level_zero/pi_level_zero.cpp | 24 ++++++++--------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index cbbccdfb60d08..04437e801d6d4 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -661,8 +661,6 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, pi_result _pi_queue::setLastDiscardedEvent(pi_event Event) { try { - // We expect previous event to be in the cache. - assert(LastDiscardedEvent == nullptr); LastDiscardedEvent = new _pi_event(Event->ZeEvent, Event->ZeEventPool, Context, PI_COMMAND_TYPE_USER, true); } catch (const std::bad_alloc &) { @@ -679,17 +677,17 @@ pi_result _pi_queue::setLastDiscardedEvent(pi_event Event) { pi_result _pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { - if (LastCommandEvent && LastCommandEvent->IsDiscarded && - CommandList != CommandListMap.end()) { + if (LastCommandEvent && LastCommandEvent->IsDiscarded) { ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); ZE_CALL(zeCommandListAppendEventReset, (CommandList->first, LastCommandEvent->ZeEvent)); - // Remember last discarded event. Can't put it to the cache right now to - // avoid taking it as the next discarded event which will cause using same - // event two times in a row. We need to round robin between two events. - setLastDiscardedEvent(LastCommandEvent); + // Put previous discarded event to the cache. + if (LastDiscardedEvent) + PI_CALL(addEventToCache(LastDiscardedEvent)); + // Update last discarded event. 
It will be put to the cache after submission of the next command. + PI_CALL(setLastDiscardedEvent(LastCommandEvent)); } return PI_SUCCESS; } @@ -715,13 +713,6 @@ inline static pi_result createEventAndAssociateQueue( // If event is discarded then try to get event from the queue cache. *Event = IsInternal ? Queue->getEventFromCache(ForceHostVisible) : nullptr; - if (IsInternal && Queue->LastDiscardedEvent) { - // We've possibly got discarded event above so it is time to reset the last - // disarded event (if any) and put it to the cache. - PI_CALL(Queue->addEventToCache(Queue->LastDiscardedEvent)); - Queue->LastDiscardedEvent = nullptr; - } - if (*Event == nullptr) PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible, Event)); @@ -1661,8 +1652,9 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, if (!CommandList->second.EventList.empty() && this->LastCommandEvent != CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); - if (this->LastCommandEvent->IsDiscarded) + if (this->LastCommandEvent->IsDiscarded) { PI_CALL(resetLastDiscardedEvent(CommandList)); + } } this->LastCommandList = CommandList; From a32aac6b0acf0bd7690ba26a4ec15ec234c9641d Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 7 Nov 2022 17:50:48 -0800 Subject: [PATCH 15/33] Add guards and clarifying comments --- sycl/plugins/level_zero/pi_level_zero.cpp | 160 +++++++++++++--------- sycl/plugins/level_zero/pi_level_zero.hpp | 49 +++++-- 2 files changed, 134 insertions(+), 75 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 04437e801d6d4..2001e5765e9b6 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -675,8 +675,8 @@ pi_result _pi_queue::setLastDiscardedEvent(pi_event Event) { return PI_SUCCESS; } -pi_result -_pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { +pi_result 
_pi_queue::appendWaitAndResetLastDiscardedEvent( + pi_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); @@ -686,7 +686,8 @@ _pi_queue::resetLastDiscardedEvent(pi_command_list_ptr_t CommandList) { // Put previous discarded event to the cache. if (LastDiscardedEvent) PI_CALL(addEventToCache(LastDiscardedEvent)); - // Update last discarded event. It will be put to the cache after submission of the next command. + // Update last discarded event. It will be put to the cache after submission + // of the next command. PI_CALL(setLastDiscardedEvent(LastCommandEvent)); } return PI_SUCCESS; @@ -707,6 +708,7 @@ inline static pi_result createEventAndAssociateQueue( pi_queue Queue, pi_event *Event, pi_command_type CommandType, pi_command_list_ptr_t CommandList, bool IsInternal = false, bool ForceHostVisible = false) { + if (!ForceHostVisible) ForceHostVisible = DeviceEventsSetting == AllHostVisible; @@ -752,16 +754,20 @@ inline static pi_result createEventAndAssociateQueue( } pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { - pi_event SpecialEvent; - PI_CALL(createEventAndAssociateQueue( - this, &SpecialEvent, PI_COMMAND_TYPE_USER, CommandList, - /* IsDiscarded */ false, /* ForceHostVisible */ false)); + // We signal new event at the end of command list only if we have queue with + // discard_events property and the last command event is discarded. 
+ if (!(ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && + LastCommandEvent && LastCommandEvent->IsDiscarded)) + return PI_SUCCESS; - PI_CALL(piEventRelease(SpecialEvent)); - LastCommandEvent = SpecialEvent; + pi_event Event; + PI_CALL(createEventAndAssociateQueue( + this, &Event, PI_COMMAND_TYPE_USER, CommandList, + /* IsDiscarded */ false, /* ForceHostVisible */ false)) + PI_CALL(piEventReleaseInternal(Event)); + LastCommandEvent = Event; - ZE_CALL(zeCommandListAppendSignalEvent, - (CommandList->first, SpecialEvent->ZeEvent)); + ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, Event->ZeEvent)); return PI_SUCCESS; } @@ -1111,6 +1117,16 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, std::move(std::begin(EventList), std::end(EventList), std::back_inserter(EventListToCleanup)); EventList.clear(); + // We may have additional events to cleanup if queue has discarded events. + // These events are waited by barrier inserted in the beginning of command + // list. + auto &StartingBarrierEvents = CommandList->second.StartingBarrierEvents; + if (!StartingBarrierEvents.empty()) { + std::move(std::begin(StartingBarrierEvents), + std::end(StartingBarrierEvents), + std::back_inserter(EventListToCleanup)); + StartingBarrierEvents.clear(); + } // Standard commandlists move in and out of the cache as they are recycled. // Immediate commandlists are always available. @@ -1389,7 +1405,7 @@ pi_result _pi_context::getAvailableCommandList( // Immediate commandlists have been pre-allocated and are always available. 
if (Queue->Device->useImmediateCommandLists()) { CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); - PI_CALL(Queue->insertLastCommandEventBarrier(CommandList)); + PI_CALL(Queue->insertStartBarrierWaitingForLastEvent(CommandList)); if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) return Res; return PI_SUCCESS; @@ -1405,7 +1421,7 @@ pi_result _pi_context::getAvailableCommandList( (!ForcedCmdQueue || *ForcedCmdQueue == CommandBatch.OpenCommandList->second.ZeQueue)) { CommandList = CommandBatch.OpenCommandList; - PI_CALL(Queue->insertLastCommandEventBarrier(CommandList)); + PI_CALL(Queue->insertStartBarrierWaitingForLastEvent(CommandList)); return PI_SUCCESS; } // If this command isn't allowed to be batched or doesn't match the forced @@ -1473,7 +1489,7 @@ pi_result _pi_context::getAvailableCommandList( .first; } ZeCommandListCache.erase(ZeCommandListIt); - if (auto Res = Queue->insertLastCommandEventBarrier(CommandList)) + if (auto Res = Queue->insertStartBarrierWaitingForLastEvent(CommandList)) return Res; if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) return Res; @@ -1502,7 +1518,7 @@ pi_result _pi_context::getAvailableCommandList( true /* QueueLocked */); CommandList = it; CommandList->second.ZeFenceInUse = true; - if (auto Res = Queue->insertLastCommandEventBarrier(CommandList)) + if (auto Res = Queue->insertStartBarrierWaitingForLastEvent(CommandList)) return Res; return PI_SUCCESS; } @@ -1546,7 +1562,7 @@ _pi_queue::createCommandList(bool UseCopyEngine, std::pair( ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal})); - PI_CALL(insertLastCommandEventBarrier(CommandList)); + PI_CALL(insertStartBarrierWaitingForLastEvent(CommandList)); PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); return PI_SUCCESS; } @@ -1652,8 +1668,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, if (!CommandList->second.EventList.empty() && this->LastCommandEvent != 
CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); - if (this->LastCommandEvent->IsDiscarded) { - PI_CALL(resetLastDiscardedEvent(CommandList)); + if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents()) { + PI_CALL(appendWaitAndResetLastDiscardedEvent(CommandList)); } } @@ -1686,8 +1702,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, } adjustBatchSizeForFullBatch(UseCopyEngine); + CommandBatch.OpenCommandList = CommandListMap.end(); } - CommandBatch.OpenCommandList = CommandListMap.end(); } auto &ZeCommandQueue = CommandList->second.ZeQueue; @@ -1763,11 +1779,14 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // each event in the EventList. PI_CALL(piEventReleaseInternal(HostVisibleEvent)); - if (isInOrderQueue() && isDiscardEvents()) { - // If we have in-order queue with discarded events then we want to treat this event as regular event and use it as a dependency for the next command. + if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents()) { + // If we have in-order queue with discarded events then we want to + // treat this event as regular event. We insert a barrier in the next + // command list to wait for this event. LastCommandEvent = HostVisibleEvent; } else { - // For all other queues treat this as a special event and indicate no cleanup is needed. + // For all other queues treat this as a special event and indicate no + // cleanup is needed. PI_CALL(piEventReleaseInternal(HostVisibleEvent)); HostVisibleEvent->CleanedUp = true; } @@ -1775,19 +1794,22 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // Finally set to signal the host-visible event at the end of the // command-list after a barrier that waits for all commands // completion. 
- if (LastCommandEvent && LastCommandEvent->IsDiscarded) { - // If we the last event is discarded then we already have a barrier inserted, so just signal event. + if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && + LastCommandEvent && LastCommandEvent->IsDiscarded) { + // If we the last event is discarded then we already have a barrier + // inserted, so just signal the event. ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, HostVisibleEvent->ZeEvent)); } else { ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); } - } else if (this->LastCommandEvent && - this->LastCommandEvent->IsDiscarded) { + } else { + // If we don't have host visible proxy then signal event if needed. this->signalEvent(CommandList); } - } else if (this->LastCommandEvent && this->LastCommandEvent->IsDiscarded) { + } else { + // If we don't have host visible proxy then signal event if needed. this->signalEvent(CommandList); } @@ -1982,11 +2004,18 @@ pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) { return CommandListMap.end(); } -pi_result _pi_queue::insertLastCommandEventBarrier(pi_command_list_ptr_t &CmdList) { - if (CmdList != LastCommandList && LastCommandEvent) { - CmdList->second.append(LastCommandEvent); +pi_result _pi_queue::insertStartBarrierWaitingForLastEvent( + pi_command_list_ptr_t &CmdList) { + // If current command list is different from the last command list then insert + // a barrier waiting for the last command event. + if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && + CmdList != LastCommandList && LastCommandEvent) { + // We want this event to live long enough so increment its reference count. + // It will be decremented when command list is reset. 
LastCommandEvent->RefCount.increment(); - ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); + CmdList->second.StartingBarrierEvents.push_back(LastCommandEvent); + ZE_CALL(zeCommandListAppendBarrier, + (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); LastCommandEvent = nullptr; } return PI_SUCCESS; @@ -2015,11 +2044,10 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, // If there are more active barriers, insert a barrier on the command-list. We // do not need an event for finishing so we pass nullptr. - if (ActiveBarriersWaitList.Length) + if (!ActiveBarriers.empty()) ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, ActiveBarriersWaitList.Length, ActiveBarriersWaitList.ZeEventList)); - return PI_SUCCESS; } @@ -2050,39 +2078,57 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( this->ZeEventList = nullptr; this->PiEventList = nullptr; - if (CurQueue->isInOrderQueue()) { + if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { if (CurQueue->Device->useImmediateCommandLists()) { - if (CurQueue->isDiscardEvents()) { - // If we have an in-order queue where some events are discarded and if - // new command list is different from the last used then signal new - // event from the last immediate command list. It is going to be waited - // in the new immediate command list. + if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) { + // If queue is in-order with discarded events and if + // new command list is different from the last used command list then + // signal new event from the last immediate command list. We are going + // to insert a barrier in the new command list waiting for that event. 
auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); auto NextImmCmdList = QueueGroup.ImmCmdLists[QueueGroup.NextIndex]; - if (CurQueue->LastCommandEvent != nullptr && - CurQueue->LastCommandEvent->IsDiscarded && - CurQueue->LastCommandList != CurQueue->CommandListMap.end() && + if (CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList != NextImmCmdList) { CurQueue->signalEvent(CurQueue->LastCommandList); + // Mark the last command list as "closed" event though we don't really + // close immediate command list. It just indicates that we are + // switching command lists. This will be taken into account below - we + // don't need to add the last command event into the wait list in this + // case. CurQueue->LastCommandList = CurQueue->CommandListMap.end(); } } } else { - // Close open command list if command is going to be submitted to a - // different command list. - if (CurQueue->LastCommandEvent != nullptr && - CurQueue->LastCommandList != CurQueue->CommandListMap.end() && - CurQueue->LastCommandList->second.isCopy(CurQueue) != UseCopyEngine) { + // Ensure LastCommandEvent's batch is submitted if it is differrent + // from the one this command is going to. + const auto &OpenCommandList = + CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent); + if (OpenCommandList != CurQueue->CommandListMap.end() && + OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) { + if (auto Res = CurQueue->executeOpenCommandList( - CurQueue->LastCommandList->second.isCopy(CurQueue))) + OpenCommandList->second.isCopy(CurQueue))) return Res; } } } + bool IncludeLastCommandEvent = + CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; + + // If the last command event is not nullptr we still don't need to include + // last command event in the wait list in the two cases: If the last event is + // discarded then we already have a barrier waiting for that event. 
If the + // last command list is closed then we are going to insert a barrier at the + // beginning of the next command list. + if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && + (CurQueue->LastCommandEvent->IsDiscarded || + CurQueue->LastCommandList == CurQueue->CommandListMap.end())) { + IncludeLastCommandEvent = false; + } + try { - if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr && - !CurQueue->LastCommandEvent->IsDiscarded && CurQueue->LastCommandList != CurQueue->CommandListMap.end()) { + if (IncludeLastCommandEvent) { this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; this->PiEventList = new pi_event[EventListLength + 1]; } else if (EventListLength > 0) { @@ -2174,8 +2220,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // For in-order queues, every command should be executed only after the // previous command has finished. The event associated with the last // enqueued command is added into the waitlist to ensure in-order semantics. - if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr && - !CurQueue->LastCommandEvent->IsDiscarded && CurQueue->LastCommandList != CurQueue->CommandListMap.end()) { + if (IncludeLastCommandEvent) { std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); this->ZeEventList[TmpListLength] = CurQueue->LastCommandEvent->ZeEvent; this->PiEventList[TmpListLength] = CurQueue->LastCommandEvent; @@ -6538,12 +6583,7 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, // We use the same approach if // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a // positive value. - // We also use the same approach if we have in-order queue because inserted - // barrier will depend on last command event and every next command will - // depend on event signalled by barrier, so no need to populate ActiveBarriers - // in this case as well. 
- if (NumEventsInWaitList || !UseMultipleCmdlistBarriers || - Queue->isInOrderQueue()) { + if (NumEventsInWaitList || !UseMultipleCmdlistBarriers) { // Retain the events as they will be owned by the result event. _pi_ze_event_list_t TmpWaitList; if (auto Res = TmpWaitList.createAndRetainPiZeEventList( @@ -6566,10 +6606,8 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch)) return Res; - if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { + if (UseMultipleCmdlistBarriers) { // Retain and save the resulting event for future commands. - // This is redundant for in-order queues because we separately handle - // dependency chain between commands in in-order queue. (*Event)->RefCount.increment(); Queue->ActiveBarriers.push_back(*Event); } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index d15583405e5f7..b98d2bedb0fcf 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -610,6 +610,13 @@ struct pi_command_list_info_t { // only have last one visible to the host. std::vector EventList{}; + // Used in ReuseDiscardedEvents mode only. + // Contains events waited by barrier inserted after switching to this command + // list from another command list. This events need to be cleaned up after + // command list reset. Such events are stored separately because we don't want + // such event to affect batching heuristics. + std::list StartingBarrierEvents; + size_t size() const { return EventList.size(); } void append(pi_event Event) { EventList.push_back(Event); } }; @@ -918,13 +925,22 @@ struct _pi_queue : _pi_object { // command is enqueued. pi_event LastCommandEvent = nullptr; + // Holds a pointer to the last discarded event which was reset. + // We don't want to put discarded event to the cache as soon as it is reset to + // avoid using the same discarded event two times in a row. 
So we remember it + // and put to the cache for reuse only after submission of the next command + // with discarded event. LastCommandEvent can't be used for this purpose + // because references event canbe released and LastCommandEvent become + // nullptr. LastDiscardedEvent points to the new pi_event object created using + // reset native handle, so this new object can't be lost and will be used for + // the next command. pi_event LastDiscardedEvent = nullptr; + // Set LastDiscardedEvent data member. It creates new pi_event using native + // handle from the provided pi_event object. pi_result setLastDiscardedEvent(pi_event Event); // Keep track of the last command list used by in-order queue. - // This is needed because we need to handle the change of the command list in - // a specific way. pi_command_list_ptr_t LastCommandList = CommandListMap.end(); // Caches of events for reuse. @@ -939,17 +955,18 @@ struct _pi_queue : _pi_object { // Add event to the queue's cache. pi_result addEventToCache(pi_event Event); - // Append command to provided command list to reset the last discarded event. - // If we have in-order and discard_events mode we reset and reuse events in - // scope of the same command lists. This method allows to wait for the last - // discarded event, reset it. - pi_result resetLastDiscardedEvent(pi_command_list_ptr_t); - - // Append command to provided command list to signal new event. - // While we submit commands in scope of the same command list we can reuse - // events but when we switch to a different command list we currently use a - // new event. This method is used to signal new event from the last used - // command list. This new event will be waited in new command list. + // Append command to provided command list to wait for and reset the last + // discarded event. If we have in-order and discard_events mode we reset and + // reuse discarded events in scope of the same command list. 
This method + // allows to wait for the last discarded event and reset it after command + // submission. + pi_result appendWaitAndResetLastDiscardedEvent(pi_command_list_ptr_t); + + // For in-order queue append command to the command list to signal new event + // if the last event in the command list is discarded. While we submit + // commands in scope of the same command list we can reset and reuse events + // but when we switch to a different command list we currently need to signal + // new event and wait for it in the new command list using barrier. pi_result signalEvent(pi_command_list_ptr_t); // Kernel is not necessarily submitted for execution during @@ -1089,7 +1106,11 @@ struct _pi_queue : _pi_object { pi_result insertActiveBarriers(pi_command_list_ptr_t &CmdList, bool UseCopyEngine); - pi_result insertLastCommandEventBarrier(pi_command_list_ptr_t &CmdList); + // Insert a barrier waiting for the last command event into the beginning of + // command list if queue is in-order and has discard_events property. It + // allows to reset and reuse event handles in scope of each command list. + pi_result + insertStartBarrierWaitingForLastEvent(pi_command_list_ptr_t &CmdList); // A collection of currently active barriers. 
// These should be inserted into a command list whenever an available command From 7c9f81984da86e88a64d73edc9ec000411c29002 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Wed, 16 Nov 2022 01:48:50 -0800 Subject: [PATCH 16/33] Get rid of LastDiscardedEvent and its methods --- sycl/plugins/level_zero/pi_level_zero.cpp | 43 +++++++++-------------- sycl/plugins/level_zero/pi_level_zero.hpp | 15 -------- 2 files changed, 17 insertions(+), 41 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 5b60553668d7e..407c36e90311a 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -659,22 +659,6 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, if (!(condition)) \ return error; -pi_result _pi_queue::setLastDiscardedEvent(pi_event Event) { - try { - LastDiscardedEvent = new _pi_event(Event->ZeEvent, Event->ZeEventPool, - Context, PI_COMMAND_TYPE_USER, true); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (Event->isHostVisible()) - LastDiscardedEvent->HostVisibleEvent = LastDiscardedEvent; - - return PI_SUCCESS; -} - pi_result _pi_queue::appendWaitAndResetLastDiscardedEvent( pi_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { @@ -683,13 +667,23 @@ pi_result _pi_queue::appendWaitAndResetLastDiscardedEvent( ZE_CALL(zeCommandListAppendEventReset, (CommandList->first, LastCommandEvent->ZeEvent)); - // Put previous discarded event to the cache. - if (LastDiscardedEvent) - PI_CALL(addEventToCache(LastDiscardedEvent)); - // Update last discarded event. It will be put to the cache after submission - // of the next command. - PI_CALL(setLastDiscardedEvent(LastCommandEvent)); + // Create copy of pi_event but with the same ze_event_handle_t. We are going to use this pi_event for the next command with discarded event. 
+ pi_event PiEvent; + try { + PiEvent = new _pi_event(LastCommandEvent->ZeEvent, LastCommandEvent->ZeEventPool, + Context, PI_COMMAND_TYPE_USER, true); + } catch (const std::bad_alloc &) { + return PI_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + + if (LastCommandEvent->isHostVisible()) + PiEvent->HostVisibleEvent = PiEvent; + + PI_CALL(addEventToCache(PiEvent)); } + return PI_SUCCESS; } @@ -774,7 +768,7 @@ pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { pi_event _pi_queue::getEventFromCache(bool HostVisible) { auto Cache = getEventCache(HostVisible); - if (Cache->empty()) + if (Cache->size() < 2) return nullptr; auto It = Cache->begin(); @@ -3898,9 +3892,6 @@ static pi_result piQueueReleaseInternal(pi_queue Queue) { if (!Queue->RefCount.decrementAndTest()) return PI_SUCCESS; - if (Queue->LastDiscardedEvent) - PI_CALL(Queue->addEventToCache(Queue->LastDiscardedEvent)); - for (auto Cache : Queue->EventCaches) for (auto Event : Cache) PI_CALL(piEventReleaseInternal(Event)); diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 5cf46f176a6f8..2ff0d039f75df 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -925,21 +925,6 @@ struct _pi_queue : _pi_object { // command is enqueued. pi_event LastCommandEvent = nullptr; - // Holds a pointer to the last discarded event which was reset. - // We don't want to put discarded event to the cache as soon as it is reset to - // avoid using the same discarded event two times in a row. So we remember it - // and put to the cache for reuse only after submission of the next command - // with discarded event. LastCommandEvent can't be used for this purpose - // because references event canbe released and LastCommandEvent become - // nullptr. 
LastDiscardedEvent points to the new pi_event object created using - // reset native handle, so this new object can't be lost and will be used for - // the next command. - pi_event LastDiscardedEvent = nullptr; - - // Set LastDiscardedEvent data member. It creates new pi_event using native - // handle from the provided pi_event object. - pi_result setLastDiscardedEvent(pi_event Event); - // Keep track of the last command list used by in-order queue. pi_command_list_ptr_t LastCommandList = CommandListMap.end(); From 75db6a12c9fdbc91134257ff91dfd5522ba83d36 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Wed, 16 Nov 2022 01:56:03 -0800 Subject: [PATCH 17/33] Get rid of StartingBarrierEvents despite of affecting batching heuristics --- sycl/plugins/level_zero/pi_level_zero.cpp | 12 +----------- sycl/plugins/level_zero/pi_level_zero.hpp | 7 ------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 407c36e90311a..2bc5ac994f367 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1125,16 +1125,6 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, } } } - // We may have additional events to cleanup if queue has discarded events. - // These events are waited by barrier inserted in the beginning of command - // list. - auto &StartingBarrierEvents = CommandList->second.StartingBarrierEvents; - if (!StartingBarrierEvents.empty()) { - std::move(std::begin(StartingBarrierEvents), - std::end(StartingBarrierEvents), - std::back_inserter(EventListToCleanup)); - StartingBarrierEvents.clear(); - } // Standard commandlists move in and out of the cache as they are recycled. // Immediate commandlists are always available. @@ -2021,7 +2011,7 @@ pi_result _pi_queue::insertStartBarrierWaitingForLastEvent( // We want this event to live long enough so increment its reference count. 
// It will be decremented when command list is reset. LastCommandEvent->RefCount.increment(); - CmdList->second.StartingBarrierEvents.push_back(LastCommandEvent); + CmdList->second.EventList.push_back(LastCommandEvent); ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); LastCommandEvent = nullptr; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 2ff0d039f75df..195d93bfb6bc6 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -610,13 +610,6 @@ struct pi_command_list_info_t { // only have last one visible to the host. std::vector EventList{}; - // Used in ReuseDiscardedEvents mode only. - // Contains events waited by barrier inserted after switching to this command - // list from another command list. This events need to be cleaned up after - // command list reset. Such events are stored separately because we don't want - // such event to affect batching heuristics. - std::list StartingBarrierEvents; - size_t size() const { return EventList.size(); } void append(pi_event Event) { EventList.push_back(Event); } }; From 3f337051dd3d907794e263b06bf5d19fa4b40c82 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Wed, 16 Nov 2022 02:33:00 -0800 Subject: [PATCH 18/33] Make signalled event to be referenced by first command of the next cmd list --- sycl/plugins/level_zero/pi_level_zero.cpp | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 2bc5ac994f367..e7c09ebb7937b 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2008,10 +2008,6 @@ pi_result _pi_queue::insertStartBarrierWaitingForLastEvent( // a barrier waiting for the last command event. 
if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && CmdList != LastCommandList && LastCommandEvent) { - // We want this event to live long enough so increment its reference count. - // It will be decremented when command list is reset. - LastCommandEvent->RefCount.increment(); - CmdList->second.EventList.push_back(LastCommandEvent); ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); LastCommandEvent = nullptr; @@ -2088,12 +2084,6 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( if (CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList != NextImmCmdList) { CurQueue->signalEvent(CurQueue->LastCommandList); - // Mark the last command list as "closed" event though we don't really - // close immediate command list. It just indicates that we are - // switching command lists. This will be taken into account below - we - // don't need to add the last command event into the wait list in this - // case. - CurQueue->LastCommandList = CurQueue->CommandListMap.end(); } } } else { @@ -2114,16 +2104,9 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( bool IncludeLastCommandEvent = CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; - // If the last command event is not nullptr we still don't need to include - // last command event in the wait list in the two cases: If the last event is - // discarded then we already have a barrier waiting for that event. If the - // last command list is closed then we are going to insert a barrier at the - // beginning of the next command list. - if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && - (CurQueue->LastCommandEvent->IsDiscarded || - CurQueue->LastCommandList == CurQueue->CommandListMap.end())) { + // If the last event is discarded then we already have a barrier waiting for that event, so don't need to include the last command event into the wait list. 
+ if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && CurQueue->LastCommandEvent->IsDiscarded) IncludeLastCommandEvent = false; - } try { if (IncludeLastCommandEvent) { From ec1ba6b5db40105f82478c069852cf23bf4a8a23 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Wed, 16 Nov 2022 08:56:46 -0800 Subject: [PATCH 19/33] Add comments and rename methods --- sycl/plugins/level_zero/pi_level_zero.cpp | 31 +++++++++++++---------- sycl/plugins/level_zero/pi_level_zero.hpp | 20 +++++++++------ 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index e7c09ebb7937b..dd23b1e52c78f 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -659,7 +659,7 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, if (!(condition)) \ return error; -pi_result _pi_queue::appendWaitAndResetLastDiscardedEvent( +pi_result _pi_queue::appendWaitAndResetIfLastEventDiscarded( pi_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { ZE_CALL(zeCommandListAppendBarrier, @@ -747,7 +747,7 @@ inline static pi_result createEventAndAssociateQueue( return PI_SUCCESS; } -pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { +pi_result _pi_queue::signalEventFromCmdListIfLastEventDiscarded(pi_command_list_ptr_t CommandList) { // We signal new event at the end of command list only if we have queue with // discard_events property and the last command event is discarded. if (!(ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && @@ -766,11 +766,14 @@ pi_result _pi_queue::signalEvent(pi_command_list_ptr_t CommandList) { } pi_event _pi_queue::getEventFromCache(bool HostVisible) { - auto Cache = getEventCache(HostVisible); + auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; + // If we don't have any events, return nullptr. 
+ // If we have only a single event then it was used by the last command and we can't use it now because we have to enforce round robin between two events. if (Cache->size() < 2) return nullptr; + // If there are two events then return an event from the beginning of the list since event of the last command is added to the end of the list. auto It = Cache->begin(); pi_event RetEvent = *It; Cache->erase(It); @@ -778,7 +781,7 @@ pi_event _pi_queue::getEventFromCache(bool HostVisible) { } pi_result _pi_queue::addEventToCache(pi_event Event) { - auto Cache = getEventCache(Event->isHostVisible()); + auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; Cache->emplace_back(Event); return PI_SUCCESS; } @@ -1403,7 +1406,7 @@ pi_result _pi_context::getAvailableCommandList( // Immediate commandlists have been pre-allocated and are always available. if (Queue->Device->useImmediateCommandLists()) { CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); - PI_CALL(Queue->insertStartBarrierWaitingForLastEvent(CommandList)); + PI_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) return Res; return PI_SUCCESS; @@ -1419,7 +1422,7 @@ pi_result _pi_context::getAvailableCommandList( (!ForcedCmdQueue || *ForcedCmdQueue == CommandBatch.OpenCommandList->second.ZeQueue)) { CommandList = CommandBatch.OpenCommandList; - PI_CALL(Queue->insertStartBarrierWaitingForLastEvent(CommandList)); + PI_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); return PI_SUCCESS; } // If this command isn't allowed to be batched or doesn't match the forced @@ -1487,7 +1490,7 @@ pi_result _pi_context::getAvailableCommandList( .first; } ZeCommandListCache.erase(ZeCommandListIt); - if (auto Res = Queue->insertStartBarrierWaitingForLastEvent(CommandList)) + if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) return Res; if (auto Res = 
Queue->insertActiveBarriers(CommandList, UseCopyEngine)) return Res; @@ -1516,7 +1519,7 @@ pi_result _pi_context::getAvailableCommandList( true /* QueueLocked */); CommandList = it; CommandList->second.ZeFenceInUse = true; - if (auto Res = Queue->insertStartBarrierWaitingForLastEvent(CommandList)) + if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) return Res; return PI_SUCCESS; } @@ -1560,7 +1563,7 @@ _pi_queue::createCommandList(bool UseCopyEngine, std::pair( ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal})); - PI_CALL(insertStartBarrierWaitingForLastEvent(CommandList)); + PI_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); return PI_SUCCESS; } @@ -1667,7 +1670,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, this->LastCommandEvent != CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents()) { - PI_CALL(appendWaitAndResetLastDiscardedEvent(CommandList)); + PI_CALL(appendWaitAndResetIfLastEventDiscarded(CommandList)); } } @@ -1804,11 +1807,11 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, } } else { // If we don't have host visible proxy then signal event if needed. - this->signalEvent(CommandList); + this->signalEventFromCmdListIfLastEventDiscarded(CommandList); } } else { // If we don't have host visible proxy then signal event if needed. - this->signalEvent(CommandList); + this->signalEventFromCmdListIfLastEventDiscarded(CommandList); } // Close the command list and have it ready for dispatch. 
@@ -2002,7 +2005,7 @@ pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) { return CommandListMap.end(); } -pi_result _pi_queue::insertStartBarrierWaitingForLastEvent( +pi_result _pi_queue::insertStartBarrierIfDiscardEventsMode( pi_command_list_ptr_t &CmdList) { // If current command list is different from the last command list then insert // a barrier waiting for the last command event. @@ -2083,7 +2086,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( auto NextImmCmdList = QueueGroup.ImmCmdLists[QueueGroup.NextIndex]; if (CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList != NextImmCmdList) { - CurQueue->signalEvent(CurQueue->LastCommandList); + CurQueue->signalEventFromCmdListIfLastEventDiscarded(CurQueue->LastCommandList); } } } else { diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 195d93bfb6bc6..b30b1e496e1ca 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -918,14 +918,18 @@ struct _pi_queue : _pi_object { // command is enqueued. pi_event LastCommandEvent = nullptr; - // Keep track of the last command list used by in-order queue. + // This data member is used only for in-order queue with discard_events property. + // For in-order queues with discarded events we reset and reuse events in scope of each command list but to switch between command lists we have to use new event. + // This data member keeps track of the last used command list and allows to handle switch of immediate command lists because immediate command lists are never closed unlike regular command lists. pi_command_list_ptr_t LastCommandList = CommandListMap.end(); - // Caches of events for reuse. + // This data member is used only for in-order queue with discard_events property. + // It is a vector of 2 lists: for host-visible and device-scope events. 
They are separated to allow faster access to stored events depending on requested type of event. + // Each list contains events which can be reused in scope of command list. Two events are enough for reset and reuse inside each command list moreover those two events can be used for all command lists in the queue, thus those lists are going to contain two elements each at maximum. + // We release leftover events in the cache at the queue destruction. std::vector> EventCaches{2}; - auto getEventCache(bool HostVisible) { - return HostVisible ? &EventCaches[0] : &EventCaches[1]; - } + + // The following 4 methods are used only for in-order queues with discard_events property. // Get event from the queue's cache. pi_event getEventFromCache(bool HostVisible); @@ -938,14 +942,14 @@ struct _pi_queue : _pi_object { // reuse discarded events in scope of the same command list. This method // allows to wait for the last discarded event and reset it after command // submission. - pi_result appendWaitAndResetLastDiscardedEvent(pi_command_list_ptr_t); + pi_result appendWaitAndResetIfLastEventDiscarded(pi_command_list_ptr_t); // For in-order queue append command to the command list to signal new event // if the last event in the command list is discarded. While we submit // commands in scope of the same command list we can reset and reuse events // but when we switch to a different command list we currently need to signal // new event and wait for it in the new command list using barrier. - pi_result signalEvent(pi_command_list_ptr_t); + pi_result signalEventFromCmdListIfLastEventDiscarded(pi_command_list_ptr_t); // Kernel is not necessarily submitted for execution during // piEnqueueKernelLaunch, it may be batched. That's why we need to save the @@ -1088,7 +1092,7 @@ struct _pi_queue : _pi_object { // command list if queue is in-order and has discard_events property. It // allows to reset and reuse event handles in scope of each command list. 
pi_result - insertStartBarrierWaitingForLastEvent(pi_command_list_ptr_t &CmdList); + insertStartBarrierIfDiscardEventsMode(pi_command_list_ptr_t &CmdList); // A collection of currently active barriers. // These should be inserted into a command list whenever an available command From 074982775eb5f045095e86a02fabbdc53e0e60c1 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 16 Nov 2022 08:58:34 -0800 Subject: [PATCH 20/33] Formatting --- sycl/plugins/level_zero/pi_level_zero.cpp | 27 +++++++++++++++-------- sycl/plugins/level_zero/pi_level_zero.hpp | 25 ++++++++++++++------- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index dd23b1e52c78f..93edd54e31720 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -667,11 +667,13 @@ pi_result _pi_queue::appendWaitAndResetIfLastEventDiscarded( ZE_CALL(zeCommandListAppendEventReset, (CommandList->first, LastCommandEvent->ZeEvent)); - // Create copy of pi_event but with the same ze_event_handle_t. We are going to use this pi_event for the next command with discarded event. + // Create copy of pi_event but with the same ze_event_handle_t. We are going + // to use this pi_event for the next command with discarded event. pi_event PiEvent; try { - PiEvent = new _pi_event(LastCommandEvent->ZeEvent, LastCommandEvent->ZeEventPool, - Context, PI_COMMAND_TYPE_USER, true); + PiEvent = new _pi_event(LastCommandEvent->ZeEvent, + LastCommandEvent->ZeEventPool, Context, + PI_COMMAND_TYPE_USER, true); } catch (const std::bad_alloc &) { return PI_ERROR_OUT_OF_HOST_MEMORY; } catch (...) 
{ @@ -747,7 +749,8 @@ inline static pi_result createEventAndAssociateQueue( return PI_SUCCESS; } -pi_result _pi_queue::signalEventFromCmdListIfLastEventDiscarded(pi_command_list_ptr_t CommandList) { +pi_result _pi_queue::signalEventFromCmdListIfLastEventDiscarded( + pi_command_list_ptr_t CommandList) { // We signal new event at the end of command list only if we have queue with // discard_events property and the last command event is discarded. if (!(ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && @@ -769,11 +772,13 @@ pi_event _pi_queue::getEventFromCache(bool HostVisible) { auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; // If we don't have any events, return nullptr. - // If we have only a single event then it was used by the last command and we can't use it now because we have to enforce round robin between two events. + // If we have only a single event then it was used by the last command and we + // can't use it now because we have to enforce round robin between two events. if (Cache->size() < 2) return nullptr; - // If there are two events then return an event from the beginning of the list since event of the last command is added to the end of the list. + // If there are two events then return an event from the beginning of the list + // since event of the last command is added to the end of the list. 
auto It = Cache->begin(); pi_event RetEvent = *It; Cache->erase(It); @@ -2086,7 +2091,8 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( auto NextImmCmdList = QueueGroup.ImmCmdLists[QueueGroup.NextIndex]; if (CurQueue->LastCommandList != CurQueue->CommandListMap.end() && CurQueue->LastCommandList != NextImmCmdList) { - CurQueue->signalEventFromCmdListIfLastEventDiscarded(CurQueue->LastCommandList); + CurQueue->signalEventFromCmdListIfLastEventDiscarded( + CurQueue->LastCommandList); } } } else { @@ -2107,8 +2113,11 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( bool IncludeLastCommandEvent = CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; - // If the last event is discarded then we already have a barrier waiting for that event, so don't need to include the last command event into the wait list. - if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && CurQueue->LastCommandEvent->IsDiscarded) + // If the last event is discarded then we already have a barrier waiting for + // that event, so don't need to include the last command event into the wait + // list. + if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && + CurQueue->LastCommandEvent->IsDiscarded) IncludeLastCommandEvent = false; try { diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index b30b1e496e1ca..a297f42b4a4cd 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -918,18 +918,27 @@ struct _pi_queue : _pi_object { // command is enqueued. pi_event LastCommandEvent = nullptr; - // This data member is used only for in-order queue with discard_events property. - // For in-order queues with discarded events we reset and reuse events in scope of each command list but to switch between command lists we have to use new event. 
- // This data member keeps track of the last used command list and allows to handle switch of immediate command lists because immediate command lists are never closed unlike regular command lists. + // This data member is used only for in-order queue with discard_events + // property. For in-order queues with discarded events we reset and reuse + // events in scope of each command list but to switch between command lists we + // have to use new event. This data member keeps track of the last used + // command list and allows to handle switch of immediate command lists because + // immediate command lists are never closed unlike regular command lists. pi_command_list_ptr_t LastCommandList = CommandListMap.end(); - // This data member is used only for in-order queue with discard_events property. - // It is a vector of 2 lists: for host-visible and device-scope events. They are separated to allow faster access to stored events depending on requested type of event. - // Each list contains events which can be reused in scope of command list. Two events are enough for reset and reuse inside each command list moreover those two events can be used for all command lists in the queue, thus those lists are going to contain two elements each at maximum. - // We release leftover events in the cache at the queue destruction. + // This data member is used only for in-order queue with discard_events + // property. It is a vector of 2 lists: for host-visible and device-scope + // events. They are separated to allow faster access to stored events + // depending on requested type of event. Each list contains events which can + // be reused in scope of command list. Two events are enough for reset and + // reuse inside each command list moreover those two events can be used for + // all command lists in the queue, thus those lists are going to contain two + // elements each at maximum. We release leftover events in the cache at the + // queue destruction. 
std::vector> EventCaches{2}; - // The following 4 methods are used only for in-order queues with discard_events property. + // The following 4 methods are used only for in-order queues with + // discard_events property. // Get event from the queue's cache. pi_event getEventFromCache(bool HostVisible); From e46d7ace2ca6d14d458ef26f2b277f597aeae033 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 10:24:20 -0800 Subject: [PATCH 21/33] Update comments --- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- sycl/plugins/level_zero/pi_level_zero.hpp | 138 ++++++++++++++-------- 2 files changed, 90 insertions(+), 50 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 93edd54e31720..69a0d30b00d0f 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -667,7 +667,7 @@ pi_result _pi_queue::appendWaitAndResetIfLastEventDiscarded( ZE_CALL(zeCommandListAppendEventReset, (CommandList->first, LastCommandEvent->ZeEvent)); - // Create copy of pi_event but with the same ze_event_handle_t. We are going + // Create new pi_event but with the same ze_event_handle_t. We are going // to use this pi_event for the next command with discarded event. pi_event PiEvent; try { diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index a297f42b4a4cd..8c4f4fd664d7c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -609,7 +609,6 @@ struct pi_command_list_info_t { // TODO: use this for optimizing events in the same command-list, e.g. // only have last one visible to the host. std::vector EventList{}; - size_t size() const { return EventList.size(); } void append(pi_event Event) { EventList.push_back(Event); } }; @@ -918,48 +917,6 @@ struct _pi_queue : _pi_object { // command is enqueued. 
pi_event LastCommandEvent = nullptr; - // This data member is used only for in-order queue with discard_events - // property. For in-order queues with discarded events we reset and reuse - // events in scope of each command list but to switch between command lists we - // have to use new event. This data member keeps track of the last used - // command list and allows to handle switch of immediate command lists because - // immediate command lists are never closed unlike regular command lists. - pi_command_list_ptr_t LastCommandList = CommandListMap.end(); - - // This data member is used only for in-order queue with discard_events - // property. It is a vector of 2 lists: for host-visible and device-scope - // events. They are separated to allow faster access to stored events - // depending on requested type of event. Each list contains events which can - // be reused in scope of command list. Two events are enough for reset and - // reuse inside each command list moreover those two events can be used for - // all command lists in the queue, thus those lists are going to contain two - // elements each at maximum. We release leftover events in the cache at the - // queue destruction. - std::vector> EventCaches{2}; - - // The following 4 methods are used only for in-order queues with - // discard_events property. - - // Get event from the queue's cache. - pi_event getEventFromCache(bool HostVisible); - - // Add event to the queue's cache. - pi_result addEventToCache(pi_event Event); - - // Append command to provided command list to wait for and reset the last - // discarded event. If we have in-order and discard_events mode we reset and - // reuse discarded events in scope of the same command list. This method - // allows to wait for the last discarded event and reset it after command - // submission. 
- pi_result appendWaitAndResetIfLastEventDiscarded(pi_command_list_ptr_t); - - // For in-order queue append command to the command list to signal new event - // if the last event in the command list is discarded. While we submit - // commands in scope of the same command list we can reset and reuse events - // but when we switch to a different command list we currently need to signal - // new event and wait for it in the new command list using barrier. - pi_result signalEventFromCmdListIfLastEventDiscarded(pi_command_list_ptr_t); - // Kernel is not necessarily submitted for execution during // piEnqueueKernelLaunch, it may be batched. That's why we need to save the // list of kernels which is going to be submitted but have not been submitted @@ -1097,12 +1054,6 @@ struct _pi_queue : _pi_object { pi_result insertActiveBarriers(pi_command_list_ptr_t &CmdList, bool UseCopyEngine); - // Insert a barrier waiting for the last command event into the beginning of - // command list if queue is in-order and has discard_events property. It - // allows to reset and reuse event handles in scope of each command list. - pi_result - insertStartBarrierIfDiscardEventsMode(pi_command_list_ptr_t &CmdList); - // A collection of currently active barriers. // These should be inserted into a command list whenever an available command // list is needed for a command. @@ -1122,6 +1073,95 @@ struct _pi_queue : _pi_object { // Indicates that the queue is healthy and all operations on it are OK. bool Healthy{true}; + + // The following data structures and methods are used only for handling + // in-order queue with discard_events property. Some commands in such queue + // may have discarded event. Which means that event is not visible outside of + // the plugin. It is possible to reset and reuse discarded events in the same + // in-order queue because of the dependency between commands. We don't have to + // wait event completion to do this. 
We use the following 2-event model to + // reuse events inside each command list: + // + // Operation1 = zeCommandListAppendMemoryCopy (signal ze_event1) + // zeCommandListAppendBarrier(wait for ze_event1) + // zeCommandListAppendEventReset(ze_event1) + // # Create new pi_event using ze_event1 and append to the cache. + // + // Operation2 = zeCommandListAppendMemoryCopy (signal ze_event2) + // zeCommandListAppendBarrier(wait for ze_event2) + // zeCommandListAppendEventReset(ze_event2) + // # Create new pi_event using ze_event2 and append to the cache. + // + // # Get pi_event from the beginning of the cache because there are two events + // # there. So it is guaranteed that we do round-robin between two events - + // # event from the last command is appended to the cache. + // Operation3 = zeCommandListAppendMemoryCopy (signal ze_event1) + // # The same ze_event1 is used for Operation1 and Operation3. + // + // When we switch to a different command list we need to signal new event and + // wait for it in the new command list using barrier. + // [CmdList1] + // Operation1 = zeCommandListAppendMemoryCopy (signal event1) + // zeCommandListAppendBarrier(wait for event1) + // zeCommandListAppendEventReset(event1) + // zeCommandListAppendSignalEvent(NewEvent) + // + // [CmdList2] + // zeCommandListAppendBarrier(wait for NewEvent) + // + // This barrier guarantees that command list execution starts only after + // completion of previous command list which signals aforementioned event. It + // allows to reset and reuse same event handles inside all command lists in + // scope of the queue. It means that we need 2 reusable events of each type + // (host-visible and device-scope) per queue at maximum. + + // This data member keeps track of the last used command list and allows to + // handle switch of immediate command lists because immediate command lists + // are never closed unlike regular command lists.
+ pi_command_list_ptr_t LastCommandList = CommandListMap.end(); + + // Vector of 2 lists of reusable events: host-visible and device-scope. + // They are separated to allow faster access to stored events depending on + // requested type of event. Each list contains events which can be reused + // inside all command lists in the queue as described in the 2-event model. + // Leftover events in the cache are released at the queue destruction. + std::vector> EventCaches{2}; + + // Get event from the queue's cache. + // Returns nullptr if the cache doesn't contain any reusable events or if the + // cache contains only one event which corresponds to the previous command and + // can't be used for the current command because we can't use the same event + // two times in a row and have to do round-robin between two events. Otherwise + // it picks an event from the beginning of the cache and returns it. Event + // from the last command is always appended to the end of the list. + pi_event getEventFromCache(bool HostVisible); + + // Put pi_event to the cache. Provided pi_event object is not used by + // any command but its ZeEvent is used by many pi_event objects. + // Commands to wait and reset ZeEvent must be submitted to the queue before + // calling this method. + pi_result addEventToCache(pi_event Event); + + // Append command to provided command list to wait and reset the last event if + // it is discarded and create new pi_event wrapper using the same native event + // and put it to the cache. We call this method after each command submission + // to make native event available to use by next commands. + pi_result appendWaitAndResetIfLastEventDiscarded(pi_command_list_ptr_t); + + // Append command to the command list to signal new event if the last event in + // the command list is discarded.
While we submit commands in scope of the + // same command list we can reset and reuse events but when we switch to a + // different command list we currently need to signal new event and wait for + // it in the new command list using barrier. + pi_result signalEventFromCmdListIfLastEventDiscarded(pi_command_list_ptr_t); + + // Insert a barrier waiting for the last command event into the beginning of + // command list. This barrier guarantees that command list execution starts + // only after completion of previous command list which signals aforementioned + // event. It allows to reset and reuse same event handles inside all command + // lists in the queue. + pi_result + insertStartBarrierIfDiscardEventsMode(pi_command_list_ptr_t &CmdList); }; struct _pi_mem : _pi_object { From c60f72c77245ab7dc77d4ab42b6c6774f6faae76 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 13:35:35 -0800 Subject: [PATCH 22/33] Rename LastCommandList->LastUsedCommandList --- sycl/plugins/level_zero/pi_level_zero.cpp | 12 ++++++------ sycl/plugins/level_zero/pi_level_zero.hpp | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 69a0d30b00d0f..a4c04dc67aaae 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1679,7 +1679,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, } } - this->LastCommandList = CommandList; + this->LastUsedCommandList = CommandList; if (!Device->useImmediateCommandLists()) { // Batch if allowed to, but don't batch if we know there are no kernels @@ -1821,7 +1821,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // Close the command list and have it ready for dispatch. 
ZE_CALL(zeCommandListClose, (CommandList->first)); - this->LastCommandList = CommandListMap.end(); + this->LastUsedCommandList = CommandListMap.end(); // Offload command list to the GPU for asynchronous execution auto ZeCommandList = CommandList->first; auto ZeResult = ZE_CALL_NOCHECK( @@ -2015,7 +2015,7 @@ pi_result _pi_queue::insertStartBarrierIfDiscardEventsMode( // If current command list is different from the last command list then insert // a barrier waiting for the last command event. if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && - CmdList != LastCommandList && LastCommandEvent) { + CmdList != LastUsedCommandList && LastCommandEvent) { ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); LastCommandEvent = nullptr; @@ -2089,10 +2089,10 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // to insert a barrier in the new command list waiting for that event. auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); auto NextImmCmdList = QueueGroup.ImmCmdLists[QueueGroup.NextIndex]; - if (CurQueue->LastCommandList != CurQueue->CommandListMap.end() && - CurQueue->LastCommandList != NextImmCmdList) { + if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && + CurQueue->LastUsedCommandList != NextImmCmdList) { CurQueue->signalEventFromCmdListIfLastEventDiscarded( - CurQueue->LastCommandList); + CurQueue->LastUsedCommandList); } } } else { diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 8c4f4fd664d7c..7f84c1e03c7ad 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -1118,7 +1118,7 @@ struct _pi_queue : _pi_object { // This data member keeps track of the last used command list and allows to // handle switch of immediate command lists because immediate command lists // are never closed unlike regular command lists. 
- pi_command_list_ptr_t LastCommandList = CommandListMap.end(); + pi_command_list_ptr_t LastUsedCommandList = CommandListMap.end(); // Vector of 2 lists of reusable events: host-visible and device-scope. // They are separated to allow faster access to stored events depending on From 5bd9c1913d5bcdbd2dadacc65eda3413370afc17 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 13:44:56 -0800 Subject: [PATCH 23/33] Document env variable --- sycl/doc/EnvironmentVariables.md | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 046358b3f157c..502077247e127 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -249,6 +249,7 @@ variables in production code. | `SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS` | Integer | When set to a positive value enables use of multiple Level Zero commandlists when submitting barriers. Default is 1. | | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL` | Integer | When set to a positive value enables use of a copy engine for memory fill operations. Default is 0. | | `SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION` | Integer | When set to "0" tells to use single root-device allocation for all devices in a context where all devices have same root. Otherwise performs regular buffer migration. Default is 1. | +| `SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS` | Integer | When set to a positive value enables the mode when discarded Level Zero events are reset and reused in scope of the same in-order queue based on the dependency chain between commands. Default is 1. 
| ## Debugging variables for CUDA Plugin From 849d1f6145ea1a7c1dd1b4127040bd1c7b8276cf Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 13:52:19 -0800 Subject: [PATCH 24/33] Rename appendWaitAndResetIfLastEventDiscarded->resetDiscardedEvent --- sycl/plugins/level_zero/pi_level_zero.cpp | 5 ++--- sycl/plugins/level_zero/pi_level_zero.hpp | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index a4c04dc67aaae..29dfa4eeac8ac 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -659,8 +659,7 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, if (!(condition)) \ return error; -pi_result _pi_queue::appendWaitAndResetIfLastEventDiscarded( - pi_command_list_ptr_t CommandList) { +pi_result _pi_queue::resetDiscardedEvent(pi_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { ZE_CALL(zeCommandListAppendBarrier, (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); @@ -1675,7 +1674,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, this->LastCommandEvent != CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents()) { - PI_CALL(appendWaitAndResetIfLastEventDiscarded(CommandList)); + PI_CALL(resetDiscardedEvent(CommandList)); } } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 7f84c1e03c7ad..aaeb4fbfd1ba6 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -1146,7 +1146,7 @@ struct _pi_queue : _pi_object { // it is discarded and create new pi_event wrapper using the same native event // and put it to the cache. 
We call this method after each command submission // to make native event available to use by next commands. - pi_result appendWaitAndResetIfLastEventDiscarded(pi_command_list_ptr_t); + pi_result resetDiscardedEvent(pi_command_list_ptr_t); // Append command to the command list to signal new event if the last event in // the command list is discarded. While we submit commands in scope of the From 6e42f6d916a6cd45d435d1cfe594710923a7c73e Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 14:01:43 -0800 Subject: [PATCH 25/33] Rename addEventToCache/getEventFromCache for readability --- sycl/plugins/level_zero/pi_level_zero.cpp | 16 ++++++++-------- sycl/plugins/level_zero/pi_level_zero.hpp | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 29dfa4eeac8ac..ebb5b3a364158 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -682,7 +682,7 @@ pi_result _pi_queue::resetDiscardedEvent(pi_command_list_ptr_t CommandList) { if (LastCommandEvent->isHostVisible()) PiEvent->HostVisibleEvent = PiEvent; - PI_CALL(addEventToCache(PiEvent)); + PI_CALL(addEventToQueueCache(PiEvent)); } return PI_SUCCESS; @@ -708,7 +708,7 @@ inline static pi_result createEventAndAssociateQueue( ForceHostVisible = DeviceEventsSetting == AllHostVisible; // If event is discarded then try to get event from the queue cache. - *Event = IsInternal ? Queue->getEventFromCache(ForceHostVisible) : nullptr; + *Event = IsInternal ? 
Queue->getEventFromQueueCache(ForceHostVisible) : nullptr; if (*Event == nullptr) PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible, Event)); @@ -767,7 +767,7 @@ pi_result _pi_queue::signalEventFromCmdListIfLastEventDiscarded( return PI_SUCCESS; } -pi_event _pi_queue::getEventFromCache(bool HostVisible) { +pi_event _pi_queue::getEventFromQueueCache(bool HostVisible) { auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; // If we don't have any events, return nullptr. @@ -784,7 +784,7 @@ pi_event _pi_queue::getEventFromCache(bool HostVisible) { return RetEvent; } -pi_result _pi_queue::addEventToCache(pi_event Event) { +pi_result _pi_queue::addEventToQueueCache(pi_event Event) { auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; Cache->emplace_back(Event); return PI_SUCCESS; @@ -5769,7 +5769,7 @@ pi_result _pi_event::reset() { return PI_SUCCESS; } -pi_event _pi_context::getEventFromCache(bool HostVisible, bool WithProfiling) { +pi_event _pi_context::getEventFromContextCache(bool HostVisible, bool WithProfiling) { std::scoped_lock Lock(EventCacheMutex); auto Cache = getEventCache(HostVisible, WithProfiling); if (Cache->empty()) @@ -5781,7 +5781,7 @@ pi_event _pi_context::getEventFromCache(bool HostVisible, bool WithProfiling) { return Event; } -void _pi_context::addEventToCache(pi_event Event) { +void _pi_context::addEventToContextCache(pi_event Event) { std::scoped_lock Lock(EventCacheMutex); auto Cache = getEventCache(Event->isHostVisible(), Event->isProfilingEnabled()); @@ -5800,7 +5800,7 @@ static pi_result EventCreate(pi_context Context, pi_queue Queue, !Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0; if (auto CachedEvent = - Context->getEventFromCache(HostVisible, ProfilingEnabled)) { + Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { *RetEvent = CachedEvent; return PI_SUCCESS; } @@ -6257,7 +6257,7 @@ static pi_result piEventReleaseInternal(pi_event Event) { if (DisableEventsCaching || 
!Event->OwnZeEvent) { delete Event; } else { - Event->Context->addEventToCache(Event); + Event->Context->addEventToContextCache(Event); } // We intentionally incremented the reference counter when an event is diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index aaeb4fbfd1ba6..bcfcd0b1bd895 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -761,10 +761,10 @@ struct _pi_context : _pi_object { std::unordered_map MemAllocs; // Get pi_event from cache. - pi_event getEventFromCache(bool HostVisible, bool WithProfiling); + pi_event getEventFromContextCache(bool HostVisible, bool WithProfiling); // Add pi_event to cache. - void addEventToCache(pi_event); + void addEventToContextCache(pi_event); private: // If context contains one device then return this device. @@ -1134,13 +1134,13 @@ struct _pi_queue : _pi_object { // two times in a row and have to do round-robin between two events. Otherwise // it picks an event from the beginning of the cache and returns it. Event // from the last command is always appended to the end of the list. - pi_event getEventFromCache(bool HostVisible); + pi_event getEventFromQueueCache(bool HostVisible); // Put pi_event to the cache. Provided pi_event object is not used by // any command but its ZeEvent is used by many pi_event objects. // Commands to wait and reset ZeEvent must be submitted to the queue before // calling this method. 
- pi_result addEventToCache(pi_event Event); + pi_result addEventToQueueCache(pi_event Event); // Append command to provided command list to wait and reset the last event if // it is discarded and create new pi_event wrapper using the same native event From 6cecfb413c8d9a01623900cd30fdc42b4678a851 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 14:38:43 -0800 Subject: [PATCH 26/33] Create helper query and add several comments --- sycl/plugins/level_zero/pi_level_zero.cpp | 31 ++++++++++++++--------- sycl/plugins/level_zero/pi_level_zero.hpp | 4 +++ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index ebb5b3a364158..49ca0a584aebc 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -659,6 +659,8 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, if (!(condition)) \ return error; +bool _pi_queue::doReuseDiscardedEvents() { return doReuseDiscardedEvents(); } + pi_result _pi_queue::resetDiscardedEvent(pi_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { ZE_CALL(zeCommandListAppendBarrier, @@ -708,7 +710,8 @@ inline static pi_result createEventAndAssociateQueue( ForceHostVisible = DeviceEventsSetting == AllHostVisible; // If event is discarded then try to get event from the queue cache. - *Event = IsInternal ? Queue->getEventFromQueueCache(ForceHostVisible) : nullptr; + *Event = + IsInternal ? Queue->getEventFromQueueCache(ForceHostVisible) : nullptr; if (*Event == nullptr) PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible, Event)); @@ -717,7 +720,9 @@ inline static pi_result createEventAndAssociateQueue( (*Event)->CommandType = CommandType; (*Event)->IsDiscarded = IsInternal; // Discarded event doesn't own ze_event, it is used by multiple pi_event - // objects. + // objects. 
We destroy corresponding ze_event by releasing events from the + // events cache at queue destruction. Event in the cache owns the Level Zero + // event. if (IsInternal) (*Event)->OwnZeEvent = false; @@ -752,8 +757,8 @@ pi_result _pi_queue::signalEventFromCmdListIfLastEventDiscarded( pi_command_list_ptr_t CommandList) { // We signal new event at the end of command list only if we have queue with // discard_events property and the last command event is discarded. - if (!(ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && - LastCommandEvent && LastCommandEvent->IsDiscarded)) + if (!(doReuseDiscardedEvents() && LastCommandEvent && + LastCommandEvent->IsDiscarded)) return PI_SUCCESS; pi_event Event; @@ -1669,11 +1674,12 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr; // The list can be empty if command-list only contains signals of proxy - // events. + // events. We need to process the last command event only if new command was + // submitted to the command list. if (!CommandList->second.EventList.empty() && this->LastCommandEvent != CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); - if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents()) { + if (doReuseDiscardedEvents()) { PI_CALL(resetDiscardedEvent(CommandList)); } } @@ -1784,7 +1790,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // each event in the EventList. PI_CALL(piEventReleaseInternal(HostVisibleEvent)); - if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents()) { + if (doReuseDiscardedEvents()) { // If we have in-order queue with discarded events then we want to // treat this event as regular event. We insert a barrier in the next // command list to wait for this event. 
@@ -1799,8 +1805,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // Finally set to signal the host-visible event at the end of the // command-list after a barrier that waits for all commands // completion. - if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && - LastCommandEvent && LastCommandEvent->IsDiscarded) { + if (doReuseDiscardedEvents() && LastCommandEvent && + LastCommandEvent->IsDiscarded) { // If we the last event is discarded then we already have a barrier // inserted, so just signal the event. ZE_CALL(zeCommandListAppendSignalEvent, @@ -2013,8 +2019,8 @@ pi_result _pi_queue::insertStartBarrierIfDiscardEventsMode( pi_command_list_ptr_t &CmdList) { // If current command list is different from the last command list then insert // a barrier waiting for the last command event. - if (ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents() && - CmdList != LastUsedCommandList && LastCommandEvent) { + if (doReuseDiscardedEvents() && CmdList != LastUsedCommandList && + LastCommandEvent) { ZE_CALL(zeCommandListAppendBarrier, (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); LastCommandEvent = nullptr; @@ -5769,7 +5775,8 @@ pi_result _pi_event::reset() { return PI_SUCCESS; } -pi_event _pi_context::getEventFromContextCache(bool HostVisible, bool WithProfiling) { +pi_event _pi_context::getEventFromContextCache(bool HostVisible, + bool WithProfiling) { std::scoped_lock Lock(EventCacheMutex); auto Cache = getEventCache(HostVisible, WithProfiling); if (Cache->empty()) diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index bcfcd0b1bd895..f20a8b8ea4e43 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -1162,6 +1162,10 @@ struct _pi_queue : _pi_object { // lists in the queue. 
pi_result insertStartBarrierIfDiscardEventsMode(pi_command_list_ptr_t &CmdList); + + // Helper method telling whether we need to reuse discarded event in this + // queue. + bool doReuseDiscardedEvents(); }; struct _pi_mem : _pi_object { From 17b0775c2668334a3795b3e27df46f636361bbb1 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 14:56:11 -0800 Subject: [PATCH 27/33] Add query only mode for getQueueIndex --- sycl/plugins/level_zero/pi_level_zero.cpp | 18 ++++++++++++------ sycl/plugins/level_zero/pi_level_zero.hpp | 5 ++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 49ca0a584aebc..7d5550d74c204 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1863,12 +1863,15 @@ bool _pi_queue::isBatchingAllowed(bool IsCopy) const { // Return the index of the next queue to use based on a // round robin strategy and the queue group ordinal. uint32_t _pi_queue::pi_queue_group_t::getQueueIndex(uint32_t *QueueGroupOrdinal, - uint32_t *QueueIndex) { - + uint32_t *QueueIndex, + bool QueryOnly) { auto CurrentIndex = NextIndex; - ++NextIndex; - if (NextIndex > UpperIndex) - NextIndex = LowerIndex; + + if (!QueryOnly) { + ++NextIndex; + if (NextIndex > UpperIndex) + NextIndex = LowerIndex; + } // Find out the right queue group ordinal (first queue might be "main" or // "link") @@ -2093,7 +2096,10 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // signal new event from the last immediate command list. We are going // to insert a barrier in the new command list waiting for that event. 
auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); - auto NextImmCmdList = QueueGroup.ImmCmdLists[QueueGroup.NextIndex]; + uint32_t *QueueGroupOrdinal, *QueueIndex; + auto NextIndex = QueueGroup.getQueueIndex(QueueGroupOrdinal, QueueIndex, + /*QueryOnly */ true); + auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex]; if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && CurQueue->LastUsedCommandList != NextImmCmdList) { CurQueue->signalEventFromCmdListIfLastEventDiscarded( diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index f20a8b8ea4e43..733fc43cea223 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -863,7 +863,10 @@ struct _pi_queue : _pi_object { // Return the index of the next queue to use based on a // round robin strategy and the queue group ordinal. - uint32_t getQueueIndex(uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex); + // If QueryOnly is true then return index values but don't update internal + // index data members of the queue. + uint32_t getQueueIndex(uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex, + bool QueryOnly = false); // Get the ordinal for a command queue handle. 
int32_t getCmdQueueOrdinal(ze_command_queue_handle_t CmdQueue); From cacce1b54d5eb25e02dfffb04cba7b43bef4ce63 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 15:01:28 -0800 Subject: [PATCH 28/33] Add a comment about batch closure --- sycl/plugins/level_zero/pi_level_zero.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 7d5550d74c204..832fc5cdb4e1c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2108,7 +2108,9 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( } } else { // Ensure LastCommandEvent's batch is submitted if it is differrent - // from the one this command is going to. + // from the one this command is going to. If we reuse discarded events + // then signalEventFromCmdListIfLastEventDiscarded will be called at batch + // close if needed. const auto &OpenCommandList = CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent); if (OpenCommandList != CurQueue->CommandListMap.end() && From b5a1c29af7a6e0a2902e6146d7f72b29035bc740 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 15:17:17 -0800 Subject: [PATCH 29/33] Remove unnecessary and confusing conditions --- sycl/plugins/level_zero/pi_level_zero.cpp | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 832fc5cdb4e1c..6370ca1d3d6bd 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -685,6 +685,9 @@ pi_result _pi_queue::resetDiscardedEvent(pi_command_list_ptr_t CommandList) { PiEvent->HostVisibleEvent = PiEvent; PI_CALL(addEventToQueueCache(PiEvent)); + + // We handled dependency from last command event, so set it to nullptr. 
+ LastCommandEvent = nullptr; } return PI_SUCCESS; @@ -1674,10 +1677,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr; // The list can be empty if command-list only contains signals of proxy - // events. We need to process the last command event only if new command was - // submitted to the command list. - if (!CommandList->second.EventList.empty() && - this->LastCommandEvent != CommandList->second.EventList.back()) { + // events. + if (!CommandList->second.EventList.empty()) { this->LastCommandEvent = CommandList->second.EventList.back(); if (doReuseDiscardedEvents()) { PI_CALL(resetDiscardedEvent(CommandList)); @@ -2123,18 +2124,8 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( } } - bool IncludeLastCommandEvent = - CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; - - // If the last event is discarded then we already have a barrier waiting for - // that event, so don't need to include the last command event into the wait - // list. - if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && - CurQueue->LastCommandEvent->IsDiscarded) - IncludeLastCommandEvent = false; - try { - if (IncludeLastCommandEvent) { + if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; this->PiEventList = new pi_event[EventListLength + 1]; } else if (EventListLength > 0) { @@ -2226,7 +2217,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // For in-order queues, every command should be executed only after the // previous command has finished. The event associated with the last // enqueued command is added into the waitlist to ensure in-order semantics. 
- if (IncludeLastCommandEvent) { + if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); this->ZeEventList[TmpListLength] = CurQueue->LastCommandEvent->ZeEvent; this->PiEventList[TmpListLength] = CurQueue->LastCommandEvent; From a8358f5b64b6852d96e13225007a4ca025d58e39 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 15:20:08 -0800 Subject: [PATCH 30/33] Revert "Remove unnecessary and confusing conditions" This reverts commit b5a1c29af7a6e0a2902e6146d7f72b29035bc740. --- sycl/plugins/level_zero/pi_level_zero.cpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 6370ca1d3d6bd..832fc5cdb4e1c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -685,9 +685,6 @@ pi_result _pi_queue::resetDiscardedEvent(pi_command_list_ptr_t CommandList) { PiEvent->HostVisibleEvent = PiEvent; PI_CALL(addEventToQueueCache(PiEvent)); - - // We handled dependency from last command event, so set it to nullptr. - LastCommandEvent = nullptr; } return PI_SUCCESS; @@ -1677,8 +1674,10 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr; // The list can be empty if command-list only contains signals of proxy - // events. - if (!CommandList->second.EventList.empty()) { + // events. We need to process the last command event only if new command was + // submitted to the command list. 
+ if (!CommandList->second.EventList.empty() && + this->LastCommandEvent != CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); if (doReuseDiscardedEvents()) { PI_CALL(resetDiscardedEvent(CommandList)); @@ -2124,8 +2123,18 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( } } + bool IncludeLastCommandEvent = + CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; + + // If the last event is discarded then we already have a barrier waiting for + // that event, so don't need to include the last command event into the wait + // list. + if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && + CurQueue->LastCommandEvent->IsDiscarded) + IncludeLastCommandEvent = false; + try { - if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { + if (IncludeLastCommandEvent) { this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; this->PiEventList = new pi_event[EventListLength + 1]; } else if (EventListLength > 0) { @@ -2217,7 +2226,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // For in-order queues, every command should be executed only after the // previous command has finished. The event associated with the last // enqueued command is added into the waitlist to ensure in-order semantics. 
- if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { + if (IncludeLastCommandEvent) { std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); this->ZeEventList[TmpListLength] = CurQueue->LastCommandEvent->ZeEvent; this->PiEventList[TmpListLength] = CurQueue->LastCommandEvent; From e9aeab3676afba88949da88d0cf24a800e82e315 Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 15:28:42 -0800 Subject: [PATCH 31/33] Fix _pi_queue::doReuseDiscardedEvents method --- sycl/plugins/level_zero/pi_level_zero.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 832fc5cdb4e1c..e61d95ddc363b 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -659,7 +659,9 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, if (!(condition)) \ return error; -bool _pi_queue::doReuseDiscardedEvents() { return doReuseDiscardedEvents(); } +bool _pi_queue::doReuseDiscardedEvents() { + return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); +} pi_result _pi_queue::resetDiscardedEvent(pi_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { @@ -2096,9 +2098,10 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // signal new event from the last immediate command list. We are going // to insert a barrier in the new command list waiting for that event. 
auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); - uint32_t *QueueGroupOrdinal, *QueueIndex; - auto NextIndex = QueueGroup.getQueueIndex(QueueGroupOrdinal, QueueIndex, - /*QueryOnly */ true); + uint32_t QueueGroupOrdinal, QueueIndex; + auto NextIndex = + QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex, + /*QueryOnly */ true); auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex]; if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && CurQueue->LastUsedCommandList != NextImmCmdList) { @@ -2130,7 +2133,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // that event, so don't need to include the last command event into the wait // list. if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && - CurQueue->LastCommandEvent->IsDiscarded) + CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded) IncludeLastCommandEvent = false; try { From 8bd411dbe73c72393b3e0fe1483228867c05132a Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 16:08:47 -0800 Subject: [PATCH 32/33] Add TODO to treat host proxy event as regular event --- sycl/plugins/level_zero/pi_level_zero.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index e61d95ddc363b..c619327d4bdbe 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1800,6 +1800,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, } else { // For all other queues treat this as a special event and indicate no // cleanup is needed. + // TODO: always treat this host event as a regular event. 
PI_CALL(piEventReleaseInternal(HostVisibleEvent)); HostVisibleEvent->CleanedUp = true; } From 6f8d5ee3e9298c8a97151bb44321017e2e3be78c Mon Sep 17 00:00:00 2001 From: "Gainullin, Artur" Date: Thu, 17 Nov 2022 16:27:24 -0800 Subject: [PATCH 33/33] Add clarifying comments on conditions --- sycl/plugins/level_zero/pi_level_zero.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c619327d4bdbe..62b3d6cde9d4d 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1676,8 +1676,10 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr; // The list can be empty if command-list only contains signals of proxy - // events. We need to process the last command event only if new command was - // submitted to the command list. + // events. It is possible that executeCommandList is called twice for the same + // command list without new appended command. We don't want to process the + // same last command event twice that's why additionally check that new + // command was appended to the command list. if (!CommandList->second.EventList.empty() && this->LastCommandEvent != CommandList->second.EventList.back()) { this->LastCommandEvent = CommandList->second.EventList.back(); @@ -2131,8 +2133,8 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; // If the last event is discarded then we already have a barrier waiting for - // that event, so don't need to include the last command event into the wait - // list. + // that event, so must not include the last command event into the wait + // list because it will cause waiting for event which was reset. 
if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded) IncludeLastCommandEvent = false;