diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index 46e2e33607..4be81e98f4 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -933,6 +933,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
       MustSignalWaitEvent = false;
     }
   }
+  // Given that WaitEvent was created without specifying Counting Events, this
+  // event can be signalled on the host.
   if (MustSignalWaitEvent) {
     ZE2UR_CALL(zeEventHostSignal, (CommandBuffer->WaitEvent->ZeEvent));
   }
diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index f9cf5009fb..4880c14c4b 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -471,7 +471,8 @@ static const uint32_t MaxNumEventsPerPool = [] {

 ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
     ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible,
-    bool ProfilingEnabled, ur_device_handle_t Device) {
+    bool ProfilingEnabled, ur_device_handle_t Device,
+    bool CounterBasedEventEnabled, bool UsingImmCmdList) {
   // Lock while updating event pool machinery.
   std::scoped_lock<ur_mutex> Lock(ZeEventPoolCacheMutex);

@@ -481,7 +482,8 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
     ZeDevice = Device->ZeDevice;
   }
   std::list<ze_event_pool_handle_t> *ZePoolCache =
-      getZeEventPoolCache(HostVisible, ProfilingEnabled, ZeDevice);
+      getZeEventPoolCache(HostVisible, ProfilingEnabled,
+                          CounterBasedEventEnabled, UsingImmCmdList, ZeDevice);

   if (!ZePoolCache->empty()) {
     if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
@@ -506,15 +508,27 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
   Index = 0;
   // Create one event ZePool per MaxNumEventsPerPool events
   if (*ZePool == nullptr) {
+    ze_event_pool_counter_based_exp_desc_t counterBasedExt = {
+        ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC};
     ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
     ZeEventPoolDesc.count = MaxNumEventsPerPool;
     ZeEventPoolDesc.flags = 0;
+    ZeEventPoolDesc.pNext = nullptr;
     if (HostVisible)
       ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
     if (ProfilingEnabled)
       ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
     logger::debug("ze_event_pool_desc_t flags set to: {}",
                   ZeEventPoolDesc.flags);
+    if (CounterBasedEventEnabled) {
+      if (UsingImmCmdList) {
+        counterBasedExt.flags = ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_IMMEDIATE;
+      } else {
+        counterBasedExt.flags =
+            ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE;
+      }
+      ZeEventPoolDesc.pNext = &counterBasedExt;
+    }

     std::vector<ze_device_handle_t> ZeDevices;
     if (ZeDevice) {
@@ -540,7 +554,8 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
 }

 ur_event_handle_t ur_context_handle_t_::getEventFromContextCache(
-    bool HostVisible, bool WithProfiling, ur_device_handle_t Device) {
+    bool HostVisible, bool WithProfiling, ur_device_handle_t Device,
+    bool CounterBasedEventEnabled) {
   std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
   auto Cache = getEventCache(HostVisible, WithProfiling, Device);
   if (Cache->empty())
@@ -548,6 +563,9 @@ ur_event_handle_t ur_context_handle_t_::getEventFromContextCache(

   auto It = Cache->begin();
   ur_event_handle_t Event = *It;
+  if (Event->CounterBasedEventsEnabled != CounterBasedEventEnabled) {
+    return nullptr;
+  }
   Cache->erase(It);
   // We have to reset event before using it.
   Event->reset();
@@ -579,13 +597,16 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) {
   }

   ze_device_handle_t ZeDevice = nullptr;
+  bool UsingImmediateCommandlists =
+      !Event->UrQueue || Event->UrQueue->UsingImmCmdLists;

   if (!Event->IsMultiDevice && Event->UrQueue) {
     ZeDevice = Event->UrQueue->Device->ZeDevice;
   }

   std::list<ze_event_pool_handle_t> *ZePoolCache = getZeEventPoolCache(
-      Event->isHostVisible(), Event->isProfilingEnabled(), ZeDevice);
+      Event->isHostVisible(), Event->isProfilingEnabled(),
+      Event->CounterBasedEventsEnabled, UsingImmediateCommandlists, ZeDevice);

   // Put the empty pool to the cache of the pools.
   if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0)
@@ -683,8 +704,8 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
       // Make sure to acquire the lock before checking the size, or there
       // will be a race condition.
       std::scoped_lock<ur_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
-      // Under mutex since operator[] does insertion on the first usage for every
-      // unique ZeDevice.
+      // Under mutex since operator[] does insertion on the first usage for
+      // every unique ZeDevice.
       auto &ZeCommandListCache =
           UseCopyEngine
               ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp
index 6e4244eea0..ff173aa984 100644
--- a/source/adapters/level_zero/context.hpp
+++ b/source/adapters/level_zero/context.hpp
@@ -146,9 +146,9 @@ struct ur_context_handle_t_ : _ur_object {
   // head.
   //
   // Cache of event pools to which host-visible events are added to.
-  std::vector<std::list<ze_event_pool_handle_t>> ZeEventPoolCache{4};
+  std::vector<std::list<ze_event_pool_handle_t>> ZeEventPoolCache{12};
   std::vector<std::unordered_map<ze_device_handle_t, size_t>>
-      ZeEventPoolCacheDeviceMap{4};
+      ZeEventPoolCacheDeviceMap{12};

   // This map will be used to determine if a pool is full or not
   // by storing number of empty slots available in the pool.
@@ -199,48 +199,73 @@ struct ur_context_handle_t_ : _ur_object {
   ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &,
                                              bool HostVisible,
                                              bool ProfilingEnabled,
-                                             ur_device_handle_t Device);
+                                             ur_device_handle_t Device,
+                                             bool CounterBasedEventEnabled,
+                                             bool UsingImmCmdList);

   // Get ur_event_handle_t from cache.
   ur_event_handle_t getEventFromContextCache(bool HostVisible,
                                              bool WithProfiling,
-                                             ur_device_handle_t Device);
+                                             ur_device_handle_t Device,
+                                             bool CounterBasedEventEnabled);

   // Add ur_event_handle_t to cache.
   void addEventToContextCache(ur_event_handle_t);

+  enum EventPoolCacheType {
+    HostVisibleCacheType,
+    HostInvisibleCacheType,
+    HostVisibleCounterBasedRegularCacheType,
+    HostInvisibleCounterBasedRegularCacheType,
+    HostVisibleCounterBasedImmediateCacheType,
+    HostInvisibleCounterBasedImmediateCacheType
+  };
+
   std::list<ze_event_pool_handle_t> *
   getZeEventPoolCache(bool HostVisible, bool WithProfiling,
+                      bool CounterBasedEventEnabled, bool UsingImmediateCmdList,
                       ze_device_handle_t ZeDevice) {
-    if (HostVisible) {
-      if (ZeDevice) {
-        auto ZeEventPoolCacheMap = WithProfiling
-                                       ? &ZeEventPoolCacheDeviceMap[0]
-                                       : &ZeEventPoolCacheDeviceMap[1];
-        if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
-          ZeEventPoolCache.emplace_back();
-          ZeEventPoolCacheMap->insert(
-              std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
-        }
-        return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
-      } else {
-        return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1];
+    EventPoolCacheType CacheType;
+
+    calculateCacheIndex(HostVisible, CounterBasedEventEnabled,
+                        UsingImmediateCmdList, CacheType);
+    if (ZeDevice) {
+      auto ZeEventPoolCacheMap =
+          WithProfiling ? &ZeEventPoolCacheDeviceMap[CacheType * 2]
+                        : &ZeEventPoolCacheDeviceMap[CacheType * 2 + 1];
+      if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
+        ZeEventPoolCache.emplace_back();
+        ZeEventPoolCacheMap->insert(
+            std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
       }
+      return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
     } else {
-      if (ZeDevice) {
-        auto ZeEventPoolCacheMap = WithProfiling
-                                       ? &ZeEventPoolCacheDeviceMap[2]
-                                       : &ZeEventPoolCacheDeviceMap[3];
-        if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
-          ZeEventPoolCache.emplace_back();
-          ZeEventPoolCacheMap->insert(
-              std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
-        }
-        return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
-      } else {
-        return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3];
-      }
+      return WithProfiling ? &ZeEventPoolCache[CacheType * 2]
+                           : &ZeEventPoolCache[CacheType * 2 + 1];
+    }
+  }
+
+  ur_result_t calculateCacheIndex(bool HostVisible,
+                                  bool CounterBasedEventEnabled,
+                                  bool UsingImmediateCmdList,
+                                  EventPoolCacheType &CacheType) {
+    if (CounterBasedEventEnabled && HostVisible && !UsingImmediateCmdList) {
+      CacheType = HostVisibleCounterBasedRegularCacheType;
+    } else if (CounterBasedEventEnabled && !HostVisible &&
+               !UsingImmediateCmdList) {
+      CacheType = HostInvisibleCounterBasedRegularCacheType;
+    } else if (CounterBasedEventEnabled && HostVisible &&
+               UsingImmediateCmdList) {
+      CacheType = HostVisibleCounterBasedImmediateCacheType;
+    } else if (CounterBasedEventEnabled && !HostVisible &&
+               UsingImmediateCmdList) {
+      CacheType = HostInvisibleCounterBasedImmediateCacheType;
+    } else if (!CounterBasedEventEnabled && HostVisible) {
+      CacheType = HostVisibleCacheType;
+    } else {
+      CacheType = HostInvisibleCacheType;
     }
+    return UR_RESULT_SUCCESS;
   }

   // Decrement number of events living in the pool upon event destroy
diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 97ffe2f19e..6ef3b5d312 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -130,7 +130,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(

     if (OutEvent) {
       Queue->LastCommandEvent = reinterpret_cast<ur_event_handle_t>(*OutEvent);
-      ZE2UR_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent));
+      if (!(*OutEvent)->CounterBasedEventsEnabled)
+        ZE2UR_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent));
       (*OutEvent)->Completed = true;
     }
   }
@@ -766,7 +767,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate(
   UR_CALL(EventCreate(Context, nullptr, false, true, Event));

   (*Event)->RefCountExternal++;
-  ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
+  if (!(*Event)->CounterBasedEventsEnabled)
+    ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
   return UR_RESULT_SUCCESS;
 }

@@ -784,7 +786,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle(
   UR_CALL(EventCreate(Context, nullptr, false, true, Event));

   (*Event)->RefCountExternal++;
-  ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
+  if (!(*Event)->CounterBasedEventsEnabled)
+    ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
   return UR_RESULT_SUCCESS;
 }

@@ -1061,9 +1064,11 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked,
 //
 ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
                         bool IsMultiDevice, bool HostVisible,
-                        ur_event_handle_t *RetEvent) {
+                        ur_event_handle_t *RetEvent,
+                        bool CounterBasedEventEnabled) {
   bool ProfilingEnabled = !Queue || Queue->isProfilingEnabled();
+  bool UsingImmediateCommandlists = !Queue || Queue->UsingImmCmdLists;

   ur_device_handle_t Device = nullptr;
@@ -1072,7 +1077,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
   }

   if (auto CachedEvent = Context->getEventFromContextCache(
-          HostVisible, ProfilingEnabled, Device)) {
+          HostVisible, ProfilingEnabled, Device, CounterBasedEventEnabled)) {
     *RetEvent = CachedEvent;
     return UR_RESULT_SUCCESS;
   }
@@ -1083,14 +1088,15 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
   size_t Index = 0;

   if (auto Res = Context->getFreeSlotInExistingOrNewPool(
-          ZeEventPool, Index, HostVisible, ProfilingEnabled, Device))
+          ZeEventPool, Index, HostVisible, ProfilingEnabled, Device,
+          CounterBasedEventEnabled, UsingImmediateCommandlists))
     return Res;

   ZeStruct<ze_event_desc_t> ZeEventDesc;
   ZeEventDesc.index = Index;
   ZeEventDesc.wait = 0;
-  if (HostVisible) {
+  if (HostVisible || CounterBasedEventEnabled) {
     ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
   } else {
     //
@@ -1115,7 +1121,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
   } catch (...) {
     return UR_RESULT_ERROR_UNKNOWN;
   }
-
+  (*RetEvent)->CounterBasedEventsEnabled = CounterBasedEventEnabled;
   if (HostVisible)
     (*RetEvent)->HostVisibleEvent =
         reinterpret_cast<ur_event_handle_t>(*RetEvent);
@@ -1137,8 +1143,8 @@ ur_result_t ur_event_handle_t_::reset() {

   if (!isHostVisible())
     HostVisibleEvent = nullptr;
-
-  ZE2UR_CALL(zeEventHostReset, (ZeEvent));
+  if (!CounterBasedEventsEnabled)
+    ZE2UR_CALL(zeEventHostReset, (ZeEvent));
   return UR_RESULT_SUCCESS;
 }

@@ -1339,7 +1345,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
         zeCommandListAppendWaitOnEvents(ZeCommandList, 1u,
                                         &EventList[I]->ZeEvent);
-        zeEventHostSignal(MultiDeviceZeEvent);
+        if (!MultiDeviceEvent->CounterBasedEventsEnabled)
+          zeEventHostSignal(MultiDeviceZeEvent);

         UR_CALL(Queue->executeCommandList(CommandList, /* IsBlocking */ false,
                                           /* OkToBatchCommand */ true));
diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp
index a566c77825..a141300a22 100644
--- a/source/adapters/level_zero/event.hpp
+++ b/source/adapters/level_zero/event.hpp
@@ -31,7 +31,8 @@ extern "C" {
 ur_result_t urEventReleaseInternal(ur_event_handle_t Event);
 ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
                         bool IsMultiDevice, bool HostVisible,
-                        ur_event_handle_t *RetEvent);
+                        ur_event_handle_t *RetEvent,
+                        bool CounterBasedEventEnabled = false);
 } // extern "C"

 // This is an experimental option that allows to disable caching of events in
@@ -226,6 +227,8 @@ struct ur_event_handle_t_ : _ur_object {
   // completion batch for this event. Only used for out-of-order immediate
   // command lists.
   std::optional<ur_completion_batch_it> completionBatch;
+  // Keeps track of whether we are using Counter-based Events.
+  bool CounterBasedEventsEnabled = false;
 };

 // Helper function to implement zeHostSynchronize.
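
Note on the cache sizing in the context.hpp hunks: ZeEventPoolCache grows from 4 to 12 entries because each of the six EventPoolCacheType values owns a profiling slot (CacheType * 2) and a non-profiling slot (CacheType * 2 + 1); per-device pools are looked up through ZeEventPoolCacheDeviceMap instead. The standalone sketch below mirrors calculateCacheIndex() to show the resulting layout; the free functions and main() are illustrative only and are not part of the adapter.

#include <cassert>
#include <cstddef>

// Mirrors the EventPoolCacheType enum added to ur_context_handle_t_.
enum EventPoolCacheType {
  HostVisibleCacheType,
  HostInvisibleCacheType,
  HostVisibleCounterBasedRegularCacheType,
  HostInvisibleCounterBasedRegularCacheType,
  HostVisibleCounterBasedImmediateCacheType,
  HostInvisibleCounterBasedImmediateCacheType
};

// Same selection logic as calculateCacheIndex(), written as a free function.
static EventPoolCacheType cacheType(bool HostVisible, bool CounterBased,
                                    bool ImmCmdList) {
  if (CounterBased)
    return ImmCmdList
               ? (HostVisible ? HostVisibleCounterBasedImmediateCacheType
                              : HostInvisibleCounterBasedImmediateCacheType)
               : (HostVisible ? HostVisibleCounterBasedRegularCacheType
                              : HostInvisibleCounterBasedRegularCacheType);
  return HostVisible ? HostVisibleCacheType : HostInvisibleCacheType;
}

// Flat slot in the 12-entry ZeEventPoolCache: profiling pools sit at
// CacheType * 2, non-profiling pools at CacheType * 2 + 1.
static size_t poolCacheSlot(bool HostVisible, bool WithProfiling,
                            bool CounterBased, bool ImmCmdList) {
  return static_cast<size_t>(cacheType(HostVisible, CounterBased, ImmCmdList)) *
             2 +
         (WithProfiling ? 0 : 1);
}

int main() {
  // The four legacy combinations keep slots 0..3, so behaviour is unchanged
  // when counter-based events are disabled.
  assert(poolCacheSlot(true, true, false, false) == 0);   // host-visible, profiling
  assert(poolCacheSlot(true, false, false, false) == 1);  // host-visible
  assert(poolCacheSlot(false, true, false, false) == 2);  // device-only, profiling
  assert(poolCacheSlot(false, false, false, false) == 3); // device-only
  // Counter-based pools occupy the eight new slots, 4..11.
  assert(poolCacheSlot(false, false, true, true) == 11);
  return 0;
}
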
diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp
index 39a970063f..82ecd7043b 100644
--- a/source/adapters/level_zero/memory.cpp
+++ b/source/adapters/level_zero/memory.cpp
@@ -944,7 +944,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     }

     // Signal this event
-    ZE2UR_CALL(zeEventHostSignal, (ZeEvent));
+    if (!(*Event)->CounterBasedEventsEnabled)
+      ZE2UR_CALL(zeEventHostSignal, (ZeEvent));
     (*Event)->Completed = true;
     return UR_RESULT_SUCCESS;
   }
@@ -1078,8 +1079,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     if (Buffer->MapHostPtr)
       memcpy(ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size);

-    // Signal this event
-    ZE2UR_CALL(zeEventHostSignal, (ZeEvent));
+    // Signal this event if it is not using counter based events
+    if (!(*Event)->CounterBasedEventsEnabled)
+      ZE2UR_CALL(zeEventHostSignal, (ZeEvent));
     (*Event)->Completed = true;
     return UR_RESULT_SUCCESS;
   }
diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp
index 20ed58258c..d508d85c82 100644
--- a/source/adapters/level_zero/platform.cpp
+++ b/source/adapters/level_zero/platform.cpp
@@ -199,6 +199,14 @@ ur_result_t ur_platform_handle_t_::initialize() {
         ZeDriverModuleProgramExtensionFound = true;
       }
     }
+    // Check if extension is available for Counting Events.
+    if (strncmp(extension.name, ZE_EVENT_POOL_COUNTER_BASED_EXP_NAME,
+                strlen(ZE_EVENT_POOL_COUNTER_BASED_EXP_NAME) + 1) == 0) {
+      if (extension.version ==
+          ZE_EVENT_POOL_COUNTER_BASED_EXP_VERSION_CURRENT) {
+        ZeDriverEventPoolCountingEventsExtensionFound = true;
+      }
+    }
     zeDriverExtensionMap[extension.name] = extension.version;
   }
diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp
index d2ef19fd7e..fa9bc6b735 100644
--- a/source/adapters/level_zero/platform.hpp
+++ b/source/adapters/level_zero/platform.hpp
@@ -35,6 +35,7 @@ struct ur_platform_handle_t_ : public _ur_platform {
   // Flags to tell whether various Level Zero platform extensions are
   // available.
   bool ZeDriverGlobalOffsetExtensionFound{false};
   bool ZeDriverModuleProgramExtensionFound{false};
+  bool ZeDriverEventPoolCountingEventsExtensionFound{false};

   // Cache UR devices for reuse
   std::vector<std::unique_ptr<ur_device_handle_t_>> URDevicesCache;
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index c6aaf4b034..65ab3892eb 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1167,6 +1167,20 @@ ur_queue_handle_t_::ur_queue_handle_t_(
   ComputeCommandBatch.QueueBatchSize =
       ZeCommandListBatchComputeConfig.startSize();
   CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize();
+
+  static const bool useDriverCounterBasedEvents = [Device] {
+    const char *UrRet = std::getenv("UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS");
+    if (!UrRet) {
+      if (Device->isPVC())
+        return true;
+      return false;
+    }
+    return std::atoi(UrRet) != 0;
+  }();
+  this->CounterBasedEventsEnabled =
+      UsingImmCmdLists && isInOrderQueue() && Device->useDriverInOrderLists() &&
+      useDriverCounterBasedEvents &&
+      Device->Platform->ZeDriverEventPoolCountingEventsExtensionFound;
 }

@@ -1447,8 +1461,10 @@ ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) {
     if (LastCommandEvent && LastCommandEvent->IsDiscarded) {
       ZE2UR_CALL(zeCommandListAppendBarrier,
                  (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent)));
-      ZE2UR_CALL(zeCommandListAppendEventReset,
-                 (CommandList->first, LastCommandEvent->ZeEvent));
+      if (!CounterBasedEventsEnabled) {
+        ZE2UR_CALL(zeCommandListAppendEventReset,
+                   (CommandList->first, LastCommandEvent->ZeEvent));
+      }

       // Create new ur_event_handle_t but with the same ze_event_handle_t. We are
       // going to use this ur_event_handle_t for the next command with discarded
@@ -1750,7 +1766,8 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue,

   if (*Event == nullptr)
     UR_CALL(EventCreate(Queue->Context, Queue, IsMultiDevice,
-                        HostVisible.value(), Event));
+                        HostVisible.value(), Event,
+                        Queue->CounterBasedEventsEnabled));

   (*Event)->UrQueue = Queue;
   (*Event)->CommandType = CommandType;
diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp
index 03922bd2dc..5cb061be5f 100644
--- a/source/adapters/level_zero/queue.hpp
+++ b/source/adapters/level_zero/queue.hpp
@@ -371,6 +371,9 @@ struct ur_queue_handle_t_ : _ur_object {
   // Keeps the properties of this queue.
   ur_queue_flags_t Properties;

+  // Keeps track of whether we are using Counter-based Events.
+  bool CounterBasedEventsEnabled = false;
+
   // Map of all command lists used in this queue.
   ur_command_list_map_t CommandListMap;
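
Taken together, the changes follow one pattern: when the driver reports ZE_EVENT_POOL_COUNTER_BASED_EXP_NAME (platform.cpp) and the queue is an in-order, immediate-command-list queue with driver in-order lists enabled (queue.cpp), event pools are created with a ze_event_pool_counter_based_exp_desc_t chained through pNext, and the host-side zeEventHostSignal/zeEventHostReset calls are skipped for those events. A minimal standalone sketch of that pool and event creation follows; it assumes Level Zero headers new enough to define the experimental counter-based extension, and the hContext/hDevice parameters and the HOST_VISIBLE flag choice are illustrative assumptions rather than code taken from the adapter.

// Sketch only: shows the pNext chaining used by getFreeSlotInExistingOrNewPool,
// outside the adapter. hContext/hDevice are assumed to exist, and the driver
// must report the counter-based extension (see the platform.cpp hunk) first.
#include <level_zero/ze_api.h>

ze_result_t createCounterBasedPool(ze_context_handle_t hContext,
                                   ze_device_handle_t hDevice,
                                   bool UsingImmCmdList, uint32_t Count,
                                   ze_event_pool_handle_t *Pool) {
  ze_event_pool_counter_based_exp_desc_t CounterBasedExt{};
  CounterBasedExt.stype = ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC;
  CounterBasedExt.flags = UsingImmCmdList
                              ? ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_IMMEDIATE
                              : ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE;

  ze_event_pool_desc_t PoolDesc{};
  PoolDesc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
  PoolDesc.pNext = &CounterBasedExt; // extension chained exactly as in the diff
  PoolDesc.count = Count;
  // HOST_VISIBLE mirrors the HostVisible path of the adapter for simplicity.
  PoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;

  return zeEventPoolCreate(hContext, &PoolDesc, 1, &hDevice, Pool);
}

// Events from such a pool keep ZE_EVENT_SCOPE_FLAG_HOST on the signal scope
// (EventCreate sets it whenever CounterBasedEventEnabled is true); they are
// not host-signalled or host-reset, which is why the call sites above are
// guarded by CounterBasedEventsEnabled.
ze_result_t createCounterBasedEvent(ze_event_pool_handle_t Pool, uint32_t Index,
                                    ze_event_handle_t *Event) {
  ze_event_desc_t EventDesc{};
  EventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
  EventDesc.index = Index;
  EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
  EventDesc.wait = 0;
  return zeEventCreate(Pool, &EventDesc, Event);
}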