From c709bd5dac9931afe6b4d62fc6969c21aa6c048a Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Thu, 17 Jul 2025 10:38:23 +0100 Subject: [PATCH 1/9] Implement urProgramCreateWithIL (#19470) Offload consumes both IL and binaries through the same entrypoint, so there's no point in drawing a distinction. --- source/adapters/offload/device.cpp | 2 ++ source/adapters/offload/program.cpp | 16 ++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/source/adapters/offload/device.cpp b/source/adapters/offload/device.cpp index ebe0405b89..b5d4fdc571 100644 --- a/source/adapters/offload/device.cpp +++ b/source/adapters/offload/device.cpp @@ -76,6 +76,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(uint32_t{1}); case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: return ReturnValue(uint32_t{3}); + case UR_DEVICE_INFO_COMPILER_AVAILABLE: + return ReturnValue(true); // Unimplemented features case UR_DEVICE_INFO_PROGRAM_SET_SPECIALIZATION_CONSTANTS: case UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT: diff --git a/source/adapters/offload/program.cpp b/source/adapters/offload/program.cpp index cf497c571f..e889f59ef8 100644 --- a/source/adapters/offload/program.cpp +++ b/source/adapters/offload/program.cpp @@ -125,6 +125,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, + size_t length, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + // Liboffload consumes both IR and binaries through the same entrypoint + return urProgramCreateWithBinary(hContext, 1, &hContext->Device, &length, + reinterpret_cast(&pIL), + pProperties, phProgram); +} + UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t, ur_program_handle_t, const char *) { @@ -147,12 +157,6 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urProgramCompile(ur_context_handle_t, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urProgramCreateWithIL(ur_context_handle_t, const void *, size_t, - const ur_program_properties_t *, ur_program_handle_t *) { - return UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE; -} - UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { From b3aa5662d32b5c09ff3f4bfff4d2cc18e97ea26c Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 17 Jul 2025 13:03:21 +0100 Subject: [PATCH 2/9] Fix command buffer update with shared kernel handles (#19479) - Fix issue where updating multiple nodes with the same UR kernel handle would give incorrect results due to arg caching. - Add SYCL E2E test based on the example in the issue that reported this. Addresses issue reported in #19450 --- source/adapters/cuda/command_buffer.cpp | 6 ++---- source/adapters/hip/command_buffer.cpp | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 3078605d42..19082b8947 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -1347,14 +1347,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_CHECK_ERROR(validateCommandDesc(hCommandBuffer, pUpdateKernelLaunch[i])); } - // Store changes in config struct in command handle object + // Store changes in config struct in command handle object and propagate + // changes to CUDA graph for (uint32_t i = 0; i < numKernelUpdates; i++) { UR_CHECK_ERROR(updateCommand(pUpdateKernelLaunch[i])); UR_CHECK_ERROR(updateKernelArguments(pUpdateKernelLaunch[i])); - } - // Propagate changes to CUDA driver API - for (uint32_t i = 0; i < numKernelUpdates; i++) { const auto &UpdateCommandDesc = pUpdateKernelLaunch[i]; // If no work-size is provided make sure we pass nullptr to 
setKernelParams diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index c8aac5b772..abac82900f 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -984,14 +984,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_CHECK_ERROR(validateCommandDesc(hCommandBuffer, pUpdateKernelLaunch[i])); } - // Store changes in config struct in command handle object + // Store changes in config struct in command handle object and propagate + // changes to HIP Graph. for (uint32_t i = 0; i < numKernelUpdates; i++) { UR_CHECK_ERROR(updateCommand(pUpdateKernelLaunch[i])); UR_CHECK_ERROR(updateKernelArguments(pUpdateKernelLaunch[i])); - } - // Propagate changes to HIP driver API - for (uint32_t i = 0; i < numKernelUpdates; i++) { const auto &UpdateCommandDesc = pUpdateKernelLaunch[i]; // If no worksize is provided make sure we pass nullptr to setKernelParams From fa11ae35f29c0ee29fc85525595895f2a40c5dd7 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 17 Jul 2025 13:42:32 +0100 Subject: [PATCH 3/9] Add documentation in stream queue (#19471) This patch mostly just adds some more documentation in the stream queue implementation, but it also does a few other things: * Re-order members in more coherent groups. * Use `ur__t` instead of `ur__t_ *`, they are identical but the first one is nicer. * Remove unused `get()` member function. 
--- source/common/cuda-hip/stream_queue.hpp | 104 +++++++++++++++++------- 1 file changed, 73 insertions(+), 31 deletions(-) diff --git a/source/common/cuda-hip/stream_queue.hpp b/source/common/cuda-hip/stream_queue.hpp index 2547070fa9..be52421437 100644 --- a/source/common/cuda-hip/stream_queue.hpp +++ b/source/common/cuda-hip/stream_queue.hpp @@ -30,47 +30,90 @@ struct stream_queue_t { static constexpr int DefaultNumComputeStreams = CS; static constexpr int DefaultNumTransferStreams = TS; + // Mutex to guard modifications to the ComputeStreams vector, and + // NumComputeStreams. + std::mutex ComputeStreamMutex; std::vector ComputeStreams; + // Number of compute streams that have been created + unsigned int NumComputeStreams{0}; + + // Mutex to guard modifications to the TransferStreams vector, and + // NumTransferStreams. + std::mutex TransferStreamMutex; std::vector TransferStreams; + // Number of transfer streams that have been created + unsigned int NumTransferStreams{0}; + + // The stream indices are incremented every time we return a stream. This + // means that they encode both the index of the next stream in the round + // robin, as well as which iteration of the round robin we're on. Dividing + // the stream index by the size of the associated stream vector will give the + // number of round robins we've done as quotient, and the index of the next + // stream to use as remainder. + std::atomic_uint32_t ComputeStreamIndex{0}; + std::atomic_uint32_t TransferStreamIndex{0}; + + // The LastSync indices keep track of the index based on ComputeStreamIndex + // or TransferStreamIndex of the last stream that was synchronized during a + // syncStreams operation. + unsigned int LastSyncComputeStreams{0}; + unsigned int LastSyncTransferStreams{0}; + // Stream used for recording EvQueue, which holds information about when the // command in question is enqueued on host, as opposed to started. 
It is // created only if profiling is enabled - either for queue or per event. native_type HostSubmitTimeStream{0}; + // Flag to keep track of the creation og HostSubmitTimeStream, it is created + // either in the queue constructor when profiling is enabled or whenever it + // is requested for the first time through timestamp entry points. std::once_flag HostSubmitTimeStreamFlag; - // delay_compute_ keeps track of which streams have been recently reused and + + // DelayCompute keeps track of which streams have been recently reused and // their next use should be delayed. If a stream has been recently reused it // will be skipped the next time it would be selected round-robin style. When // skipped, its delay flag is cleared. std::vector DelayCompute; - // keep track of which streams have applied barrier + + // ComputeStreamSyncMutex is used to guard compute streams when they are + // being re-used. + // + // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be + // locked at the same time, ComputeStreamSyncMutex should be locked first + // to avoid deadlocks. + std::mutex ComputeStreamSyncMutex; + + // Guards barrier insertion in urEnqueueEventsWaitWithBarrier. + std::mutex BarrierMutex; + BarrierEventT BarrierEvent = nullptr; + BarrierEventT BarrierTmpEvent = nullptr; + + // Keep track of which streams have applied barrier. std::vector ComputeAppliedBarrier; std::vector TransferAppliedBarrier; - ur_context_handle_t_ *Context; - ur_device_handle_t_ *Device; + + ur_context_handle_t Context; + ur_device_handle_t Device; + + // Reference count for the queue object. ur::RefCount RefCount; + + // Event count used to give events an ordering used in the event class + // forLatestEvents. 
std::atomic_uint32_t EventCount{0}; - std::atomic_uint32_t ComputeStreamIndex{0}; - std::atomic_uint32_t TransferStreamIndex{0}; - unsigned int NumComputeStreams{0}; - unsigned int NumTransferStreams{0}; - unsigned int LastSyncComputeStreams{0}; - unsigned int LastSyncTransferStreams{0}; + + // Queue flags in the native API format as well as UR format. unsigned int Flags; ur_queue_flags_t URFlags; + + // Priority of this queue, matches underlying API priority. int Priority; - // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be - // locked at the same time, ComputeStreamSyncMutex should be locked first - // to avoid deadlocks - std::mutex ComputeStreamSyncMutex; - std::mutex ComputeStreamMutex; - std::mutex TransferStreamMutex; - std::mutex BarrierMutex; + + // Tracks if the queue owns the underlying native streams, this may happen + // for queues created from interop. bool HasOwnership; - BarrierEventT BarrierEvent = nullptr; - BarrierEventT BarrierTmpEvent = nullptr; - stream_queue_t(bool IsOutOfOrder, ur_context_handle_t_ *Context, - ur_device_handle_t_ *Device, unsigned int Flags, + stream_queue_t(bool IsOutOfOrder, ur_context_handle_t Context, + ur_device_handle_t Device, unsigned int Flags, ur_queue_flags_t URFlags, int Priority) : ComputeStreams(IsOutOfOrder ? DefaultNumComputeStreams : 1), TransferStreams(IsOutOfOrder ? DefaultNumTransferStreams : 0), @@ -87,16 +130,16 @@ struct stream_queue_t { } } - // Create a queue from a native handle - stream_queue_t(native_type stream, ur_context_handle_t_ *Context, - ur_device_handle_t_ *Device, unsigned int Flags, + // Create a queue from a native handle. 
+ stream_queue_t(native_type stream, ur_context_handle_t Context, + ur_device_handle_t Device, unsigned int Flags, ur_queue_flags_t URFlags, bool BackendOwns) - : ComputeStreams(1, stream), TransferStreams(0), + : ComputeStreams(1, stream), NumComputeStreams{1}, TransferStreams(0), DelayCompute(this->ComputeStreams.size(), false), ComputeAppliedBarrier(this->ComputeStreams.size()), TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, - Device{Device}, NumComputeStreams{1}, Flags(Flags), URFlags(URFlags), - Priority(0), HasOwnership{BackendOwns} { + Device{Device}, Flags(Flags), URFlags(URFlags), Priority(0), + HasOwnership{BackendOwns} { urContextRetain(Context); // Create timing stream if profiling is enabled. @@ -107,6 +150,7 @@ struct stream_queue_t { ~stream_queue_t() { urContextRelease(Context); } + // Methods defined by the specific adapters. void computeStreamWaitForBarrierIfNeeded(native_type Strean, uint32_t StreamI); void transferStreamWaitForBarrierIfNeeded(native_type Stream, @@ -206,9 +250,6 @@ struct stream_queue_t { return Result; } - native_type get() { return getNextComputeStream(); }; - ur_device_handle_t getDevice() const noexcept { return Device; }; - native_type getHostSubmitTimeStream() { return HostSubmitTimeStream; } bool hasBeenSynchronized(uint32_t StreamToken) { @@ -345,7 +386,8 @@ struct stream_queue_t { } } - ur_context_handle_t_ *getContext() const { return Context; }; + ur_device_handle_t getDevice() const noexcept { return Device; }; + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t getNextEventId() noexcept { return ++EventCount; } From 9f61368306ded1bed410e501812d7f4c642d305d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?= Date: Thu, 17 Jul 2025 15:56:43 +0200 Subject: [PATCH 4/9] async_malloc use allocation size for zeVirtualMemQueryPageSize (#19402) In L0 we need to call 
[zeVirtualMemQueryPageSize](https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#ze__api_8h_1afe411cceb631a3a176463ee05fda3dd7) with the actual allocation size for the virtual/physical allocations to align correctly. Right now we check alignment without passing any size: https://github.com/intel/llvm/blob/sycl/sycl/source/detail/graph/memory_pool.cpp#L45 This ends up translating to 1 byte in the call to L0: https://github.com/oneapi-src/unified-runtime/blob/de05f984aa19458a4993d2a2709e3b79d82f1a37/source/adapters/level_zero/virtual_mem.cpp#L32-L37 and for large allocations a wrong alignment is used and L0 reports ZE_RESULT_ERROR_UNSUPPORTED_SIZE upon zePhysicalMemCreate call (UR fails with UR_RESULT_ERROR_INVALID_VALUE then). The UR API should change to accept a size. This PR exposes this issue in a unittest and fixes it. --- include/ur_api.h | 4 + include/ur_ddi.h | 4 +- include/ur_print.hpp | 5 + scripts/core/virtual_memory.yml | 3 + source/adapters/cuda/virtual_mem.cpp | 1 + source/adapters/hip/virtual_mem.cpp | 4 +- .../level_zero/ur_interface_loader.hpp | 4 +- source/adapters/level_zero/virtual_mem.cpp | 7 +- source/adapters/mock/ur_mockddi.cpp | 6 +- source/adapters/native_cpu/virtual_mem.cpp | 4 +- source/adapters/opencl/virtual_mem.cpp | 4 +- .../sanitizer_common/sanitizer_utils.cpp | 7 +- source/loader/layers/tracing/ur_trcddi.cpp | 11 ++- source/loader/layers/validation/ur_valddi.cpp | 8 +- source/loader/ur_ldrddi.cpp | 7 +- source/loader/ur_libapi.cpp | 7 +- source/ur_api.cpp | 3 + .../enqueue/urEnqueueKernelLaunch.cpp | 4 +- .../testing/include/uur/fixtures.h | 15 ++- .../urVirtualMemGranularityGetInfo.cpp | 93 ++++++++++--------- 20 files changed, 126 insertions(+), 75 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 577bb4d5b2..8baf407095 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -4993,6 +4993,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the 
granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -15324,6 +15327,7 @@ typedef struct ur_loader_init_params_t { typedef struct ur_virtual_mem_granularity_get_info_params_t { ur_context_handle_t *phContext; ur_device_handle_t *phDevice; + size_t *pallocationSize; ur_virtual_mem_granularity_info_t *ppropName; size_t *ppropSize; void **ppPropValue; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index cb944b6c39..5f58d4c560 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1834,8 +1834,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetUsmP2PExpProcAddrTable_t)( /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urVirtualMemGranularityGetInfo typedef ur_result_t(UR_APICALL *ur_pfnVirtualMemGranularityGetInfo_t)( - ur_context_handle_t, ur_device_handle_t, ur_virtual_mem_granularity_info_t, - size_t, void *, size_t *); + ur_context_handle_t, ur_device_handle_t, size_t, + ur_virtual_mem_granularity_info_t, size_t, void *, size_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urVirtualMemReserve diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 7fc43237a2..c7dc701db3 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -20319,6 +20319,11 @@ inline std::ostream &operator<<( ur::details::printPtr(os, *(params->phDevice)); + os << ", "; + os << ".allocationSize = "; + + os << *(params->pallocationSize); + os << ", "; os << ".propName = "; diff --git a/scripts/core/virtual_memory.yml b/scripts/core/virtual_memory.yml index 61fca47d1b..ec34ca4895 100644 --- a/scripts/core/virtual_memory.yml +++ b/scripts/core/virtual_memory.yml @@ -41,6 
+41,9 @@ params: [in][optional] is the device to get the granularity from, if the device is null then the granularity is suitable for all devices in context. + - type: size_t + name: allocationSize + desc: "[in] allocation size in bytes for which the alignment is being queried." - type: $x_virtual_mem_granularity_info_t name: propName desc: "[in] type of the info to query." diff --git a/source/adapters/cuda/virtual_mem.cpp b/source/adapters/cuda/virtual_mem.cpp index 29908ad1d4..38f70e031d 100644 --- a/source/adapters/cuda/virtual_mem.cpp +++ b/source/adapters/cuda/virtual_mem.cpp @@ -18,6 +18,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( ur_context_handle_t, ur_device_handle_t hDevice, + [[maybe_unused]] size_t allocationSize, ur_virtual_mem_granularity_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); diff --git a/source/adapters/hip/virtual_mem.cpp b/source/adapters/hip/virtual_mem.cpp index 12cf9f838e..1effbbfa06 100644 --- a/source/adapters/hip/virtual_mem.cpp +++ b/source/adapters/hip/virtual_mem.cpp @@ -14,8 +14,8 @@ #include "physical_mem.hpp" UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( - ur_context_handle_t, ur_device_handle_t, ur_virtual_mem_granularity_info_t, - size_t, void *, size_t *) { + ur_context_handle_t, ur_device_handle_t, size_t, + ur_virtual_mem_granularity_info_t, size_t, void *, size_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 5e9fad25cb..bbbe1fce96 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -165,8 +165,8 @@ ur_result_t urUSMPoolGetInfo(ur_usm_pool_handle_t hPool, void *pPropValue, size_t *pPropSizeRet); ur_result_t urVirtualMemGranularityGetInfo( ur_context_handle_t hContext, 
ur_device_handle_t hDevice, - ur_virtual_mem_granularity_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet); + size_t allocationSize, ur_virtual_mem_granularity_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet); ur_result_t urVirtualMemReserve(ur_context_handle_t hContext, const void *pStart, size_t size, void **ppStart); diff --git a/source/adapters/level_zero/virtual_mem.cpp b/source/adapters/level_zero/virtual_mem.cpp index f61c8fd43f..0488d21023 100644 --- a/source/adapters/level_zero/virtual_mem.cpp +++ b/source/adapters/level_zero/virtual_mem.cpp @@ -23,8 +23,8 @@ namespace ur::level_zero { ur_result_t urVirtualMemGranularityGetInfo( ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_virtual_mem_granularity_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) { + size_t allocationSize, ur_virtual_mem_granularity_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { case UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM: @@ -34,7 +34,8 @@ ur_result_t urVirtualMemGranularityGetInfo( // aligned size. size_t PageSize; ZE2UR_CALL(zeVirtualMemQueryPageSize, - (hContext->getZeHandle(), hDevice->ZeDevice, 1, &PageSize)); + (hContext->getZeHandle(), hDevice->ZeDevice, allocationSize, + &PageSize)); return ReturnValue(PageSize); } default: diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index fb5529f95b..3ab79444f3 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -2729,6 +2729,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. 
ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -2744,7 +2747,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( ur_result_t result = UR_RESULT_SUCCESS; ur_virtual_mem_granularity_get_info_params_t params = { - &hContext, &hDevice, &propName, &propSize, &pPropValue, &pPropSizeRet}; + &hContext, &hDevice, &allocationSize, &propName, + &propSize, &pPropValue, &pPropSizeRet}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( diff --git a/source/adapters/native_cpu/virtual_mem.cpp b/source/adapters/native_cpu/virtual_mem.cpp index 131b480ac1..6697902564 100644 --- a/source/adapters/native_cpu/virtual_mem.cpp +++ b/source/adapters/native_cpu/virtual_mem.cpp @@ -13,8 +13,8 @@ #include "physical_mem.hpp" UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( - ur_context_handle_t, ur_device_handle_t, ur_virtual_mem_granularity_info_t, - size_t, void *, size_t *) { + ur_context_handle_t, ur_device_handle_t, size_t, + ur_virtual_mem_granularity_info_t, size_t, void *, size_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/opencl/virtual_mem.cpp b/source/adapters/opencl/virtual_mem.cpp index 7c411d9b7b..c7db068eca 100644 --- a/source/adapters/opencl/virtual_mem.cpp +++ b/source/adapters/opencl/virtual_mem.cpp @@ -13,8 +13,8 @@ #include "physical_mem.hpp" UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( - ur_context_handle_t, ur_device_handle_t, ur_virtual_mem_granularity_info_t, - size_t, void *, size_t *) { + ur_context_handle_t, ur_device_handle_t, size_t, + ur_virtual_mem_granularity_info_t, size_t, void *, size_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp index 3539a2d2a5..f8f7c58bf5 100644 --- 
a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp @@ -272,10 +272,13 @@ size_t GetKernelPrivateMemorySize(ur_kernel_handle_t Kernel, size_t GetVirtualMemGranularity(ur_context_handle_t Context, ur_device_handle_t Device) { size_t Size; + const size_t allocationSize = + 1; // probably we want to use actual allocation size [[maybe_unused]] auto Result = getContext()->urDdiTable.VirtualMem.pfnGranularityGetInfo( - Context, Device, UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, - sizeof(Size), &Size, nullptr); + Context, Device, allocationSize, + UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, sizeof(Size), &Size, + nullptr); assert(Result == UR_RESULT_SUCCESS); return Size; } diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index e0d57228e4..0abbb7604c 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -2236,6 +2236,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. 
@@ -2255,7 +2258,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; ur_virtual_mem_granularity_get_info_params_t params = { - &hContext, &hDevice, &propName, &propSize, &pPropValue, &pPropSizeRet}; + &hContext, &hDevice, &allocationSize, &propName, + &propSize, &pPropValue, &pPropSizeRet}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, "urVirtualMemGranularityGetInfo", ¶ms); @@ -2263,8 +2267,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( auto &logger = getContext()->logger; UR_LOG_L(logger, INFO, " ---> urVirtualMemGranularityGetInfo\n"); - ur_result_t result = pfnGranularityGetInfo( - hContext, hDevice, propName, propSize, pPropValue, pPropSizeRet); + ur_result_t result = + pfnGranularityGetInfo(hContext, hDevice, allocationSize, propName, + propSize, pPropValue, pPropSizeRet); getContext()->notify_end(UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, "urVirtualMemGranularityGetInfo", ¶ms, &result, diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 979eb3ef22..b61356afd2 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -2182,6 +2182,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. 
@@ -2228,8 +2231,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( URLOG_CTX_INVALID_REFERENCE(hDevice); } - ur_result_t result = pfnGranularityGetInfo( - hContext, hDevice, propName, propSize, pPropValue, pPropSizeRet); + ur_result_t result = + pfnGranularityGetInfo(hContext, hDevice, allocationSize, propName, + propSize, pPropValue, pPropSizeRet); return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 1cf13837bd..74712c5c4d 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -1238,6 +1238,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -1258,8 +1261,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( return UR_RESULT_ERROR_UNINITIALIZED; // forward to device-platform - return pfnGranularityGetInfo(hContext, hDevice, propName, propSize, - pPropValue, pPropSizeRet); + return pfnGranularityGetInfo(hContext, hDevice, allocationSize, propName, + propSize, pPropValue, pPropSizeRet); } /////////////////////////////////////////////////////////////////////////////// diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 1261145424..cad6de4dd9 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -2725,6 +2725,9 @@ ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. 
ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -2742,8 +2745,8 @@ ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( if (nullptr == pfnGranularityGetInfo) return UR_RESULT_ERROR_UNINITIALIZED; - return pfnGranularityGetInfo(hContext, hDevice, propName, propSize, - pPropValue, pPropSizeRet); + return pfnGranularityGetInfo(hContext, hDevice, allocationSize, propName, + propSize, pPropValue, pPropSizeRet); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/ur_api.cpp b/source/ur_api.cpp index cc69811f57..426ca95027 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -2410,6 +2410,9 @@ ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. 
diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index 327728bb5a..fa3eb3f4b5 100644 --- a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -491,11 +491,11 @@ struct urEnqueueKernelLaunchWithVirtualMemory : uur::urKernelExecutionTest { GTEST_SKIP() << "Virtual memory is not supported."; } + alloc_size = 1024; ASSERT_SUCCESS(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, + context, device, alloc_size, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, sizeof(granularity), &granularity, nullptr)); - alloc_size = 1024; virtual_page_size = uur::RoundUpToNearestFactor(alloc_size, granularity); ASSERT_SUCCESS(urPhysicalMemCreate(context, device, virtual_page_size, diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index b67eddd8f8..fff0be4a01 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -976,9 +976,12 @@ struct urVirtualMemGranularityTest : urContextTest { GTEST_SKIP() << "Virtual memory is not supported."; } + const size_t allocationSize = + 1; // assuming allocations in test are small enough and minimal granularity is used ASSERT_SUCCESS(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - sizeof(granularity), &granularity, nullptr)); + context, device, allocationSize, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, sizeof(granularity), + &granularity, nullptr)); } size_t granularity; }; @@ -995,10 +998,12 @@ struct urVirtualMemGranularityTestWithParam : urContextTestWithParam { if (!virtual_memory_support) { GTEST_SKIP() << "Virtual memory is not supported."; } - + const size_t allocationSize = + 1; // assuming allocations in test are small and use smallest granularity ASSERT_SUCCESS(urVirtualMemGranularityGetInfo( - this->context, 
this->device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - sizeof(granularity), &granularity, nullptr)); + this->context, this->device, allocationSize, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, sizeof(granularity), + &granularity, nullptr)); ASSERT_NE(granularity, 0); } diff --git a/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp b/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp index 0507b8903a..cd4e3ed076 100644 --- a/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp +++ b/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp @@ -20,89 +20,96 @@ struct urVirtualMemGranularityGetInfoTest : uur::urContextTest { UUR_INSTANTIATE_DEVICE_TEST_SUITE(urVirtualMemGranularityGetInfoTest); -TEST_P(urVirtualMemGranularityGetInfoTest, SuccessMinimum) { +void urVirtualMemGranularityGetInfoTest_successCase( + ur_context_handle_t context, ur_device_handle_t device, + const ur_virtual_mem_granularity_info_t property_name, + const size_t allocation_size) { size_t property_size = 0; - const ur_virtual_mem_granularity_info_t property_name = - UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM; ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urVirtualMemGranularityGetInfo(context, device, property_name, 0, nullptr, - &property_size), + urVirtualMemGranularityGetInfo(context, device, allocation_size, + property_name, 0, nullptr, &property_size), property_name); ASSERT_EQ(sizeof(size_t), property_size); size_t property_value = 0; ASSERT_QUERY_RETURNS_VALUE( - urVirtualMemGranularityGetInfo(context, device, property_name, - property_size, &property_value, nullptr), + urVirtualMemGranularityGetInfo(context, device, allocation_size, + property_name, property_size, + &property_value, nullptr), property_value); ASSERT_GT(property_value, 0); } -TEST_P(urVirtualMemGranularityGetInfoTest, SuccessRecommended) { - size_t property_size = 0; - const ur_virtual_mem_granularity_info_t property_name = - UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED; 
+TEST_P(urVirtualMemGranularityGetInfoTest, SuccessMinimum_smallAllocation) { +   urVirtualMemGranularityGetInfoTest_successCase( +       context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 1); +} - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urVirtualMemGranularityGetInfo(context, device, property_name, 0, nullptr, - &property_size), - property_name); - ASSERT_EQ(sizeof(size_t), property_size); +TEST_P(urVirtualMemGranularityGetInfoTest, SuccessMinimum_largeAllocation) { + urVirtualMemGranularityGetInfoTest_successCase( + context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 191439360); +} - size_t property_value = 0; - ASSERT_QUERY_RETURNS_VALUE( - urVirtualMemGranularityGetInfo(context, device, property_name, - property_size, &property_value, nullptr), - property_value); +TEST_P(urVirtualMemGranularityGetInfoTest, SuccessRecommended_smallAllocation) { + urVirtualMemGranularityGetInfoTest_successCase( + context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, 19); +} - ASSERT_GT(property_value, 0); +TEST_P(urVirtualMemGranularityGetInfoTest, SuccessRecommended_largeAllocation) { + urVirtualMemGranularityGetInfoTest_successCase( + context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, 211739367); +} TEST_P(urVirtualMemGranularityGetInfoTest, InvalidNullHandleContext) { size_t property_size = 0; - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - nullptr, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - 0, nullptr, &property_size), - UR_RESULT_ERROR_INVALID_NULL_HANDLE); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(nullptr, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 0, + nullptr, &property_size), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidEnumeration) { size_t property_size = 0; ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, + context, device, 1, UR_VIRTUAL_MEM_GRANULARITY_INFO_FORCE_UINT32, 0, nullptr, &property_size), UR_RESULT_ERROR_INVALID_ENUMERATION); }
TEST_P(urVirtualMemGranularityGetInfoTest, InvalidNullPointerPropSizeRet) { - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - 0, nullptr, nullptr), - UR_RESULT_ERROR_INVALID_NULL_POINTER); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(context, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 0, + nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidNullPointerPropValue) { - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - sizeof(size_t), nullptr, nullptr), - UR_RESULT_ERROR_INVALID_NULL_POINTER); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(context, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, + sizeof(size_t), nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidPropSizeZero) { size_t minimum = 0; - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - 0, &minimum, nullptr), - UR_RESULT_ERROR_INVALID_SIZE); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(context, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 0, + &minimum, nullptr), + UR_RESULT_ERROR_INVALID_SIZE); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidSizePropSizeSmall) { size_t minimum = 0; - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - sizeof(size_t) - 1, &minimum, nullptr), - UR_RESULT_ERROR_INVALID_SIZE); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(context, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, + sizeof(size_t) - 1, &minimum, nullptr), + UR_RESULT_ERROR_INVALID_SIZE); } From 2ef81fe8367effd49a7035af9e45f53fbe0f7ad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Chor=C4=85=C5=BCewicz?= Date: Thu, 17 Jul 2025 07:06:27 -0700 Subject: [PATCH 5/9] Coverity fixes (#19489) - 
catch potential exception in ~ur_exp_command_buffer_handle_t_ - move ZeUSMImport definition to adapter.cpp: ZeUSMImport is used by global adapter constructor (on Windows). It needs to be initialized before the global adapter. According to Coverity: The constructor of global object "GlobalAdapter" itself makes use of global object "ZeUSMImport" defined in another compilation unit. The order of construction is unspecified, so "GlobalAdapter" might be created before "ZeUSMImport" is available. --- source/adapters/level_zero/adapter.cpp | 2 ++ source/adapters/level_zero/common.cpp | 2 -- source/adapters/level_zero/v2/command_buffer.cpp | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index e338a4c439..388af44695 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -21,6 +21,8 @@ #include #endif +ZeUSMImportExtension ZeUSMImport; + // Due to multiple DLLMain definitions with SYCL, Global Adapter is init at // variable creation. 
#if defined(_WIN32) diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 8ed6d7e579..0433a2d52d 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -84,8 +84,6 @@ bool setEnvVar(const char *name, const char *value) { return true; } -ZeUSMImportExtension ZeUSMImport; - void zeParseError(ze_result_t ZeError, const char *&ErrorString) { switch (ZeError) { #define ZE_ERRCASE(ERR) \ diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 92118587d4..b4c2674bd3 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -166,7 +166,7 @@ ur_result_t ur_exp_command_buffer_handle_t_::registerExecutionEventUnlocked( return UR_RESULT_SUCCESS; } -ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { +ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() try { UR_CALL_NOCHECK(commandListManager.lock()->releaseSubmittedKernels()); if (currentExecution) { @@ -175,6 +175,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { for (auto &event : syncPoints) { event->release(); } +} catch (...) 
{ + UR_LOG(DEBUG, "ur_exp_command_buffer_handle_t_ destructor failed with: {}", + exceptionToResult(std::current_exception())); } ur_result_t ur_exp_command_buffer_handle_t_::applyUpdateCommands( From 49e0f92e4c3d43b17c04d90626d57c734bb9d4ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Chor=C4=85=C5=BCewicz?= Date: Thu, 17 Jul 2025 07:25:37 -0700 Subject: [PATCH 6/9] import host ptr when SYCL_USM_HOSTPTR_IMPORT=1 and fix mapHostPtr (#19297) --- source/adapters/level_zero/v2/memory.cpp | 44 +++++++++++++++--------- source/adapters/level_zero/v2/memory.hpp | 19 +++++----- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/source/adapters/level_zero/v2/memory.cpp b/source/adapters/level_zero/v2/memory.cpp index 9c39a97d16..1b6855e630 100644 --- a/source/adapters/level_zero/v2/memory.cpp +++ b/source/adapters/level_zero/v2/memory.cpp @@ -55,14 +55,11 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/, ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( ur_context_handle_t hContext, void *hostPtr, size_t size, - host_ptr_action_t hostPtrAction, device_access_mode_t accessMode) + device_access_mode_t accessMode) : ur_mem_buffer_t(hContext, size, accessMode) { - bool hostPtrImported = false; - if (hostPtrAction == host_ptr_action_t::import) { - hostPtrImported = - maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, - hContext->getZeHandle(), hostPtr, size); - } + bool hostPtrImported = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); if (hostPtrImported) { this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { @@ -201,8 +198,23 @@ ur_discrete_buffer_handle_t::ur_discrete_buffer_handle_t( device_access_mode_t accessMode) : ur_mem_buffer_t(hContext, size, accessMode), deviceAllocations(hContext->getPlatform()->getNumDevices()), - activeAllocationDevice(nullptr), mapToPtr(hostPtr), hostAllocations() { + activeAllocationDevice(nullptr), mapToPtr(nullptr, 
nullptr), + hostAllocations() { if (hostPtr) { + // Try importing the pointer to speed up memory copies for map/unmap + bool hostPtrImported = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); + + if (hostPtrImported) { + mapToPtr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { + ZeUSMImport.doZeUSMRelease( + hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr); + }); + } else { + mapToPtr = usm_unique_ptr_t(hostPtr, [](void *) {}); + } + auto initialDevice = hContext->getDevices()[0]; UR_CALL_THROWS(migrateBufferTo(initialDevice, hostPtr, size)); } @@ -305,18 +317,18 @@ void *ur_discrete_buffer_handle_t::mapHostPtr(ur_map_flags_t flags, TRACK_SCOPE_LATENCY("ur_discrete_buffer_handle_t::mapHostPtr"); // TODO: use async alloc? - void *ptr = mapToPtr; + void *ptr = mapToPtr.get(); if (!ptr) { UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &ptr)); } usm_unique_ptr_t mappedPtr = - usm_unique_ptr_t(ptr, [ownsAlloc = bool(mapToPtr), this](void *p) { + usm_unique_ptr_t(ptr, [ownsAlloc = !bool(mapToPtr), this](void *p) { if (ownsAlloc) { auto ret = hContext->getDefaultUSMPool()->free(p); if (ret != UR_RESULT_SUCCESS) { - UR_LOG(ERR, "Failed to mapped memory: {}", ret); + UR_LOG(ERR, "Failed to free mapped memory: {}", ret); } } }); @@ -541,16 +553,16 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, // ignore the flag for now. } + if (flags & UR_MEM_FLAG_USE_HOST_POINTER) { + // To speed up copies, we always import the host ptr to USM memory + } + void *hostPtr = pProperties ? pProperties->pHost : nullptr; auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags); if (useHostBuffer(hContext)) { - auto hostPtrAction = - flags & UR_MEM_FLAG_USE_HOST_POINTER - ? 
ur_integrated_buffer_handle_t::host_ptr_action_t::import - : ur_integrated_buffer_handle_t::host_ptr_action_t::copy; *phBuffer = ur_mem_handle_t_::create( - hContext, hostPtr, size, hostPtrAction, accessMode); + hContext, hostPtr, size, accessMode); } else { *phBuffer = ur_mem_handle_t_::create( hContext, hostPtr, size, accessMode); diff --git a/source/adapters/level_zero/v2/memory.hpp b/source/adapters/level_zero/v2/memory.hpp index 7201df57c9..61b0a00f40 100644 --- a/source/adapters/level_zero/v2/memory.hpp +++ b/source/adapters/level_zero/v2/memory.hpp @@ -28,7 +28,7 @@ struct ur_mem_buffer_t : ur_object { enum class device_access_mode_t { read_write, read_only, write_only }; ur_mem_buffer_t(ur_context_handle_t hContext, size_t size, - device_access_mode_t accesMode); + device_access_mode_t accessMode); virtual ~ur_mem_buffer_t() = default; virtual ur_shared_mutex &getMutex(); @@ -90,14 +90,11 @@ struct ur_usm_handle_t : ur_mem_buffer_t { // For integrated devices the buffer has been allocated in host memory // and can be accessed by the device without copying. struct ur_integrated_buffer_handle_t : ur_mem_buffer_t { - enum class host_ptr_action_t { import, copy }; - ur_integrated_buffer_handle_t(ur_context_handle_t hContext, void *hostPtr, - size_t size, host_ptr_action_t useHostPtr, - device_access_mode_t accesMode); + size_t size, device_access_mode_t accessMode); ur_integrated_buffer_handle_t(ur_context_handle_t hContext, void *hostPtr, - size_t size, device_access_mode_t accesMode, + size_t size, device_access_mode_t accessMode, bool ownHostPtr); ~ur_integrated_buffer_handle_t(); @@ -134,13 +131,13 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t { // first device in the context. Otherwise, the buffer is allocated on // firt getDevicePtr call. 
ur_discrete_buffer_handle_t(ur_context_handle_t hContext, void *hostPtr, - size_t size, device_access_mode_t accesMode); + size_t size, device_access_mode_t accessMode); ~ur_discrete_buffer_handle_t(); // Create buffer on top of existing device memory. ur_discrete_buffer_handle_t(ur_context_handle_t hContext, ur_device_handle_t hDevice, void *devicePtr, - size_t size, device_access_mode_t accesMode, + size_t size, device_access_mode_t accessMode, void *writeBackMemory, bool ownDevicePtr); void *getDevicePtr(ur_device_handle_t, device_access_mode_t, size_t offset, @@ -166,7 +163,7 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t { void *writeBackPtr = nullptr; // If not null, mapHostPtr should map memory to this ptr - void *mapToPtr = nullptr; + usm_unique_ptr_t mapToPtr; std::vector hostAllocations; @@ -178,7 +175,7 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t { struct ur_shared_buffer_handle_t : ur_mem_buffer_t { ur_shared_buffer_handle_t(ur_context_handle_t hContext, void *devicePtr, - size_t size, device_access_mode_t accesMode, + size_t size, device_access_mode_t accessMode, bool ownDevicePtr); void *getDevicePtr(ur_device_handle_t, device_access_mode_t, size_t offset, @@ -196,7 +193,7 @@ struct ur_shared_buffer_handle_t : ur_mem_buffer_t { struct ur_mem_sub_buffer_t : ur_mem_buffer_t { ur_mem_sub_buffer_t(ur_mem_handle_t hParent, size_t offset, size_t size, - device_access_mode_t accesMode); + device_access_mode_t accessMode); ~ur_mem_sub_buffer_t(); void *getDevicePtr(ur_device_handle_t, device_access_mode_t, size_t offset, From a8015fbbea1d090b2801af0fb307ecc5fc5ebb16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20=C5=9Awi=C4=99cicki?= Date: Thu, 17 Jul 2025 16:26:17 +0200 Subject: [PATCH 7/9] Enable e2e tests (#19493) L0v2 adapter was marked as unsupported in some async alloc tests due to missing features. 
Those tests were failing because of the missing functions: `urUSMPoolGetInfoExp`, `urUSMPoolSetInfoExp`, `urUSMPoolCreateExp`, `urUSMPoolDestroyExp`, `urUSMPoolGetDefaultDevicePoolExp`. Closes #18488 --- source/adapters/level_zero/usm.cpp | 22 ++++- source/adapters/level_zero/v2/api.cpp | 22 ----- source/adapters/level_zero/v2/usm.cpp | 122 ++++++++++++++++++++++++-- source/adapters/level_zero/v2/usm.hpp | 3 + 4 files changed, 139 insertions(+), 30 deletions(-) diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index 41024be998..ca2b462067 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -607,8 +607,26 @@ ur_result_t UR_APICALL urUSMPoolDestroyExp(ur_context_handle_t /*Context*/, return UR_RESULT_SUCCESS; } -ur_result_t UR_APICALL urUSMPoolSetInfoExp(ur_usm_pool_handle_t, - ur_usm_pool_info_t, void *, size_t) { +ur_result_t UR_APICALL urUSMPoolSetInfoExp(ur_usm_pool_handle_t /*Pool*/, + ur_usm_pool_info_t PropName, + void * /*PropValue*/, + size_t PropSize) { + if (PropSize < sizeof(size_t)) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + + switch (PropName) { + // TODO: Support for pool release threshold and maximum size hints. + case UR_USM_POOL_INFO_RELEASE_THRESHOLD_EXP: + case UR_USM_POOL_INFO_MAXIMUM_SIZE_EXP: + // TODO: Allow user to overwrite pool peak statistics. 
+ case UR_USM_POOL_INFO_RESERVED_HIGH_EXP: + case UR_USM_POOL_INFO_USED_HIGH_EXP: + break; + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 4d43e249a6..7835b3e93d 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -41,28 +41,6 @@ ur_result_t urEventSetCallback(ur_event_handle_t hEvent, return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t UR_APICALL urUSMPoolCreateExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - ur_usm_pool_desc_t *PoolDesc, - ur_usm_pool_handle_t *pPool) { - UR_LOG(ERR, "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t UR_APICALL urUSMPoolDestroyExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - ur_usm_pool_handle_t hPool) { - UR_LOG(ERR, "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t UR_APICALL urUSMPoolSetInfoExp(ur_usm_pool_handle_t hPool, - ur_usm_pool_info_t propName, - void *pPropValue, size_t propSize) { - UR_LOG(ERR, "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t UR_APICALL urUSMPoolGetDevicePoolExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_usm_pool_handle_t *pPool) { diff --git a/source/adapters/level_zero/v2/usm.cpp b/source/adapters/level_zero/v2/usm.cpp index 216e0b0e25..0d49a8ad0a 100644 --- a/source/adapters/level_zero/v2/usm.cpp +++ b/source/adapters/level_zero/v2/usm.cpp @@ -188,6 +188,68 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t hContext, } } +ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_usm_pool_desc_t *pPoolDesc) + : hContext(hContext) { + // TODO: handle UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK from pPoolDesc + 
auto disjointPoolConfigs = initializeDisjointPoolConfig(); + + if (disjointPoolConfigs.has_value()) { + if (auto limits = find_stype_node(pPoolDesc)) { + for (auto &config : disjointPoolConfigs.value().Configs) { + config.MaxPoolableSize = limits->maxPoolableSize; + config.SlabMinSize = limits->minDriverAllocSize; + } + } + } else { + // If pooling is disabled, do nothing. + UR_LOG(INFO, "USM pooling is disabled. Skiping pool limits adjustment."); + } + + // Create pool descriptor for single device provided + std::vector descriptors; + { + auto &desc = descriptors.emplace_back(); + desc.poolHandle = this; + desc.hContext = hContext; + desc.hDevice = hDevice; + desc.type = UR_USM_TYPE_DEVICE; + } + { + auto &desc = descriptors.emplace_back(); + desc.poolHandle = this; + desc.hContext = hContext; + desc.hDevice = hDevice; + desc.type = UR_USM_TYPE_SHARED; + desc.deviceReadOnly = false; + } + { + auto &desc = descriptors.emplace_back(); + desc.poolHandle = this; + desc.hContext = hContext; + desc.hDevice = hDevice; + desc.type = UR_USM_TYPE_SHARED; + desc.deviceReadOnly = true; + } + + for (auto &desc : descriptors) { + std::unique_ptr usmPool; + if (disjointPoolConfigs.has_value()) { + auto &poolConfig = + disjointPoolConfigs.value().Configs[descToDisjoinPoolMemType(desc)]; + auto pool = usm::makeDisjointPool(makeProvider(desc), poolConfig); + usmPool = std::make_unique(this, std::move(pool)); + } else { + auto pool = usm::makeProxyPool(makeProvider(desc)); + usmPool = std::make_unique(this, std::move(pool)); + } + UMF_CALL_THROWS( + umfPoolSetTag(usmPool->umfPool.get(), usmPool.get(), nullptr)); + poolManager.addPool(desc, std::move(usmPool)); + } +} + ur_context_handle_t ur_usm_pool_handle_t_::getContextHandle() const { return hContext; } @@ -358,27 +420,27 @@ size_t ur_usm_pool_handle_t_::getTotalReservedSize() { } size_t ur_usm_pool_handle_t_::getPeakReservedSize() { - size_t totalAllocatedSize = 0; + size_t maxPeakSize = 0; umf_result_t umfRet = 
UMF_RESULT_SUCCESS; poolManager.forEachPool([&](UsmPool *p) { umf_memory_provider_handle_t hProvider = nullptr; - size_t allocatedSize = 0; + size_t peakSize = 0; umfRet = umfPoolGetMemoryProvider(p->umfPool.get(), &hProvider); if (umfRet != UMF_RESULT_SUCCESS) { return false; } - umfRet = umfCtlGet("umf.provider.by_handle.{}.stats.peak_memory", - &allocatedSize, sizeof(allocatedSize), hProvider); + umfRet = umfCtlGet("umf.provider.by_handle.{}.stats.peak_memory", &peakSize, + sizeof(peakSize), hProvider); if (umfRet != UMF_RESULT_SUCCESS) { return false; } - totalAllocatedSize += allocatedSize; + maxPeakSize = std::max(maxPeakSize, peakSize); return true; }); - return umfRet == UMF_RESULT_SUCCESS ? totalAllocatedSize : 0; + return umfRet == UMF_RESULT_SUCCESS ? maxPeakSize : 0; } size_t ur_usm_pool_handle_t_::getTotalUsedSize() { @@ -460,6 +522,32 @@ ur_result_t urUSMPoolGetInfo( return exceptionToResult(std::current_exception()); } +ur_result_t urUSMPoolCreateExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_usm_pool_desc_t *pPoolDesc, + ur_usm_pool_handle_t *pPool) try { + *pPool = new ur_usm_pool_handle_t_(hContext, hDevice, pPoolDesc); + hContext->addUsmPool(*pPool); + return UR_RESULT_SUCCESS; +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urUSMPoolDestroyExp(ur_context_handle_t, ur_device_handle_t, + ur_usm_pool_handle_t hPool) try { + if (hPool->RefCount.release()) { + hPool->getContextHandle()->removeUsmPool(hPool); + delete hPool; + } + return UR_RESULT_SUCCESS; +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + ur_result_t urUSMPoolGetInfoExp(ur_usm_pool_handle_t hPool, ur_usm_pool_info_t propName, void *pPropValue, size_t *pPropSizeRet) { @@ -497,6 +585,28 @@ ur_result_t urUSMPoolGetInfoExp(ur_usm_pool_handle_t hPool, return UR_RESULT_SUCCESS; } +ur_result_t urUSMPoolSetInfoExp(ur_usm_pool_handle_t /*hPool*/, + ur_usm_pool_info_t propName, + void * /*pPropValue*/, size_t propSize) { + if (propSize < sizeof(size_t)) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + + switch (propName) { + // TODO: Support for pool release threshold and maximum size hints. + case UR_USM_POOL_INFO_RELEASE_THRESHOLD_EXP: + case UR_USM_POOL_INFO_MAXIMUM_SIZE_EXP: + // TODO: Allow user to overwrite pool peak statistics. + case UR_USM_POOL_INFO_RESERVED_HIGH_EXP: + case UR_USM_POOL_INFO_USED_HIGH_EXP: + break; + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + + return UR_RESULT_SUCCESS; +} + ur_result_t urUSMPoolGetDefaultDevicePoolExp(ur_context_handle_t hContext, ur_device_handle_t, ur_usm_pool_handle_t *pPool) { diff --git a/source/adapters/level_zero/v2/usm.hpp b/source/adapters/level_zero/v2/usm.hpp index 5b498b361c..825ecb5fcd 100644 --- a/source/adapters/level_zero/v2/usm.hpp +++ b/source/adapters/level_zero/v2/usm.hpp @@ -55,6 +55,9 @@ struct AllocationStats { struct ur_usm_pool_handle_t_ : ur_object { ur_usm_pool_handle_t_(ur_context_handle_t hContext, ur_usm_pool_desc_t *pPoolDes); + ur_usm_pool_handle_t_(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_usm_pool_desc_t *pPoolDes); ur_context_handle_t getContextHandle() const; From b038c0f2328538da204c74b93ab0555ab0789829 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 17 Jul 2025 17:55:18 +0100 Subject: [PATCH 8/9] Enable Offload backend in E2E tests (#19417) Enable the Offload backend in the E2E tests. 
The Offload UR adapter is still experimental and a WIP, the purpose of E2E testing now is purely to help develop the adapter and liboffload itself. The Offload adapter is not built by default. --- source/adapters/offload/device.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/source/adapters/offload/device.cpp b/source/adapters/offload/device.cpp index b5d4fdc571..76ea3e6c4f 100644 --- a/source/adapters/offload/device.cpp +++ b/source/adapters/offload/device.cpp @@ -67,6 +67,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_PLATFORM: return ReturnValue(hDevice->Platform); break; + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: + case UR_DEVICE_INFO_USM_HOST_SUPPORT: case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: return ReturnValue(UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS); case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: @@ -78,6 +80,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(uint32_t{3}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: return ReturnValue(true); + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: + // TODO: Implement subgroups in Offload + return ReturnValue(1); // Unimplemented features case UR_DEVICE_INFO_PROGRAM_SET_SPECIALIZATION_CONSTANTS: case UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT: @@ -85,12 +90,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: case UR_DEVICE_INFO_IMAGE_SUPPORT: case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: + case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + // TODO: Atomic queries in Offload + case UR_DEVICE_INFO_ATOMIC_64: + case UR_DEVICE_INFO_IMAGE_SRGB: + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: + case UR_DEVICE_INFO_LINKER_AVAILABLE: return ReturnValue(false); case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: - case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: - case UR_DEVICE_INFO_USM_HOST_SUPPORT: case 
UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: return ReturnValue(uint32_t{0}); + case UR_DEVICE_INFO_QUEUE_PROPERTIES: + case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES: + return ReturnValue(0); + case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: { + if (pPropSizeRet) { + *pPropSizeRet = 0; + } + return UR_RESULT_SUCCESS; + } default: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } From 451d6b49470d0167c62803c8e4322bc60cd9ba53 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 18 Jul 2025 00:44:28 +0000 Subject: [PATCH 9/9] Update intel/llvm mirror base commit to 25323c85 --- .github/intel-llvm-mirror-base-commit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit index 7090223e16..948fb08ab2 100644 --- a/.github/intel-llvm-mirror-base-commit +++ b/.github/intel-llvm-mirror-base-commit @@ -1 +1 @@ -004f38eaec3db5b5c72fabd1e7f5b82a405eecff +25323c85d7091f92bea2c057202612ff941a36d2