diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit index 7090223e16..948fb08ab2 100644 --- a/.github/intel-llvm-mirror-base-commit +++ b/.github/intel-llvm-mirror-base-commit @@ -1 +1 @@ -004f38eaec3db5b5c72fabd1e7f5b82a405eecff +25323c85d7091f92bea2c057202612ff941a36d2 diff --git a/include/ur_api.h b/include/ur_api.h index 577bb4d5b2..8baf407095 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -4993,6 +4993,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -15324,6 +15327,7 @@ typedef struct ur_loader_init_params_t { typedef struct ur_virtual_mem_granularity_get_info_params_t { ur_context_handle_t *phContext; ur_device_handle_t *phDevice; + size_t *pallocationSize; ur_virtual_mem_granularity_info_t *ppropName; size_t *ppropSize; void **ppPropValue; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index cb944b6c39..5f58d4c560 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1834,8 +1834,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetUsmP2PExpProcAddrTable_t)( /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urVirtualMemGranularityGetInfo typedef ur_result_t(UR_APICALL *ur_pfnVirtualMemGranularityGetInfo_t)( - ur_context_handle_t, ur_device_handle_t, ur_virtual_mem_granularity_info_t, - size_t, void *, size_t *); + ur_context_handle_t, ur_device_handle_t, size_t, + ur_virtual_mem_granularity_info_t, size_t, void *, size_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urVirtualMemReserve diff --git 
a/include/ur_print.hpp b/include/ur_print.hpp index 7fc43237a2..c7dc701db3 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -20319,6 +20319,11 @@ inline std::ostream &operator<<( ur::details::printPtr(os, *(params->phDevice)); + os << ", "; + os << ".allocationSize = "; + + os << *(params->pallocationSize); + os << ", "; os << ".propName = "; diff --git a/scripts/core/virtual_memory.yml b/scripts/core/virtual_memory.yml index 61fca47d1b..ec34ca4895 100644 --- a/scripts/core/virtual_memory.yml +++ b/scripts/core/virtual_memory.yml @@ -41,6 +41,9 @@ params: [in][optional] is the device to get the granularity from, if the device is null then the granularity is suitable for all devices in context. + - type: size_t + name: allocationSize + desc: "[in] allocation size in bytes for which the alignment is being queried." - type: $x_virtual_mem_granularity_info_t name: propName desc: "[in] type of the info to query." diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 3078605d42..19082b8947 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -1347,14 +1347,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_CHECK_ERROR(validateCommandDesc(hCommandBuffer, pUpdateKernelLaunch[i])); } - // Store changes in config struct in command handle object + // Store changes in config struct in command handle object and propagate + // changes to CUDA graph for (uint32_t i = 0; i < numKernelUpdates; i++) { UR_CHECK_ERROR(updateCommand(pUpdateKernelLaunch[i])); UR_CHECK_ERROR(updateKernelArguments(pUpdateKernelLaunch[i])); - } - // Propagate changes to CUDA driver API - for (uint32_t i = 0; i < numKernelUpdates; i++) { const auto &UpdateCommandDesc = pUpdateKernelLaunch[i]; // If no work-size is provided make sure we pass nullptr to setKernelParams diff --git a/source/adapters/cuda/virtual_mem.cpp b/source/adapters/cuda/virtual_mem.cpp index 
29908ad1d4..38f70e031d 100644 --- a/source/adapters/cuda/virtual_mem.cpp +++ b/source/adapters/cuda/virtual_mem.cpp @@ -18,6 +18,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( ur_context_handle_t, ur_device_handle_t hDevice, + [[maybe_unused]] size_t allocationSize, ur_virtual_mem_granularity_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index c8aac5b772..abac82900f 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -984,14 +984,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_CHECK_ERROR(validateCommandDesc(hCommandBuffer, pUpdateKernelLaunch[i])); } - // Store changes in config struct in command handle object + // Store changes in config struct in command handle object and propagate + // changes to HIP Graph. 
for (uint32_t i = 0; i < numKernelUpdates; i++) { UR_CHECK_ERROR(updateCommand(pUpdateKernelLaunch[i])); UR_CHECK_ERROR(updateKernelArguments(pUpdateKernelLaunch[i])); - } - // Propagate changes to HIP driver API - for (uint32_t i = 0; i < numKernelUpdates; i++) { const auto &UpdateCommandDesc = pUpdateKernelLaunch[i]; // If no worksize is provided make sure we pass nullptr to setKernelParams diff --git a/source/adapters/hip/virtual_mem.cpp b/source/adapters/hip/virtual_mem.cpp index 12cf9f838e..1effbbfa06 100644 --- a/source/adapters/hip/virtual_mem.cpp +++ b/source/adapters/hip/virtual_mem.cpp @@ -14,8 +14,8 @@ #include "physical_mem.hpp" UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( - ur_context_handle_t, ur_device_handle_t, ur_virtual_mem_granularity_info_t, - size_t, void *, size_t *) { + ur_context_handle_t, ur_device_handle_t, size_t, + ur_virtual_mem_granularity_info_t, size_t, void *, size_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index e338a4c439..388af44695 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -21,6 +21,8 @@ #include #endif +ZeUSMImportExtension ZeUSMImport; + // Due to multiple DLLMain definitions with SYCL, Global Adapter is init at // variable creation. 
#if defined(_WIN32) diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 8ed6d7e579..0433a2d52d 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -84,8 +84,6 @@ bool setEnvVar(const char *name, const char *value) { return true; } -ZeUSMImportExtension ZeUSMImport; - void zeParseError(ze_result_t ZeError, const char *&ErrorString) { switch (ZeError) { #define ZE_ERRCASE(ERR) \ diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 5e9fad25cb..bbbe1fce96 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -165,8 +165,8 @@ ur_result_t urUSMPoolGetInfo(ur_usm_pool_handle_t hPool, void *pPropValue, size_t *pPropSizeRet); ur_result_t urVirtualMemGranularityGetInfo( ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_virtual_mem_granularity_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet); + size_t allocationSize, ur_virtual_mem_granularity_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet); ur_result_t urVirtualMemReserve(ur_context_handle_t hContext, const void *pStart, size_t size, void **ppStart); diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index 41024be998..ca2b462067 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -607,8 +607,26 @@ ur_result_t UR_APICALL urUSMPoolDestroyExp(ur_context_handle_t /*Context*/, return UR_RESULT_SUCCESS; } -ur_result_t UR_APICALL urUSMPoolSetInfoExp(ur_usm_pool_handle_t, - ur_usm_pool_info_t, void *, size_t) { +ur_result_t UR_APICALL urUSMPoolSetInfoExp(ur_usm_pool_handle_t /*Pool*/, + ur_usm_pool_info_t PropName, + void * /*PropValue*/, + size_t PropSize) { + if (PropSize < sizeof(size_t)) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + + switch (PropName) { + // 
TODO: Support for pool release threshold and maximum size hints. + case UR_USM_POOL_INFO_RELEASE_THRESHOLD_EXP: + case UR_USM_POOL_INFO_MAXIMUM_SIZE_EXP: + // TODO: Allow user to overwrite pool peak statistics. + case UR_USM_POOL_INFO_RESERVED_HIGH_EXP: + case UR_USM_POOL_INFO_USED_HIGH_EXP: + break; + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 4d43e249a6..7835b3e93d 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -41,28 +41,6 @@ ur_result_t urEventSetCallback(ur_event_handle_t hEvent, return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t UR_APICALL urUSMPoolCreateExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - ur_usm_pool_desc_t *PoolDesc, - ur_usm_pool_handle_t *pPool) { - UR_LOG(ERR, "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t UR_APICALL urUSMPoolDestroyExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - ur_usm_pool_handle_t hPool) { - UR_LOG(ERR, "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t UR_APICALL urUSMPoolSetInfoExp(ur_usm_pool_handle_t hPool, - ur_usm_pool_info_t propName, - void *pPropValue, size_t propSize) { - UR_LOG(ERR, "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t UR_APICALL urUSMPoolGetDevicePoolExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_usm_pool_handle_t *pPool) { diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 92118587d4..b4c2674bd3 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -166,7 +166,7 @@ ur_result_t 
ur_exp_command_buffer_handle_t_::registerExecutionEventUnlocked( return UR_RESULT_SUCCESS; } -ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { +ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() try { UR_CALL_NOCHECK(commandListManager.lock()->releaseSubmittedKernels()); if (currentExecution) { @@ -175,6 +175,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { for (auto &event : syncPoints) { event->release(); } +} catch (...) { + UR_LOG(DEBUG, "ur_exp_command_buffer_handle_t_ destructor failed with: {}", + exceptionToResult(std::current_exception())); } ur_result_t ur_exp_command_buffer_handle_t_::applyUpdateCommands( diff --git a/source/adapters/level_zero/v2/memory.cpp b/source/adapters/level_zero/v2/memory.cpp index 9c39a97d16..1b6855e630 100644 --- a/source/adapters/level_zero/v2/memory.cpp +++ b/source/adapters/level_zero/v2/memory.cpp @@ -55,14 +55,11 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/, ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( ur_context_handle_t hContext, void *hostPtr, size_t size, - host_ptr_action_t hostPtrAction, device_access_mode_t accessMode) + device_access_mode_t accessMode) : ur_mem_buffer_t(hContext, size, accessMode) { - bool hostPtrImported = false; - if (hostPtrAction == host_ptr_action_t::import) { - hostPtrImported = - maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, - hContext->getZeHandle(), hostPtr, size); - } + bool hostPtrImported = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); if (hostPtrImported) { this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { @@ -201,8 +198,23 @@ ur_discrete_buffer_handle_t::ur_discrete_buffer_handle_t( device_access_mode_t accessMode) : ur_mem_buffer_t(hContext, size, accessMode), deviceAllocations(hContext->getPlatform()->getNumDevices()), - activeAllocationDevice(nullptr), mapToPtr(hostPtr), 
hostAllocations() { + activeAllocationDevice(nullptr), mapToPtr(nullptr, nullptr), + hostAllocations() { if (hostPtr) { + // Try importing the pointer to speed up memory copies for map/unmap + bool hostPtrImported = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); + + if (hostPtrImported) { + mapToPtr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { + ZeUSMImport.doZeUSMRelease( + hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr); + }); + } else { + mapToPtr = usm_unique_ptr_t(hostPtr, [](void *) {}); + } + auto initialDevice = hContext->getDevices()[0]; UR_CALL_THROWS(migrateBufferTo(initialDevice, hostPtr, size)); } @@ -305,18 +317,18 @@ void *ur_discrete_buffer_handle_t::mapHostPtr(ur_map_flags_t flags, TRACK_SCOPE_LATENCY("ur_discrete_buffer_handle_t::mapHostPtr"); // TODO: use async alloc? - void *ptr = mapToPtr; + void *ptr = mapToPtr.get(); if (!ptr) { UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &ptr)); } usm_unique_ptr_t mappedPtr = - usm_unique_ptr_t(ptr, [ownsAlloc = bool(mapToPtr), this](void *p) { + usm_unique_ptr_t(ptr, [ownsAlloc = !bool(mapToPtr), this](void *p) { if (ownsAlloc) { auto ret = hContext->getDefaultUSMPool()->free(p); if (ret != UR_RESULT_SUCCESS) { - UR_LOG(ERR, "Failed to mapped memory: {}", ret); + UR_LOG(ERR, "Failed to free mapped memory: {}", ret); } } }); @@ -541,16 +553,16 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, // ignore the flag for now. } + if (flags & UR_MEM_FLAG_USE_HOST_POINTER) { + // To speed up copies, we always import the host ptr to USM memory + } + void *hostPtr = pProperties ? pProperties->pHost : nullptr; auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags); if (useHostBuffer(hContext)) { - auto hostPtrAction = - flags & UR_MEM_FLAG_USE_HOST_POINTER - ? 
ur_integrated_buffer_handle_t::host_ptr_action_t::import - : ur_integrated_buffer_handle_t::host_ptr_action_t::copy; *phBuffer = ur_mem_handle_t_::create( - hContext, hostPtr, size, hostPtrAction, accessMode); + hContext, hostPtr, size, accessMode); } else { *phBuffer = ur_mem_handle_t_::create( hContext, hostPtr, size, accessMode); diff --git a/source/adapters/level_zero/v2/memory.hpp b/source/adapters/level_zero/v2/memory.hpp index 7201df57c9..61b0a00f40 100644 --- a/source/adapters/level_zero/v2/memory.hpp +++ b/source/adapters/level_zero/v2/memory.hpp @@ -28,7 +28,7 @@ struct ur_mem_buffer_t : ur_object { enum class device_access_mode_t { read_write, read_only, write_only }; ur_mem_buffer_t(ur_context_handle_t hContext, size_t size, - device_access_mode_t accesMode); + device_access_mode_t accessMode); virtual ~ur_mem_buffer_t() = default; virtual ur_shared_mutex &getMutex(); @@ -90,14 +90,11 @@ struct ur_usm_handle_t : ur_mem_buffer_t { // For integrated devices the buffer has been allocated in host memory // and can be accessed by the device without copying. struct ur_integrated_buffer_handle_t : ur_mem_buffer_t { - enum class host_ptr_action_t { import, copy }; - ur_integrated_buffer_handle_t(ur_context_handle_t hContext, void *hostPtr, - size_t size, host_ptr_action_t useHostPtr, - device_access_mode_t accesMode); + size_t size, device_access_mode_t accessMode); ur_integrated_buffer_handle_t(ur_context_handle_t hContext, void *hostPtr, - size_t size, device_access_mode_t accesMode, + size_t size, device_access_mode_t accessMode, bool ownHostPtr); ~ur_integrated_buffer_handle_t(); @@ -134,13 +131,13 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t { // first device in the context. Otherwise, the buffer is allocated on // firt getDevicePtr call. 
ur_discrete_buffer_handle_t(ur_context_handle_t hContext, void *hostPtr, - size_t size, device_access_mode_t accesMode); + size_t size, device_access_mode_t accessMode); ~ur_discrete_buffer_handle_t(); // Create buffer on top of existing device memory. ur_discrete_buffer_handle_t(ur_context_handle_t hContext, ur_device_handle_t hDevice, void *devicePtr, - size_t size, device_access_mode_t accesMode, + size_t size, device_access_mode_t accessMode, void *writeBackMemory, bool ownDevicePtr); void *getDevicePtr(ur_device_handle_t, device_access_mode_t, size_t offset, @@ -166,7 +163,7 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t { void *writeBackPtr = nullptr; // If not null, mapHostPtr should map memory to this ptr - void *mapToPtr = nullptr; + usm_unique_ptr_t mapToPtr; std::vector hostAllocations; @@ -178,7 +175,7 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t { struct ur_shared_buffer_handle_t : ur_mem_buffer_t { ur_shared_buffer_handle_t(ur_context_handle_t hContext, void *devicePtr, - size_t size, device_access_mode_t accesMode, + size_t size, device_access_mode_t accessMode, bool ownDevicePtr); void *getDevicePtr(ur_device_handle_t, device_access_mode_t, size_t offset, @@ -196,7 +193,7 @@ struct ur_shared_buffer_handle_t : ur_mem_buffer_t { struct ur_mem_sub_buffer_t : ur_mem_buffer_t { ur_mem_sub_buffer_t(ur_mem_handle_t hParent, size_t offset, size_t size, - device_access_mode_t accesMode); + device_access_mode_t accessMode); ~ur_mem_sub_buffer_t(); void *getDevicePtr(ur_device_handle_t, device_access_mode_t, size_t offset, diff --git a/source/adapters/level_zero/v2/usm.cpp b/source/adapters/level_zero/v2/usm.cpp index 216e0b0e25..0d49a8ad0a 100644 --- a/source/adapters/level_zero/v2/usm.cpp +++ b/source/adapters/level_zero/v2/usm.cpp @@ -188,6 +188,68 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t hContext, } } +ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + 
ur_usm_pool_desc_t *pPoolDesc) + : hContext(hContext) { + // TODO: handle UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK from pPoolDesc + auto disjointPoolConfigs = initializeDisjointPoolConfig(); + + if (disjointPoolConfigs.has_value()) { + if (auto limits = find_stype_node(pPoolDesc)) { + for (auto &config : disjointPoolConfigs.value().Configs) { + config.MaxPoolableSize = limits->maxPoolableSize; + config.SlabMinSize = limits->minDriverAllocSize; + } + } + } else { + // If pooling is disabled, do nothing. + UR_LOG(INFO, "USM pooling is disabled. Skiping pool limits adjustment."); + } + + // Create pool descriptor for single device provided + std::vector descriptors; + { + auto &desc = descriptors.emplace_back(); + desc.poolHandle = this; + desc.hContext = hContext; + desc.hDevice = hDevice; + desc.type = UR_USM_TYPE_DEVICE; + } + { + auto &desc = descriptors.emplace_back(); + desc.poolHandle = this; + desc.hContext = hContext; + desc.hDevice = hDevice; + desc.type = UR_USM_TYPE_SHARED; + desc.deviceReadOnly = false; + } + { + auto &desc = descriptors.emplace_back(); + desc.poolHandle = this; + desc.hContext = hContext; + desc.hDevice = hDevice; + desc.type = UR_USM_TYPE_SHARED; + desc.deviceReadOnly = true; + } + + for (auto &desc : descriptors) { + std::unique_ptr usmPool; + if (disjointPoolConfigs.has_value()) { + auto &poolConfig = + disjointPoolConfigs.value().Configs[descToDisjoinPoolMemType(desc)]; + auto pool = usm::makeDisjointPool(makeProvider(desc), poolConfig); + usmPool = std::make_unique(this, std::move(pool)); + } else { + auto pool = usm::makeProxyPool(makeProvider(desc)); + usmPool = std::make_unique(this, std::move(pool)); + } + UMF_CALL_THROWS( + umfPoolSetTag(usmPool->umfPool.get(), usmPool.get(), nullptr)); + poolManager.addPool(desc, std::move(usmPool)); + } +} + ur_context_handle_t ur_usm_pool_handle_t_::getContextHandle() const { return hContext; } @@ -358,27 +420,27 @@ size_t ur_usm_pool_handle_t_::getTotalReservedSize() { } size_t 
ur_usm_pool_handle_t_::getPeakReservedSize() { - size_t totalAllocatedSize = 0; + size_t maxPeakSize = 0; umf_result_t umfRet = UMF_RESULT_SUCCESS; poolManager.forEachPool([&](UsmPool *p) { umf_memory_provider_handle_t hProvider = nullptr; - size_t allocatedSize = 0; + size_t peakSize = 0; umfRet = umfPoolGetMemoryProvider(p->umfPool.get(), &hProvider); if (umfRet != UMF_RESULT_SUCCESS) { return false; } - umfRet = umfCtlGet("umf.provider.by_handle.{}.stats.peak_memory", - &allocatedSize, sizeof(allocatedSize), hProvider); + umfRet = umfCtlGet("umf.provider.by_handle.{}.stats.peak_memory", &peakSize, + sizeof(peakSize), hProvider); if (umfRet != UMF_RESULT_SUCCESS) { return false; } - totalAllocatedSize += allocatedSize; + maxPeakSize = std::max(maxPeakSize, peakSize); return true; }); - return umfRet == UMF_RESULT_SUCCESS ? totalAllocatedSize : 0; + return umfRet == UMF_RESULT_SUCCESS ? maxPeakSize : 0; } size_t ur_usm_pool_handle_t_::getTotalUsedSize() { @@ -460,6 +522,32 @@ ur_result_t urUSMPoolGetInfo( return exceptionToResult(std::current_exception()); } +ur_result_t urUSMPoolCreateExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_usm_pool_desc_t *pPoolDesc, + ur_usm_pool_handle_t *pPool) try { + *pPool = new ur_usm_pool_handle_t_(hContext, hDevice, pPoolDesc); + hContext->addUsmPool(*pPool); + return UR_RESULT_SUCCESS; +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urUSMPoolDestroyExp(ur_context_handle_t, ur_device_handle_t, + ur_usm_pool_handle_t hPool) try { + if (hPool->RefCount.release()) { + hPool->getContextHandle()->removeUsmPool(hPool); + delete hPool; + } + return UR_RESULT_SUCCESS; +} catch (umf_result_t e) { + return umf::umf2urResult(e); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + ur_result_t urUSMPoolGetInfoExp(ur_usm_pool_handle_t hPool, ur_usm_pool_info_t propName, void *pPropValue, size_t *pPropSizeRet) { @@ -497,6 +585,28 @@ ur_result_t urUSMPoolGetInfoExp(ur_usm_pool_handle_t hPool, return UR_RESULT_SUCCESS; } +ur_result_t urUSMPoolSetInfoExp(ur_usm_pool_handle_t /*hPool*/, + ur_usm_pool_info_t propName, + void * /*pPropValue*/, size_t propSize) { + if (propSize < sizeof(size_t)) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + + switch (propName) { + // TODO: Support for pool release threshold and maximum size hints. + case UR_USM_POOL_INFO_RELEASE_THRESHOLD_EXP: + case UR_USM_POOL_INFO_MAXIMUM_SIZE_EXP: + // TODO: Allow user to overwrite pool peak statistics. + case UR_USM_POOL_INFO_RESERVED_HIGH_EXP: + case UR_USM_POOL_INFO_USED_HIGH_EXP: + break; + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + + return UR_RESULT_SUCCESS; +} + ur_result_t urUSMPoolGetDefaultDevicePoolExp(ur_context_handle_t hContext, ur_device_handle_t, ur_usm_pool_handle_t *pPool) { diff --git a/source/adapters/level_zero/v2/usm.hpp b/source/adapters/level_zero/v2/usm.hpp index 5b498b361c..825ecb5fcd 100644 --- a/source/adapters/level_zero/v2/usm.hpp +++ b/source/adapters/level_zero/v2/usm.hpp @@ -55,6 +55,9 @@ struct AllocationStats { struct ur_usm_pool_handle_t_ : ur_object { ur_usm_pool_handle_t_(ur_context_handle_t hContext, ur_usm_pool_desc_t *pPoolDes); + ur_usm_pool_handle_t_(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_usm_pool_desc_t *pPoolDes); ur_context_handle_t getContextHandle() const; diff --git a/source/adapters/level_zero/virtual_mem.cpp b/source/adapters/level_zero/virtual_mem.cpp index f61c8fd43f..0488d21023 100644 --- a/source/adapters/level_zero/virtual_mem.cpp +++ b/source/adapters/level_zero/virtual_mem.cpp @@ -23,8 +23,8 @@ namespace ur::level_zero { ur_result_t urVirtualMemGranularityGetInfo( ur_context_handle_t hContext, ur_device_handle_t 
hDevice, - ur_virtual_mem_granularity_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) { + size_t allocationSize, ur_virtual_mem_granularity_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { case UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM: @@ -34,7 +34,8 @@ ur_result_t urVirtualMemGranularityGetInfo( // aligned size. size_t PageSize; ZE2UR_CALL(zeVirtualMemQueryPageSize, - (hContext->getZeHandle(), hDevice->ZeDevice, 1, &PageSize)); + (hContext->getZeHandle(), hDevice->ZeDevice, allocationSize, + &PageSize)); return ReturnValue(PageSize); } default: diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index fb5529f95b..3ab79444f3 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -2729,6 +2729,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. 
@@ -2744,7 +2747,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( ur_result_t result = UR_RESULT_SUCCESS; ur_virtual_mem_granularity_get_info_params_t params = { - &hContext, &hDevice, &propName, &propSize, &pPropValue, &pPropSizeRet}; + &hContext, &hDevice, &allocationSize, &propName, + &propSize, &pPropValue, &pPropSizeRet}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( diff --git a/source/adapters/native_cpu/virtual_mem.cpp b/source/adapters/native_cpu/virtual_mem.cpp index 131b480ac1..6697902564 100644 --- a/source/adapters/native_cpu/virtual_mem.cpp +++ b/source/adapters/native_cpu/virtual_mem.cpp @@ -13,8 +13,8 @@ #include "physical_mem.hpp" UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( - ur_context_handle_t, ur_device_handle_t, ur_virtual_mem_granularity_info_t, - size_t, void *, size_t *) { + ur_context_handle_t, ur_device_handle_t, size_t, + ur_virtual_mem_granularity_info_t, size_t, void *, size_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/offload/device.cpp b/source/adapters/offload/device.cpp index ebe0405b89..76ea3e6c4f 100644 --- a/source/adapters/offload/device.cpp +++ b/source/adapters/offload/device.cpp @@ -67,6 +67,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_PLATFORM: return ReturnValue(hDevice->Platform); break; + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: + case UR_DEVICE_INFO_USM_HOST_SUPPORT: case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: return ReturnValue(UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS); case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: @@ -76,6 +78,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(uint32_t{1}); case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: return ReturnValue(uint32_t{3}); + case UR_DEVICE_INFO_COMPILER_AVAILABLE: + return ReturnValue(true); + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: + // 
TODO: Implement subgroups in Offload + return ReturnValue(1); // Unimplemented features case UR_DEVICE_INFO_PROGRAM_SET_SPECIALIZATION_CONSTANTS: case UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT: @@ -83,12 +90,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: case UR_DEVICE_INFO_IMAGE_SUPPORT: case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: + case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + // TODO: Atomic queries in Offload + case UR_DEVICE_INFO_ATOMIC_64: + case UR_DEVICE_INFO_IMAGE_SRGB: + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: + case UR_DEVICE_INFO_LINKER_AVAILABLE: return ReturnValue(false); case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: - case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: - case UR_DEVICE_INFO_USM_HOST_SUPPORT: case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: return ReturnValue(uint32_t{0}); + case UR_DEVICE_INFO_QUEUE_PROPERTIES: + case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES: + return ReturnValue(0); + case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: { + if (pPropSizeRet) { + *pPropSizeRet = 0; + } + return UR_RESULT_SUCCESS; + } default: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } diff --git a/source/adapters/offload/program.cpp b/source/adapters/offload/program.cpp index cf497c571f..e889f59ef8 100644 --- a/source/adapters/offload/program.cpp +++ b/source/adapters/offload/program.cpp @@ -125,6 +125,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, + size_t length, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + // Liboffload consumes both IR and binaries through the same entrypoint + return urProgramCreateWithBinary(hContext, 1, &hContext->Device, &length, + reinterpret_cast(&pIL), + pProperties, phProgram); +} + UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t, 
ur_program_handle_t, const char *) { @@ -147,12 +157,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(ur_context_handle_t, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urProgramCreateWithIL(ur_context_handle_t, const void *, size_t, - const ur_program_properties_t *, ur_program_handle_t *) { - return UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE; -} - UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { diff --git a/source/adapters/opencl/virtual_mem.cpp b/source/adapters/opencl/virtual_mem.cpp index 7c411d9b7b..c7db068eca 100644 --- a/source/adapters/opencl/virtual_mem.cpp +++ b/source/adapters/opencl/virtual_mem.cpp @@ -13,8 +13,8 @@ #include "physical_mem.hpp" UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( - ur_context_handle_t, ur_device_handle_t, ur_virtual_mem_granularity_info_t, - size_t, void *, size_t *) { + ur_context_handle_t, ur_device_handle_t, size_t, + ur_virtual_mem_granularity_info_t, size_t, void *, size_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/common/cuda-hip/stream_queue.hpp b/source/common/cuda-hip/stream_queue.hpp index 2547070fa9..be52421437 100644 --- a/source/common/cuda-hip/stream_queue.hpp +++ b/source/common/cuda-hip/stream_queue.hpp @@ -30,47 +30,90 @@ struct stream_queue_t { static constexpr int DefaultNumComputeStreams = CS; static constexpr int DefaultNumTransferStreams = TS; + // Mutex to guard modifications to the ComputeStreams vector, and + // NumComputeStreams. + std::mutex ComputeStreamMutex; std::vector ComputeStreams; + // Number of compute streams that have been created + unsigned int NumComputeStreams{0}; + + // Mutex to guard modifications to the TransferStreams vector, and + // NumTransferStreams. 
+ std::mutex TransferStreamMutex; std::vector TransferStreams; + // Number of transfer streams that have been created + unsigned int NumTransferStreams{0}; + + // The stream indices are incremented every time we return a stream. This + // means that they encode both the index of the next stream in the round + // robin, as well as which iteration of the round robin we're on. Dividing + // the stream index by the size of the associated stream vector will give the + // number of round robins we've done as quotient, and the index of the next + // stream to use as remainder. + std::atomic_uint32_t ComputeStreamIndex{0}; + std::atomic_uint32_t TransferStreamIndex{0}; + + // The LastSync indices keep track of the index based on ComputeStreamIndex + // or TransferStreamIndex of the last stream that was synchronized during a + // syncStreams operation. + unsigned int LastSyncComputeStreams{0}; + unsigned int LastSyncTransferStreams{0}; + // Stream used for recording EvQueue, which holds information about when the // command in question is enqueued on host, as opposed to started. It is // created only if profiling is enabled - either for queue or per event. native_type HostSubmitTimeStream{0}; + // Flag to keep track of the creation og HostSubmitTimeStream, it is created + // either in the queue constructor when profiling is enabled or whenever it + // is requested for the first time through timestamp entry points. std::once_flag HostSubmitTimeStreamFlag; - // delay_compute_ keeps track of which streams have been recently reused and + + // DelayCompute keeps track of which streams have been recently reused and // their next use should be delayed. If a stream has been recently reused it // will be skipped the next time it would be selected round-robin style. When // skipped, its delay flag is cleared. std::vector DelayCompute; - // keep track of which streams have applied barrier + + // ComputeStreamSyncMutex is used to guard compute streams when they are + // being re-used. 
+ // + // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be + // locked at the same time, ComputeStreamSyncMutex should be locked first + // to avoid deadlocks. + std::mutex ComputeStreamSyncMutex; + + // Guards barrier insertion in urEnqueueEventsWaitWithBarrier. + std::mutex BarrierMutex; + BarrierEventT BarrierEvent = nullptr; + BarrierEventT BarrierTmpEvent = nullptr; + + // Keep track of which streams have applied barrier. std::vector ComputeAppliedBarrier; std::vector TransferAppliedBarrier; - ur_context_handle_t_ *Context; - ur_device_handle_t_ *Device; + + ur_context_handle_t Context; + ur_device_handle_t Device; + + // Reference count for the queue object. ur::RefCount RefCount; + + // Event count used to give events an ordering used in the event class + // forLatestEvents. std::atomic_uint32_t EventCount{0}; - std::atomic_uint32_t ComputeStreamIndex{0}; - std::atomic_uint32_t TransferStreamIndex{0}; - unsigned int NumComputeStreams{0}; - unsigned int NumTransferStreams{0}; - unsigned int LastSyncComputeStreams{0}; - unsigned int LastSyncTransferStreams{0}; + + // Queue flags in the native API format as well as UR format. unsigned int Flags; ur_queue_flags_t URFlags; + + // Priority of this queue, matches underlying API priority. int Priority; - // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be - // locked at the same time, ComputeStreamSyncMutex should be locked first - // to avoid deadlocks - std::mutex ComputeStreamSyncMutex; - std::mutex ComputeStreamMutex; - std::mutex TransferStreamMutex; - std::mutex BarrierMutex; + + // Tracks if the queue owns the underlying native streams, this may happen + // for queues created from interop. 
bool HasOwnership; - BarrierEventT BarrierEvent = nullptr; - BarrierEventT BarrierTmpEvent = nullptr; - stream_queue_t(bool IsOutOfOrder, ur_context_handle_t_ *Context, - ur_device_handle_t_ *Device, unsigned int Flags, + stream_queue_t(bool IsOutOfOrder, ur_context_handle_t Context, + ur_device_handle_t Device, unsigned int Flags, ur_queue_flags_t URFlags, int Priority) : ComputeStreams(IsOutOfOrder ? DefaultNumComputeStreams : 1), TransferStreams(IsOutOfOrder ? DefaultNumTransferStreams : 0), @@ -87,16 +130,16 @@ struct stream_queue_t { } } - // Create a queue from a native handle - stream_queue_t(native_type stream, ur_context_handle_t_ *Context, - ur_device_handle_t_ *Device, unsigned int Flags, + // Create a queue from a native handle. + stream_queue_t(native_type stream, ur_context_handle_t Context, + ur_device_handle_t Device, unsigned int Flags, ur_queue_flags_t URFlags, bool BackendOwns) - : ComputeStreams(1, stream), TransferStreams(0), + : ComputeStreams(1, stream), NumComputeStreams{1}, TransferStreams(0), DelayCompute(this->ComputeStreams.size(), false), ComputeAppliedBarrier(this->ComputeStreams.size()), TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, - Device{Device}, NumComputeStreams{1}, Flags(Flags), URFlags(URFlags), - Priority(0), HasOwnership{BackendOwns} { + Device{Device}, Flags(Flags), URFlags(URFlags), Priority(0), + HasOwnership{BackendOwns} { urContextRetain(Context); // Create timing stream if profiling is enabled. @@ -107,6 +150,7 @@ struct stream_queue_t { ~stream_queue_t() { urContextRelease(Context); } + // Methods defined by the specific adapters. 
void computeStreamWaitForBarrierIfNeeded(native_type Strean, uint32_t StreamI); void transferStreamWaitForBarrierIfNeeded(native_type Stream, @@ -206,9 +250,6 @@ struct stream_queue_t { return Result; } - native_type get() { return getNextComputeStream(); }; - ur_device_handle_t getDevice() const noexcept { return Device; }; - native_type getHostSubmitTimeStream() { return HostSubmitTimeStream; } bool hasBeenSynchronized(uint32_t StreamToken) { @@ -345,7 +386,8 @@ struct stream_queue_t { } } - ur_context_handle_t_ *getContext() const { return Context; }; + ur_device_handle_t getDevice() const noexcept { return Device; }; + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t getNextEventId() noexcept { return ++EventCount; } diff --git a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp index 3539a2d2a5..f8f7c58bf5 100644 --- a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp @@ -272,10 +272,13 @@ size_t GetKernelPrivateMemorySize(ur_kernel_handle_t Kernel, size_t GetVirtualMemGranularity(ur_context_handle_t Context, ur_device_handle_t Device) { size_t Size; + const size_t allocationSize = + 1; // TODO: pass the caller's actual allocation size once it is plumbed here [[maybe_unused]] auto Result = getContext()->urDdiTable.VirtualMem.pfnGranularityGetInfo( - Context, Device, UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, - sizeof(Size), &Size, nullptr); + Context, Device, allocationSize, + UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, sizeof(Size), &Size, + nullptr); assert(Result == UR_RESULT_SUCCESS); return Size; } diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index e0d57228e4..0abbb7604c 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -2236,6 +2236,9 @@ __urdlllocal 
ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -2255,7 +2258,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; ur_virtual_mem_granularity_get_info_params_t params = { - &hContext, &hDevice, &propName, &propSize, &pPropValue, &pPropSizeRet}; + &hContext, &hDevice, &allocationSize, &propName, + &propSize, &pPropValue, &pPropSizeRet}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, "urVirtualMemGranularityGetInfo", ¶ms); @@ -2263,8 +2267,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( auto &logger = getContext()->logger; UR_LOG_L(logger, INFO, " ---> urVirtualMemGranularityGetInfo\n"); - ur_result_t result = pfnGranularityGetInfo( - hContext, hDevice, propName, propSize, pPropValue, pPropSizeRet); + ur_result_t result = + pfnGranularityGetInfo(hContext, hDevice, allocationSize, propName, + propSize, pPropValue, pPropSizeRet); getContext()->notify_end(UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, "urVirtualMemGranularityGetInfo", ¶ms, &result, diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 979eb3ef22..b61356afd2 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -2182,6 +2182,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. 
+ size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -2228,8 +2231,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( URLOG_CTX_INVALID_REFERENCE(hDevice); } - ur_result_t result = pfnGranularityGetInfo( - hContext, hDevice, propName, propSize, pPropValue, pPropSizeRet); + ur_result_t result = + pfnGranularityGetInfo(hContext, hDevice, allocationSize, propName, + propSize, pPropValue, pPropSizeRet); return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 1cf13837bd..74712c5c4d 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -1238,6 +1238,9 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -1258,8 +1261,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( return UR_RESULT_ERROR_UNINITIALIZED; // forward to device-platform - return pfnGranularityGetInfo(hContext, hDevice, propName, propSize, - pPropValue, pPropSizeRet); + return pfnGranularityGetInfo(hContext, hDevice, allocationSize, propName, + propSize, pPropValue, pPropSizeRet); } /////////////////////////////////////////////////////////////////////////////// diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 1261145424..cad6de4dd9 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -2725,6 +2725,9 @@ ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. 
ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. @@ -2742,8 +2745,8 @@ ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( if (nullptr == pfnGranularityGetInfo) return UR_RESULT_ERROR_UNINITIALIZED; - return pfnGranularityGetInfo(hContext, hDevice, propName, propSize, - pPropValue, pPropSizeRet); + return pfnGranularityGetInfo(hContext, hDevice, allocationSize, propName, + propSize, pPropValue, pPropSizeRet); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/ur_api.cpp b/source/ur_api.cpp index cc69811f57..426ca95027 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -2410,6 +2410,9 @@ ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( /// device is null then the granularity is suitable for all devices in /// context. ur_device_handle_t hDevice, + /// [in] allocation size in bytes for which the alignment is being + /// queried. + size_t allocationSize, /// [in] type of the info to query. ur_virtual_mem_granularity_info_t propName, /// [in] size in bytes of the memory pointed to by pPropValue. 
diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index 327728bb5a..fa3eb3f4b5 100644 --- a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -491,11 +491,11 @@ struct urEnqueueKernelLaunchWithVirtualMemory : uur::urKernelExecutionTest { GTEST_SKIP() << "Virtual memory is not supported."; } + alloc_size = 1024; ASSERT_SUCCESS(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, + context, device, alloc_size, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, sizeof(granularity), &granularity, nullptr)); - alloc_size = 1024; virtual_page_size = uur::RoundUpToNearestFactor(alloc_size, granularity); ASSERT_SUCCESS(urPhysicalMemCreate(context, device, virtual_page_size, diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index b67eddd8f8..fff0be4a01 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -976,9 +976,12 @@ struct urVirtualMemGranularityTest : urContextTest { GTEST_SKIP() << "Virtual memory is not supported."; } + const size_t allocationSize = + 1; // assuming allocations in test are small enough and minimal granularity is used ASSERT_SUCCESS(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - sizeof(granularity), &granularity, nullptr)); + context, device, allocationSize, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, sizeof(granularity), + &granularity, nullptr)); } size_t granularity; }; @@ -995,10 +998,12 @@ struct urVirtualMemGranularityTestWithParam : urContextTestWithParam { if (!virtual_memory_support) { GTEST_SKIP() << "Virtual memory is not supported."; } - + const size_t allocationSize = + 1; // assuming allocations in test are small and use smallest granularity ASSERT_SUCCESS(urVirtualMemGranularityGetInfo( - this->context, 
this->device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - sizeof(granularity), &granularity, nullptr)); + this->context, this->device, allocationSize, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, sizeof(granularity), + &granularity, nullptr)); ASSERT_NE(granularity, 0); } diff --git a/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp b/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp index 0507b8903a..cd4e3ed076 100644 --- a/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp +++ b/test/conformance/virtual_memory/urVirtualMemGranularityGetInfo.cpp @@ -20,89 +20,96 @@ struct urVirtualMemGranularityGetInfoTest : uur::urContextTest { UUR_INSTANTIATE_DEVICE_TEST_SUITE(urVirtualMemGranularityGetInfoTest); -TEST_P(urVirtualMemGranularityGetInfoTest, SuccessMinimum) { +void urVirtualMemGranularityGetInfoTest_successCase( + ur_context_handle_t context, ur_device_handle_t device, + const ur_virtual_mem_granularity_info_t property_name, + const size_t allocation_size) { size_t property_size = 0; - const ur_virtual_mem_granularity_info_t property_name = - UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM; ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urVirtualMemGranularityGetInfo(context, device, property_name, 0, nullptr, - &property_size), + urVirtualMemGranularityGetInfo(context, device, allocation_size, + property_name, 0, nullptr, &property_size), property_name); ASSERT_EQ(sizeof(size_t), property_size); size_t property_value = 0; ASSERT_QUERY_RETURNS_VALUE( - urVirtualMemGranularityGetInfo(context, device, property_name, - property_size, &property_value, nullptr), + urVirtualMemGranularityGetInfo(context, device, allocation_size, + property_name, property_size, + &property_value, nullptr), property_value); ASSERT_GT(property_value, 0); } -TEST_P(urVirtualMemGranularityGetInfoTest, SuccessRecommended) { - size_t property_size = 0; - const ur_virtual_mem_granularity_info_t property_name = - UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED; 
+TEST_P(urVirtualMemGranularityGetInfoTest, SuccessMinimum_smallAllocation) { + urVirtualMemGranularityGetInfoTest_successCase( + context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 1); +} - ASSERT_SUCCESS_OR_OPTIONAL_QUERY( - urVirtualMemGranularityGetInfo(context, device, property_name, 0, nullptr, - &property_size), - property_name); - ASSERT_EQ(sizeof(size_t), property_size); +TEST_P(urVirtualMemGranularityGetInfoTest, SuccessMinimum_largeAllocation) { + urVirtualMemGranularityGetInfoTest_successCase( + context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 191439360); +} - size_t property_value = 0; - ASSERT_QUERY_RETURNS_VALUE( - urVirtualMemGranularityGetInfo(context, device, property_name, - property_size, &property_value, nullptr), - property_value); +TEST_P(urVirtualMemGranularityGetInfoTest, SuccessRecommended_smallAllocation) { + urVirtualMemGranularityGetInfoTest_successCase( + context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, 19); +} - ASSERT_GT(property_value, 0); +TEST_P(urVirtualMemGranularityGetInfoTest, SuccessRecommended_largeAllocation) { + urVirtualMemGranularityGetInfoTest_successCase( + context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED, 211739367); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidNullHandleContext) { size_t property_size = 0; - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - nullptr, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - 0, nullptr, &property_size), - UR_RESULT_ERROR_INVALID_NULL_HANDLE); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(nullptr, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 0, + nullptr, &property_size), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidEnumeration) { size_t property_size = 0; ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, + context, device, 1, UR_VIRTUAL_MEM_GRANULARITY_INFO_FORCE_UINT32, 0, nullptr, &property_size), UR_RESULT_ERROR_INVALID_ENUMERATION); } 
TEST_P(urVirtualMemGranularityGetInfoTest, InvalidNullPointerPropSizeRet) { - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - 0, nullptr, nullptr), - UR_RESULT_ERROR_INVALID_NULL_POINTER); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(context, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 0, + nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidNullPointerPropValue) { - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - sizeof(size_t), nullptr, nullptr), - UR_RESULT_ERROR_INVALID_NULL_POINTER); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(context, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, + sizeof(size_t), nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidPropSizeZero) { size_t minimum = 0; - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - 0, &minimum, nullptr), - UR_RESULT_ERROR_INVALID_SIZE); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(context, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, 0, + &minimum, nullptr), + UR_RESULT_ERROR_INVALID_SIZE); } TEST_P(urVirtualMemGranularityGetInfoTest, InvalidSizePropSizeSmall) { size_t minimum = 0; - ASSERT_EQ_RESULT(urVirtualMemGranularityGetInfo( - context, device, UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, - sizeof(size_t) - 1, &minimum, nullptr), - UR_RESULT_ERROR_INVALID_SIZE); + ASSERT_EQ_RESULT( + urVirtualMemGranularityGetInfo(context, device, 1, + UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM, + sizeof(size_t) - 1, &minimum, nullptr), + UR_RESULT_ERROR_INVALID_SIZE); }