From b209eba86e4adf5deefa89603eec80d63ee05b8a Mon Sep 17 00:00:00 2001 From: "Zhao, Maosu" Date: Wed, 27 Nov 2024 00:48:27 -0800 Subject: [PATCH] [DevASAN] Do allocation with USM pool to reduce memory overhead Release mapped physical memory according to its dependency may cause some problems. So, we decide to use USM pool to do allocation to reduce memory overhead. --- .../sanitizer/asan/asan_interceptor.cpp | 54 +++++++------- .../sanitizer/asan/asan_interceptor.hpp | 7 +- .../layers/sanitizer/asan/asan_shadow.cpp | 72 +++++++------------ .../layers/sanitizer/asan/asan_shadow.hpp | 11 +-- 4 files changed, 64 insertions(+), 80 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 8d2a9e5ee2..1a1185e1ba 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -36,7 +36,8 @@ AsanInterceptor::~AsanInterceptor() { // We must release these objects before releasing adapters, since // they may use the adapter in their destructor for (const auto &[_, DeviceInfo] : m_DeviceMap) { - DeviceInfo->Shadow->Destory(); + [[maybe_unused]] auto URes = DeviceInfo->Shadow->Destory(); + assert(URes == UR_RESULT_SUCCESS); } m_Quarantine = nullptr; @@ -96,6 +97,10 @@ ur_result_t AsanInterceptor::allocateMemory(ur_context_handle_t Context, void *Allocated = nullptr; + if (Pool == nullptr) { + Pool = ContextInfo->getUSMPool(); + } + if (Type == AllocType::DEVICE_USM) { UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( Context, Device, Properties, Pool, NeededSize, &Allocated)); @@ -228,16 +233,6 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, ContextInfo->Stats.UpdateUSMRealFreed( ToFreeAllocInfo->AllocSize, ToFreeAllocInfo->getRedzoneSize()); - if (ToFreeAllocInfo->Type == AllocType::HOST_USM) { - for (auto &Device : ContextInfo->DeviceList) { - UR_CALL(getDeviceInfo(Device)->Shadow->ReleaseShadow( - ToFreeAllocInfo)); - } - } else { - UR_CALL(getDeviceInfo(ToFreeAllocInfo->Device) - ->Shadow->ReleaseShadow(ToFreeAllocInfo)); - } - UR_CALL(getContext()->urDdiTable.USM.pfnFree( Context, (void *)(ToFreeAllocInfo->AllocBegin))); @@ -436,12 +431,6 @@ ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) { auto ProgramInfo = getProgramInfo(Program); assert(ProgramInfo != nullptr && "unregistered program!"); - for (auto AI : ProgramInfo->AllocInfoForGlobals) { - UR_CALL(getDeviceInfo(AI->Device)->Shadow->ReleaseShadow(AI)); - m_AllocationMap.erase(AI->AllocBegin); - } - ProgramInfo->AllocInfoForGlobals.clear(); - ProgramInfo->InstrumentedKernels.clear(); return UR_RESULT_SUCCESS; @@ -560,10 +549,6 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { {}}); ContextInfo->insertAllocInfo({Device}, AI); - ProgramInfo->AllocInfoForGlobals.emplace(AI); - - std::scoped_lock Guard(m_AllocationMapMutex); - m_AllocationMap.emplace(AI->AllocBegin, std::move(AI)); } } @@ -887,9 +872,14 @@ bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const { ContextInfo::~ContextInfo() { Stats.Print(Handle); - [[maybe_unused]] auto Result = - getContext()->urDdiTable.Context.pfnRelease(Handle); - assert(Result == UR_RESULT_SUCCESS); + [[maybe_unused]] ur_result_t URes; + if (USMPool) { + URes = getContext()->urDdiTable.USM.pfnPoolRelease(USMPool); + assert(URes == UR_RESULT_SUCCESS); + } + + URes = getContext()->urDdiTable.Context.pfnRelease(Handle); + assert(URes == UR_RESULT_SUCCESS); // check memory leaks if (getAsanInterceptor()->getOptions().DetectLeaks && @@ -905,6 +895,22 @@ ContextInfo::~ContextInfo() { } } +ur_usm_pool_handle_t ContextInfo::getUSMPool() { + std::call_once(PoolInit, [this]() { + ur_usm_pool_desc_t Desc{UR_STRUCTURE_TYPE_USM_POOL_DESC, nullptr, 0}; + auto URes = + getContext()->urDdiTable.USM.pfnPoolCreate(Handle, &Desc, &USMPool); + if (URes != UR_RESULT_SUCCESS && + URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + getContext()->logger.warning( + "Failed to create USM pool, the memory overhead " + "may increase: {}", + URes); + } + }); + return USMPool; +} + AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() { [[maybe_unused]] ur_result_t Result; if (Host.LocalArgs) { diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index 122865bd11..f1e80dae56 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -112,7 +112,6 @@ struct ProgramInfo { std::atomic RefCount = 1; // Program is built only once, so we don't need to lock it - std::unordered_set> AllocInfoForGlobals; std::unordered_set InstrumentedKernels; explicit ProgramInfo(ur_program_handle_t Program) : Handle(Program) { @@ -132,6 +131,10 @@ struct ProgramInfo { struct ContextInfo { ur_context_handle_t Handle; + + ur_usm_pool_handle_t USMPool{}; + std::once_flag PoolInit; + std::atomic RefCount = 1; std::vector DeviceList; @@ -155,6 +158,8 @@ struct ContextInfo { AllocInfos.List.emplace_back(AI); } } + + ur_usm_pool_handle_t getUSMPool(); }; struct AsanRuntimeDataWrapper { diff --git a/source/loader/layers/sanitizer/asan/asan_shadow.cpp b/source/loader/layers/sanitizer/asan/asan_shadow.cpp index 8e2329ac06..de0679687b 100644 --- a/source/loader/layers/sanitizer/asan/asan_shadow.cpp +++ b/source/loader/layers/sanitizer/asan/asan_shadow.cpp @@ -108,11 +108,15 @@ ur_result_t ShadowMemoryGPU::Setup() { // TODO: Protect Bad Zone auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( Context, nullptr, ShadowSize, (void **)&ShadowBegin); - if (Result == UR_RESULT_SUCCESS) { - ShadowEnd = ShadowBegin + ShadowSize; - // Retain the context which reserves shadow memory - getContext()->urDdiTable.Context.pfnRetain(Context); + if (Result != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Shadow memory reserved failed with size {}: {}", + (void *)ShadowSize, Result); + return Result; } + ShadowEnd = ShadowBegin + ShadowSize; + // Retain the context which reserves shadow memory + getContext()->urDdiTable.Context.pfnRetain(Context); // Set shadow memory for null pointer // For GPU, wu use up to 1 page of shadow memory @@ -137,6 +141,24 @@ ur_result_t ShadowMemoryGPU::Destory() { Context, (void *)PrivateShadowOffset)); PrivateShadowOffset = 0; } + + static ur_result_t Result = [this]() { + const size_t PageSize = GetVirtualMemGranularity(Context, Device); + for (auto [MappedPtr, PhysicalMem] : VirtualMemMaps) { + UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap( + Context, (void *)MappedPtr, PageSize)); + UR_CALL( + getContext()->urDdiTable.PhysicalMem.pfnRelease(PhysicalMem)); + } + UR_CALL(getContext()->urDdiTable.VirtualMem.pfnFree( + Context, (const void *)ShadowBegin, GetShadowSize())); + UR_CALL(getContext()->urDdiTable.Context.pfnRelease(Context)); + return UR_RESULT_SUCCESS; + }(); + if (!Result) { + return Result; + } + if (LocalShadowOffset != 0) { UR_CALL(getContext()->urDdiTable.USM.pfnFree( Context, (void *)LocalShadowOffset)); @@ -205,19 +227,8 @@ ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue, return URes; } - VirtualMemMaps[MappedPtr].first = PhysicalMem; + VirtualMemMaps[MappedPtr] = PhysicalMem; } - - // We don't need to record virtual memory map for null pointer, - // since it doesn't have an alloc info. - if (Ptr == 0) { - continue; - } - - auto AllocInfoIt = - getAsanInterceptor()->findAllocInfoByAddress(Ptr); - assert(AllocInfoIt); - VirtualMemMaps[MappedPtr].second.insert((*AllocInfoIt)->second); } } @@ -235,35 +246,6 @@ ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue, return UR_RESULT_SUCCESS; } -ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr AI) { - uptr ShadowBegin = MemToShadow(AI->AllocBegin); - uptr ShadowEnd = MemToShadow(AI->AllocBegin + AI->AllocSize); - assert(ShadowBegin <= ShadowEnd); - - static const size_t PageSize = GetVirtualMemGranularity(Context, Device); - - for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); - MappedPtr <= ShadowEnd; MappedPtr += PageSize) { - std::scoped_lock Guard(VirtualMemMapsMutex); - if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) { - continue; - } - VirtualMemMaps[MappedPtr].second.erase(AI); - if (VirtualMemMaps[MappedPtr].second.empty()) { - UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap( - Context, (void *)MappedPtr, PageSize)); - UR_CALL(getContext()->urDdiTable.PhysicalMem.pfnRelease( - VirtualMemMaps[MappedPtr].first)); - getContext()->logger.debug("urVirtualMemUnmap: {} ~ {}", - (void *)MappedPtr, - (void *)(MappedPtr + PageSize - 1)); - VirtualMemMaps.erase(MappedPtr); - } - } - - return UR_RESULT_SUCCESS; -} - ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG, uptr &Begin, uptr &End) { diff --git a/source/loader/layers/sanitizer/asan/asan_shadow.hpp b/source/loader/layers/sanitizer/asan/asan_shadow.hpp index 0658a07925..48054378fe 100644 --- a/source/loader/layers/sanitizer/asan/asan_shadow.hpp +++ b/source/loader/layers/sanitizer/asan/asan_shadow.hpp @@ -35,10 +35,6 @@ struct ShadowMemory { virtual ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value) = 0; - virtual ur_result_t ReleaseShadow(std::shared_ptr) { - return UR_RESULT_SUCCESS; - } - virtual size_t GetShadowSize() = 0; virtual ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, @@ -98,8 +94,6 @@ struct ShadowMemoryGPU : public ShadowMemory { ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value) override final; - ur_result_t ReleaseShadow(std::shared_ptr AI) override final; - ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG, uptr &Begin, uptr &End) override final; @@ -108,10 +102,7 @@ struct ShadowMemoryGPU : public ShadowMemory { ur_mutex VirtualMemMapsMutex; - std::unordered_map< - uptr, std::pair>>> - VirtualMemMaps; + std::unordered_map VirtualMemMaps; uptr LocalShadowOffset = 0;