From b209eba86e4adf5deefa89603eec80d63ee05b8a Mon Sep 17 00:00:00 2001
From: "Zhao, Maosu" <maosu.zhao@intel.com>
Date: Wed, 27 Nov 2024 00:48:27 -0800
Subject: [PATCH] [DevASAN] Do allocation with USM pool to reduce memory
 overhead

Release mapped physical memory according to its dependency may cause
some problems. So, we decide to use USM pool to do allocation to
reduce memory overhead.
---
 .../sanitizer/asan/asan_interceptor.cpp       | 54 +++++++-------
 .../sanitizer/asan/asan_interceptor.hpp       |  7 +-
 .../layers/sanitizer/asan/asan_shadow.cpp     | 72 +++++++------------
 .../layers/sanitizer/asan/asan_shadow.hpp     | 11 +--
 4 files changed, 64 insertions(+), 80 deletions(-)

diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
index 8d2a9e5ee2..1a1185e1ba 100644
--- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
@@ -36,7 +36,8 @@ AsanInterceptor::~AsanInterceptor() {
     // We must release these objects before releasing adapters, since
     // they may use the adapter in their destructor
     for (const auto &[_, DeviceInfo] : m_DeviceMap) {
-        DeviceInfo->Shadow->Destory();
+        [[maybe_unused]] auto URes = DeviceInfo->Shadow->Destory();
+        assert(URes == UR_RESULT_SUCCESS);
     }
 
     m_Quarantine = nullptr;
@@ -96,6 +97,10 @@ ur_result_t AsanInterceptor::allocateMemory(ur_context_handle_t Context,
 
     void *Allocated = nullptr;
 
+    if (Pool == nullptr) {
+        Pool = ContextInfo->getUSMPool();
+    }
+
     if (Type == AllocType::DEVICE_USM) {
         UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc(
             Context, Device, Properties, Pool, NeededSize, &Allocated));
@@ -228,16 +233,6 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context,
             ContextInfo->Stats.UpdateUSMRealFreed(
                 ToFreeAllocInfo->AllocSize, ToFreeAllocInfo->getRedzoneSize());
 
-            if (ToFreeAllocInfo->Type == AllocType::HOST_USM) {
-                for (auto &Device : ContextInfo->DeviceList) {
-                    UR_CALL(getDeviceInfo(Device)->Shadow->ReleaseShadow(
-                        ToFreeAllocInfo));
-                }
-            } else {
-                UR_CALL(getDeviceInfo(ToFreeAllocInfo->Device)
-                            ->Shadow->ReleaseShadow(ToFreeAllocInfo));
-            }
-
             UR_CALL(getContext()->urDdiTable.USM.pfnFree(
                 Context, (void *)(ToFreeAllocInfo->AllocBegin)));
 
@@ -436,12 +431,6 @@ ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) {
     auto ProgramInfo = getProgramInfo(Program);
     assert(ProgramInfo != nullptr && "unregistered program!");
 
-    for (auto AI : ProgramInfo->AllocInfoForGlobals) {
-        UR_CALL(getDeviceInfo(AI->Device)->Shadow->ReleaseShadow(AI));
-        m_AllocationMap.erase(AI->AllocBegin);
-    }
-    ProgramInfo->AllocInfoForGlobals.clear();
-
     ProgramInfo->InstrumentedKernels.clear();
 
     return UR_RESULT_SUCCESS;
@@ -560,10 +549,6 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) {
                           {}});
 
             ContextInfo->insertAllocInfo({Device}, AI);
-            ProgramInfo->AllocInfoForGlobals.emplace(AI);
-
-            std::scoped_lock<ur_shared_mutex> Guard(m_AllocationMapMutex);
-            m_AllocationMap.emplace(AI->AllocBegin, std::move(AI));
         }
     }
 
@@ -887,9 +872,14 @@ bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const {
 ContextInfo::~ContextInfo() {
     Stats.Print(Handle);
 
-    [[maybe_unused]] auto Result =
-        getContext()->urDdiTable.Context.pfnRelease(Handle);
-    assert(Result == UR_RESULT_SUCCESS);
+    [[maybe_unused]] ur_result_t URes;
+    if (USMPool) {
+        URes = getContext()->urDdiTable.USM.pfnPoolRelease(USMPool);
+        assert(URes == UR_RESULT_SUCCESS);
+    }
+
+    URes = getContext()->urDdiTable.Context.pfnRelease(Handle);
+    assert(URes == UR_RESULT_SUCCESS);
 
     // check memory leaks
     if (getAsanInterceptor()->getOptions().DetectLeaks &&
@@ -905,6 +895,22 @@ ContextInfo::~ContextInfo() {
     }
 }
 
+ur_usm_pool_handle_t ContextInfo::getUSMPool() {
+    std::call_once(PoolInit, [this]() {
+        ur_usm_pool_desc_t Desc{UR_STRUCTURE_TYPE_USM_POOL_DESC, nullptr, 0};
+        auto URes =
+            getContext()->urDdiTable.USM.pfnPoolCreate(Handle, &Desc, &USMPool);
+        if (URes != UR_RESULT_SUCCESS &&
+            URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+            getContext()->logger.warning(
+                "Failed to create USM pool, the memory overhead "
+                "may increase: {}",
+                URes);
+        }
+    });
+    return USMPool;
+}
+
 AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() {
     [[maybe_unused]] ur_result_t Result;
     if (Host.LocalArgs) {
diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp
index 122865bd11..f1e80dae56 100644
--- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp
+++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp
@@ -112,7 +112,6 @@ struct ProgramInfo {
     std::atomic<int32_t> RefCount = 1;
 
     // Program is built only once, so we don't need to lock it
-    std::unordered_set<std::shared_ptr<AllocInfo>> AllocInfoForGlobals;
     std::unordered_set<std::string> InstrumentedKernels;
 
     explicit ProgramInfo(ur_program_handle_t Program) : Handle(Program) {
@@ -132,6 +131,10 @@ struct ProgramInfo {
 
 struct ContextInfo {
     ur_context_handle_t Handle;
+
+    ur_usm_pool_handle_t USMPool{};
+    std::once_flag PoolInit;
+
     std::atomic<int32_t> RefCount = 1;
 
     std::vector<ur_device_handle_t> DeviceList;
@@ -155,6 +158,8 @@ struct ContextInfo {
             AllocInfos.List.emplace_back(AI);
         }
     }
+
+    ur_usm_pool_handle_t getUSMPool();
 };
 
 struct AsanRuntimeDataWrapper {
diff --git a/source/loader/layers/sanitizer/asan/asan_shadow.cpp b/source/loader/layers/sanitizer/asan/asan_shadow.cpp
index 8e2329ac06..de0679687b 100644
--- a/source/loader/layers/sanitizer/asan/asan_shadow.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_shadow.cpp
@@ -108,11 +108,15 @@ ur_result_t ShadowMemoryGPU::Setup() {
         // TODO: Protect Bad Zone
         auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve(
             Context, nullptr, ShadowSize, (void **)&ShadowBegin);
-        if (Result == UR_RESULT_SUCCESS) {
-            ShadowEnd = ShadowBegin + ShadowSize;
-            // Retain the context which reserves shadow memory
-            getContext()->urDdiTable.Context.pfnRetain(Context);
+        if (Result != UR_RESULT_SUCCESS) {
+            getContext()->logger.error(
+                "Shadow memory reserved failed with size {}: {}",
+                (void *)ShadowSize, Result);
+            return Result;
         }
+        ShadowEnd = ShadowBegin + ShadowSize;
+        // Retain the context which reserves shadow memory
+        getContext()->urDdiTable.Context.pfnRetain(Context);
 
         // Set shadow memory for null pointer
         // For GPU, wu use up to 1 page of shadow memory
@@ -137,6 +141,24 @@ ur_result_t ShadowMemoryGPU::Destory() {
             Context, (void *)PrivateShadowOffset));
         PrivateShadowOffset = 0;
     }
+
+    static ur_result_t Result = [this]() {
+        const size_t PageSize = GetVirtualMemGranularity(Context, Device);
+        for (auto [MappedPtr, PhysicalMem] : VirtualMemMaps) {
+            UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap(
+                Context, (void *)MappedPtr, PageSize));
+            UR_CALL(
+                getContext()->urDdiTable.PhysicalMem.pfnRelease(PhysicalMem));
+        }
+        UR_CALL(getContext()->urDdiTable.VirtualMem.pfnFree(
+            Context, (const void *)ShadowBegin, GetShadowSize()));
+        UR_CALL(getContext()->urDdiTable.Context.pfnRelease(Context));
+        return UR_RESULT_SUCCESS;
+    }();
+    if (!Result) {
+        return Result;
+    }
+
     if (LocalShadowOffset != 0) {
         UR_CALL(getContext()->urDdiTable.USM.pfnFree(
             Context, (void *)LocalShadowOffset));
@@ -205,19 +227,8 @@ ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue,
                     return URes;
                 }
 
-                VirtualMemMaps[MappedPtr].first = PhysicalMem;
+                VirtualMemMaps[MappedPtr] = PhysicalMem;
             }
-
-            // We don't need to record virtual memory map for null pointer,
-            // since it doesn't have an alloc info.
-            if (Ptr == 0) {
-                continue;
-            }
-
-            auto AllocInfoIt =
-                getAsanInterceptor()->findAllocInfoByAddress(Ptr);
-            assert(AllocInfoIt);
-            VirtualMemMaps[MappedPtr].second.insert((*AllocInfoIt)->second);
         }
     }
 
@@ -235,35 +246,6 @@ ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue,
     return UR_RESULT_SUCCESS;
 }
 
-ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr<AllocInfo> AI) {
-    uptr ShadowBegin = MemToShadow(AI->AllocBegin);
-    uptr ShadowEnd = MemToShadow(AI->AllocBegin + AI->AllocSize);
-    assert(ShadowBegin <= ShadowEnd);
-
-    static const size_t PageSize = GetVirtualMemGranularity(Context, Device);
-
-    for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize);
-         MappedPtr <= ShadowEnd; MappedPtr += PageSize) {
-        std::scoped_lock<ur_mutex> Guard(VirtualMemMapsMutex);
-        if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) {
-            continue;
-        }
-        VirtualMemMaps[MappedPtr].second.erase(AI);
-        if (VirtualMemMaps[MappedPtr].second.empty()) {
-            UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap(
-                Context, (void *)MappedPtr, PageSize));
-            UR_CALL(getContext()->urDdiTable.PhysicalMem.pfnRelease(
-                VirtualMemMaps[MappedPtr].first));
-            getContext()->logger.debug("urVirtualMemUnmap: {} ~ {}",
-                                       (void *)MappedPtr,
-                                       (void *)(MappedPtr + PageSize - 1));
-            VirtualMemMaps.erase(MappedPtr);
-        }
-    }
-
-    return UR_RESULT_SUCCESS;
-}
-
 ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue,
                                               uint32_t NumWG, uptr &Begin,
                                               uptr &End) {
diff --git a/source/loader/layers/sanitizer/asan/asan_shadow.hpp b/source/loader/layers/sanitizer/asan/asan_shadow.hpp
index 0658a07925..48054378fe 100644
--- a/source/loader/layers/sanitizer/asan/asan_shadow.hpp
+++ b/source/loader/layers/sanitizer/asan/asan_shadow.hpp
@@ -35,10 +35,6 @@ struct ShadowMemory {
     virtual ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr,
                                             uptr Size, u8 Value) = 0;
 
-    virtual ur_result_t ReleaseShadow(std::shared_ptr<AllocInfo>) {
-        return UR_RESULT_SUCCESS;
-    }
-
     virtual size_t GetShadowSize() = 0;
 
     virtual ur_result_t AllocLocalShadow(ur_queue_handle_t Queue,
@@ -98,8 +94,6 @@ struct ShadowMemoryGPU : public ShadowMemory {
     ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr,
                                     uptr Size, u8 Value) override final;
 
-    ur_result_t ReleaseShadow(std::shared_ptr<AllocInfo> AI) override final;
-
     ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG,
                                  uptr &Begin, uptr &End) override final;
 
@@ -108,10 +102,7 @@ struct ShadowMemoryGPU : public ShadowMemory {
 
     ur_mutex VirtualMemMapsMutex;
 
-    std::unordered_map<
-        uptr, std::pair<ur_physical_mem_handle_t,
-                        std::unordered_set<std::shared_ptr<AllocInfo>>>>
-        VirtualMemMaps;
+    std::unordered_map<uptr, ur_physical_mem_handle_t> VirtualMemMaps;
 
     uptr LocalShadowOffset = 0;