From dc588e483b4f44fee5467737e2b8bde3a9b4650c Mon Sep 17 00:00:00 2001
From: "Zhao, Maosu" <maosu.zhao@intel.com>
Date: Tue, 29 Oct 2024 19:37:38 -0700
Subject: [PATCH] [DeviceASAN] Re-use shadow if required size is not larger
 than last one

---
 .../layers/sanitizer/asan_interceptor.cpp     | 149 ++++--------------
 .../loader/layers/sanitizer/asan_shadow.cpp   | 106 +++++++++++--
 .../loader/layers/sanitizer/asan_shadow.hpp   |  32 ++++
 3 files changed, 163 insertions(+), 124 deletions(-)

diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp
index 4a315588fd..fbcc401909 100644
--- a/source/loader/layers/sanitizer/asan_interceptor.cpp
+++ b/source/loader/layers/sanitizer/asan_interceptor.cpp
@@ -682,28 +682,6 @@ ur_result_t SanitizerInterceptor::prepareLaunch(
                      LocalWorkSize[Dim];
         }
 
-        auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle,
-                                            Device = DeviceInfo->Handle,
-                                            Queue](size_t Size, uptr &Ptr) {
-            void *Allocated = nullptr;
-            auto URes = getContext()->urDdiTable.USM.pfnDeviceAlloc(
-                Context, Device, nullptr, nullptr, Size, &Allocated);
-            if (URes != UR_RESULT_SUCCESS) {
-                return URes;
-            }
-            // Initialize shadow memory
-            URes = EnqueueUSMBlockingSet(Queue, Allocated, 0, Size);
-            if (URes != UR_RESULT_SUCCESS) {
-                [[maybe_unused]] auto URes =
-                    getContext()->urDdiTable.USM.pfnFree(Context, Allocated);
-                assert(URes == UR_RESULT_SUCCESS &&
-                       "urUSMFree failed at allocating shadow memory");
-                Allocated = nullptr;
-            }
-            Ptr = (uptr)Allocated;
-            return URes;
-        };
-
         auto LocalMemoryUsage =
             GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle);
         auto PrivateMemoryUsage =
@@ -715,86 +693,45 @@ ur_result_t SanitizerInterceptor::prepareLaunch(
 
         // Write shadow memory offset for local memory
         if (getOptions().DetectLocals) {
-            // CPU needn't this
-            if (DeviceInfo->Type == DeviceType::GPU_PVC ||
-                DeviceInfo->Type == DeviceType::GPU_DG2) {
-                const size_t LocalMemorySize =
-                    GetDeviceLocalMemorySize(DeviceInfo->Handle);
-                const size_t LocalShadowMemorySize =
-                    (NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE;
-
-                getContext()->logger.debug(
-                    "LocalMemory(WorkGroup={}, LocalMemorySize={}, "
-                    "LocalShadowMemorySize={})",
-                    NumWG, LocalMemorySize, LocalShadowMemorySize);
-
-                if (EnqueueAllocateShadowMemory(
-                        LocalShadowMemorySize,
-                        LaunchInfo.Data->LocalShadowOffset) !=
-                    UR_RESULT_SUCCESS) {
-                    getContext()->logger.warning(
-                        "Failed to allocate shadow memory for local "
-                        "memory, maybe the number of workgroup ({}) is too "
-                        "large",
-                        NumWG);
-                    getContext()->logger.warning(
-                        "Skip checking local memory of kernel <{}>",
-                        GetKernelName(Kernel));
-                } else {
-                    LaunchInfo.Data->LocalShadowOffsetEnd =
-                        LaunchInfo.Data->LocalShadowOffset +
-                        LocalShadowMemorySize - 1;
-
-                    ContextInfo->Stats.UpdateShadowMalloced(
-                        LocalShadowMemorySize);
-
-                    getContext()->logger.info(
-                        "ShadowMemory(Local, {} - {})",
-                        (void *)LaunchInfo.Data->LocalShadowOffset,
-                        (void *)LaunchInfo.Data->LocalShadowOffsetEnd);
-                }
+            if (DeviceInfo->Shadow->AllocLocalShadow(
+                    Queue, NumWG, LaunchInfo.Data->LocalShadowOffset,
+                    LaunchInfo.Data->LocalShadowOffsetEnd) !=
+                UR_RESULT_SUCCESS) {
+                getContext()->logger.warning(
+                    "Failed to allocate shadow memory for local "
+                    "memory, maybe the number of workgroup ({}) is too "
+                    "large",
+                    NumWG);
+                getContext()->logger.warning(
+                    "Skip checking local memory of kernel <{}>",
+                    GetKernelName(Kernel));
+            } else {
+                getContext()->logger.info(
+                    "ShadowMemory(Local, WorkGroup{}, {} - {})", NumWG,
+                    (void *)LaunchInfo.Data->LocalShadowOffset,
+                    (void *)LaunchInfo.Data->LocalShadowOffsetEnd);
             }
         }
 
         // Write shadow memory offset for private memory
         if (getOptions().DetectPrivates) {
-            if (DeviceInfo->Type == DeviceType::CPU) {
-                LaunchInfo.Data->PrivateShadowOffset =
-                    DeviceInfo->Shadow->ShadowBegin;
-            } else if (DeviceInfo->Type == DeviceType::GPU_PVC ||
-                       DeviceInfo->Type == DeviceType::GPU_DG2) {
-                const size_t PrivateShadowMemorySize =
-                    (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE;
-
-                getContext()->logger.debug("PrivateMemory(WorkGroup={}, "
-                                           "PrivateShadowMemorySize={})",
-                                           NumWG, PrivateShadowMemorySize);
-
-                if (EnqueueAllocateShadowMemory(
-                        PrivateShadowMemorySize,
-                        LaunchInfo.Data->PrivateShadowOffset) !=
-                    UR_RESULT_SUCCESS) {
-                    getContext()->logger.warning(
-                        "Failed to allocate shadow memory for private "
-                        "memory, maybe the number of workgroup ({}) is too "
-                        "large",
-                        NumWG);
-                    getContext()->logger.warning(
-                        "Skip checking private memory of kernel <{}>",
-                        GetKernelName(Kernel));
-                } else {
-                    LaunchInfo.Data->PrivateShadowOffsetEnd =
-                        LaunchInfo.Data->PrivateShadowOffset +
-                        PrivateShadowMemorySize - 1;
-
-                    ContextInfo->Stats.UpdateShadowMalloced(
-                        PrivateShadowMemorySize);
-
-                    getContext()->logger.info(
-                        "ShadowMemory(Private, {} - {})",
-                        (void *)LaunchInfo.Data->PrivateShadowOffset,
-                        (void *)LaunchInfo.Data->PrivateShadowOffsetEnd);
-                }
+            if (DeviceInfo->Shadow->AllocPrivateShadow(
+                    Queue, NumWG, LaunchInfo.Data->PrivateShadowOffset,
+                    LaunchInfo.Data->PrivateShadowOffsetEnd) !=
+                UR_RESULT_SUCCESS) {
+                getContext()->logger.warning(
+                    "Failed to allocate shadow memory for private "
+                    "memory, maybe the number of workgroup ({}) is too "
+                    "large",
+                    NumWG);
+                getContext()->logger.warning(
+                    "Skip checking private memory of kernel <{}>",
+                    GetKernelName(Kernel));
+            } else {
+                getContext()->logger.info(
+                    "ShadowMemory(Private, WorkGroup{}, {} - {})", NumWG,
+                    (void *)LaunchInfo.Data->PrivateShadowOffset,
+                    (void *)LaunchInfo.Data->PrivateShadowOffsetEnd);
             }
         }
     } while (false);
@@ -878,25 +815,7 @@ ur_result_t USMLaunchInfo::updateKernelInfo(const KernelInfo &KI) {
 USMLaunchInfo::~USMLaunchInfo() {
     [[maybe_unused]] ur_result_t Result;
     if (Data) {
-        auto Type = GetDeviceType(Context, Device);
         auto ContextInfo = getContext()->interceptor->getContextInfo(Context);
-        if (Type == DeviceType::GPU_PVC || Type == DeviceType::GPU_DG2) {
-            if (Data->PrivateShadowOffset) {
-                ContextInfo->Stats.UpdateShadowFreed(
-                    Data->PrivateShadowOffsetEnd - Data->PrivateShadowOffset +
-                    1);
-                Result = getContext()->urDdiTable.USM.pfnFree(
-                    Context, (void *)Data->PrivateShadowOffset);
-                assert(Result == UR_RESULT_SUCCESS);
-            }
-            if (Data->LocalShadowOffset) {
-                ContextInfo->Stats.UpdateShadowFreed(
-                    Data->LocalShadowOffsetEnd - Data->LocalShadowOffset + 1);
-                Result = getContext()->urDdiTable.USM.pfnFree(
-                    Context, (void *)Data->LocalShadowOffset);
-                assert(Result == UR_RESULT_SUCCESS);
-            }
-        }
         if (Data->LocalArgs) {
             Result = getContext()->urDdiTable.USM.pfnFree(
                 Context, (void *)Data->LocalArgs);
diff --git a/source/loader/layers/sanitizer/asan_shadow.cpp b/source/loader/layers/sanitizer/asan_shadow.cpp
index 629ce3a491..f5800a694c 100644
--- a/source/loader/layers/sanitizer/asan_shadow.cpp
+++ b/source/loader/layers/sanitizer/asan_shadow.cpp
@@ -131,16 +131,23 @@ ur_result_t ShadowMemoryGPU::Setup() {
 }
 
 ur_result_t ShadowMemoryGPU::Destory() {
-    if (ShadowBegin == 0) {
-        return UR_RESULT_SUCCESS;
+    if (PrivateShadowOffset != 0) {
+        UR_CALL(getContext()->urDdiTable.USM.pfnFree(
+            Context, (void *)PrivateShadowOffset));
+        PrivateShadowOffset = 0;
     }
-    static ur_result_t Result = [this]() {
-        auto Result = getContext()->urDdiTable.VirtualMem.pfnFree(
-            Context, (const void *)ShadowBegin, GetShadowSize());
-        getContext()->urDdiTable.Context.pfnRelease(Context);
-        return Result;
-    }();
-    return Result;
+    if (LocalShadowOffset != 0) {
+        UR_CALL(getContext()->urDdiTable.USM.pfnFree(
+            Context, (void *)LocalShadowOffset));
+        LocalShadowOffset = 0;
+    }
+    if (ShadowBegin != 0) {
+        UR_CALL(getContext()->urDdiTable.VirtualMem.pfnFree(
+            Context, (const void *)ShadowBegin, GetShadowSize()));
+        UR_CALL(getContext()->urDdiTable.Context.pfnRelease(Context));
+        ShadowBegin = ShadowEnd = 0;
+    }
+    return UR_RESULT_SUCCESS;
 }
 
 ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue,
@@ -255,6 +262,123 @@ ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr<AllocInfo> AI) {
     return UR_RESULT_SUCCESS;
 }
 
+/// Hand out the shadow region covering work-group local memory for a launch.
+///
+/// Required size: (NumWG * device local memory size) >> ASAN_SHADOW_SCALE. A
+/// fresh device buffer is allocated and initialized only when that size exceeds
+/// LastAllocedSize; otherwise the buffer at LocalShadowOffset is reused.
+///
+/// \param Queue  queue on which a newly allocated buffer is initialized.
+/// \param NumWG  number of work-groups of the launch.
+/// \param Begin  [out] start address of the shadow region.
+/// \param End    [out] address of the last byte of the shadow region.
+/// \return UR_RESULT_SUCCESS on success; the EnqueueUSMBlockingSet error if
+///         initializing a new buffer fails (Begin/End are then not written).
+ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue,
+                                              uint32_t NumWG, uptr &Begin,
+                                              uptr &End) {
+    const size_t LocalMemorySize = GetDeviceLocalMemorySize(Device);
+    const size_t RequiredShadowSize =
+        (NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE;
+    // NOTE(review): this function-local static is shared by every
+    // ShadowMemoryGPU instance, whereas LocalShadowOffset is per instance;
+    // confirm that only one instance ever reaches this code.
+    static size_t LastAllocedSize = 0;
+    if (RequiredShadowSize > LastAllocedSize) {
+        auto ContextInfo = getContext()->interceptor->getContextInfo(Context);
+        if (LocalShadowOffset) {
+            UR_CALL(getContext()->urDdiTable.USM.pfnFree(
+                Context, (void *)LocalShadowOffset));
+            ContextInfo->Stats.UpdateShadowFreed(LastAllocedSize);
+            LocalShadowOffset = 0;
+            LastAllocedSize = 0;
+        }
+
+        UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc(
+            Context, Device, nullptr, nullptr, RequiredShadowSize,
+            (void **)&LocalShadowOffset));
+
+        // Initialize shadow memory
+        ur_result_t URes = EnqueueUSMBlockingSet(
+            Queue, (void *)LocalShadowOffset, 0, RequiredShadowSize);
+        if (URes != UR_RESULT_SUCCESS) {
+            UR_CALL(getContext()->urDdiTable.USM.pfnFree(
+                Context, (void *)LocalShadowOffset));
+            LocalShadowOffset = 0;
+            LastAllocedSize = 0;
+            // Report the failure instead of falling through: the buffer is
+            // gone, so it must not be counted or handed to the caller.
+            return URes;
+        }
+
+        ContextInfo->Stats.UpdateShadowMalloced(RequiredShadowSize);
+
+        LastAllocedSize = RequiredShadowSize;
+    }
+
+    Begin = LocalShadowOffset;
+    End = LocalShadowOffset + RequiredShadowSize - 1;
+    return UR_RESULT_SUCCESS;
+}
+
+/// Hand out the shadow region covering private memory for a launch.
+///
+/// Required size: (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE. A fresh
+/// device buffer is allocated and initialized only when that size exceeds
+/// LastAllocedSize; otherwise the buffer at PrivateShadowOffset is reused.
+///
+/// \param Queue  queue on which a newly allocated buffer is initialized.
+/// \param NumWG  number of work-groups of the launch.
+/// \param Begin  [out] start address of the shadow region.
+/// \param End    [out] address of the last byte of the shadow region.
+/// \return UR_RESULT_SUCCESS on success; the EnqueueUSMBlockingSet error if
+///         initializing a new buffer fails (Begin/End are then not written).
+ur_result_t ShadowMemoryGPU::AllocPrivateShadow(ur_queue_handle_t Queue,
+                                                uint32_t NumWG, uptr &Begin,
+                                                uptr &End) {
+    const size_t RequiredShadowSize =
+        (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE;
+    // NOTE(review): this function-local static is shared by every
+    // ShadowMemoryGPU instance, whereas PrivateShadowOffset is per instance;
+    // confirm that only one instance ever reaches this code.
+    static size_t LastAllocedSize = 0;
+    if (RequiredShadowSize > LastAllocedSize) {
+        auto ContextInfo = getContext()->interceptor->getContextInfo(Context);
+        if (PrivateShadowOffset) {
+            UR_CALL(getContext()->urDdiTable.USM.pfnFree(
+                Context, (void *)PrivateShadowOffset));
+            ContextInfo->Stats.UpdateShadowFreed(LastAllocedSize);
+            PrivateShadowOffset = 0;
+            LastAllocedSize = 0;
+        }
+
+        UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc(
+            Context, Device, nullptr, nullptr, RequiredShadowSize,
+            (void **)&PrivateShadowOffset));
+
+        // Initialize shadow memory
+        ur_result_t URes = EnqueueUSMBlockingSet(
+            Queue, (void *)PrivateShadowOffset, 0, RequiredShadowSize);
+        if (URes != UR_RESULT_SUCCESS) {
+            UR_CALL(getContext()->urDdiTable.USM.pfnFree(
+                Context, (void *)PrivateShadowOffset));
+            PrivateShadowOffset = 0;
+            LastAllocedSize = 0;
+            // Report the failure instead of falling through: the buffer is
+            // gone, so it must not be counted or handed to the caller.
+            return URes;
+        }
+
+        ContextInfo->Stats.UpdateShadowMalloced(RequiredShadowSize);
+
+        LastAllocedSize = RequiredShadowSize;
+    }
+
+    Begin = PrivateShadowOffset;
+    End = PrivateShadowOffset + RequiredShadowSize - 1;
+    return UR_RESULT_SUCCESS;
+}
+
 uptr ShadowMemoryPVC::MemToShadow(uptr Ptr) {
     if (Ptr & 0xFF00000000000000ULL) { // Device USM
         return ShadowBegin + 0x80000000000ULL +
diff --git a/source/loader/layers/sanitizer/asan_shadow.hpp b/source/loader/layers/sanitizer/asan_shadow.hpp
index 7ae095062a..d6d6e634e6 100644
--- a/source/loader/layers/sanitizer/asan_shadow.hpp
+++ b/source/loader/layers/sanitizer/asan_shadow.hpp
@@ -39,6 +39,14 @@ struct ShadowMemory {
 
     virtual size_t GetShadowSize() = 0;
 
+    virtual ur_result_t AllocLocalShadow(ur_queue_handle_t Queue,
+                                         uint32_t NumWG, uptr &Begin,
+                                         uptr &End) = 0;
+
+    virtual ur_result_t AllocPrivateShadow(ur_queue_handle_t Queue,
+                                           uint32_t NumWG, uptr &Begin,
+                                           uptr &End) = 0;
+
     ur_context_handle_t Context{};
 
     ur_device_handle_t Device{};
@@ -62,6 +70,20 @@ struct ShadowMemoryCPU final : public ShadowMemory {
                                     uptr Size, u8 Value) override;
 
     size_t GetShadowSize() override { return 0x80000000000ULL; }
+
+    ur_result_t AllocLocalShadow(ur_queue_handle_t, uint32_t, uptr &Begin,
+                                 uptr &End) override {
+        Begin = ShadowBegin;
+        End = ShadowEnd;
+        return UR_RESULT_SUCCESS;
+    }
+
+    ur_result_t AllocPrivateShadow(ur_queue_handle_t, uint32_t, uptr &Begin,
+                                   uptr &End) override {
+        Begin = ShadowBegin;
+        End = ShadowEnd;
+        return UR_RESULT_SUCCESS;
+    }
 };
 
 struct ShadowMemoryGPU : public ShadowMemory {
@@ -76,12 +98,22 @@ struct ShadowMemoryGPU : public ShadowMemory {
 
     ur_result_t ReleaseShadow(std::shared_ptr<AllocInfo> AI) override final;
 
+    ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG,
+                                 uptr &Begin, uptr &End) override final;
+
+    ur_result_t AllocPrivateShadow(ur_queue_handle_t Queue, uint32_t NumWG,
+                                   uptr &Begin, uptr &End) override final;
+
     ur_mutex VirtualMemMapsMutex;
 
     std::unordered_map<
         uptr, std::pair<ur_physical_mem_handle_t,
                         std::unordered_set<std::shared_ptr<AllocInfo>>>>
         VirtualMemMaps;
+
+    uptr LocalShadowOffset = 0;
+
+    uptr PrivateShadowOffset = 0;
 };
 
 /// Shadow Memory layout of GPU PVC device