From a2085420f4ee3522f997ade3caff4fa4bf330d17 Mon Sep 17 00:00:00 2001
From: Martin
Date: Tue, 3 Oct 2023 14:32:17 +0100
Subject: [PATCH 1/8] Replace uses of the global-offset intrinsic

The uses of each global-offset intrinsic are fully removed from the
kernel when -enable-global-offset=false. Afterwards, the offset
intrinsic is removed as well.
---
 llvm/lib/SYCLLowerIR/GlobalOffset.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
index 05e1a7431975..bac76bf3f09c 100644
--- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
+++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -17,6 +17,7 @@
 #include "llvm/SYCLLowerIR/TargetHelpers.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "iostream"
 
 using namespace llvm;
 
@@ -61,9 +62,6 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
 
 // New PM implementation.
 PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
-  if (!EnableGlobalOffset)
-    return PreservedAnalyses::all();
-
   AT = TargetHelpers::getArchType(M);
   Function *ImplicitOffsetIntrinsic = M.getFunction(Intrinsic::getName(
       AT == ArchType::Cuda
@@ -73,6 +71,12 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
   if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty())
     return PreservedAnalyses::all();
 
+  if (!EnableGlobalOffset) {
+    ImplicitOffsetIntrinsic->replaceAllUsesWith(
+        Constant::getNullValue(ImplicitOffsetIntrinsic->getType()));
+    ImplicitOffsetIntrinsic->eraseFromParent();
+    return PreservedAnalyses::none();
+  }
   // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
   // happy with ADDRESS_SPACE_GENERIC (0).
   TargetAS = AT == ArchType::Cuda ? 0 : 5;
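
A note for readers following the series: replaceAllUsesWith on a Function
rewrites every use of the function *value*, including the callee operand of
each call, so this first version leaves the calls (now through a null
pointer) plus their dependent GEPs and loads behind. A minimal sketch of the
pattern used above, assuming a Module M in scope and using the NVPTX
intrinsic name for illustration:

    // Sketch only: RAUW the declaration with a null pointer of its type,
    // then drop the declaration itself. Dependent GEPs/loads survive.
    if (Function *F = M.getFunction("llvm.nvvm.implicit.offset")) {
      F->replaceAllUsesWith(Constant::getNullValue(F->getType()));
      F->eraseFromParent();
    }

Patch 2 below reworks this so that the whole use chain is deleted instead.
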
From 1ed65a7f9d03fc7d307b838cb9d102ffb6a35c10 Mon Sep 17 00:00:00 2001
From: Martin
Date: Tue, 10 Oct 2023 14:54:00 +0100
Subject: [PATCH 2/8] Fix global offset removal: remove GEPs and Loads

Instead of simply replacing all uses of the global-offset intrinsic,
remove each of its CallInsts together with the GEPs and Loads that use
them. Uses of each Load are replaced by constant zeros. This ensures
that no dangling use fragments are left in the kernel after the
global-offset pass completes.
---
 llvm/lib/SYCLLowerIR/GlobalOffset.cpp | 85 ++++++++++++++++++---------
 1 file changed, 57 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
index bac76bf3f09c..a0bdc591bad3 100644
--- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
+++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -17,7 +17,6 @@
 #include "llvm/SYCLLowerIR/TargetHelpers.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "iostream"
 
 using namespace llvm;
 
@@ -72,38 +71,68 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
     return PreservedAnalyses::all();
 
   if (!EnableGlobalOffset) {
-    ImplicitOffsetIntrinsic->replaceAllUsesWith(
-        Constant::getNullValue(ImplicitOffsetIntrinsic->getType()));
-    ImplicitOffsetIntrinsic->eraseFromParent();
-    return PreservedAnalyses::none();
-  }
-  // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
-  // happy with ADDRESS_SPACE_GENERIC (0).
-  TargetAS = AT == ArchType::Cuda ? 0 : 5;
-  /// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen
-  /// to be 3, use it for the implicit argument pointer type.
-  KernelImplicitArgumentType =
-      ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
-  ImplicitOffsetPtrType =
-      Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
-  assert((ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
-         "Implicit offset intrinsic does not return the expected type");
-
-  SmallVector<TargetHelpers::KernelPayload, 4> KernelPayloads;
-  TargetHelpers::populateKernels(M, KernelPayloads, AT);
-
-  // Validate kernels and populate entry map
-  EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
+    SmallVector<CallInst *, 4> Worklist;
+    SmallVector<LoadInst *, 4> LI;
+    SmallVector<Instruction *, 4> PtrUses;
+
+    std::function<void(Instruction *)> getLoads = [&](Instruction *P) {
+      PtrUses.push_back(P);
+      if (isa<LoadInst>(*P))
+        LI.push_back(cast<LoadInst>(P));
+      else {
+        for (Value *V : P->users()) {
+          assert(isa<GetElementPtrInst>(*V) || isa<LoadInst>(*V));
+          getLoads(cast<Instruction>(V));
+        }
+      }
+    };
+    for (Value *V : ImplicitOffsetIntrinsic->users()) {
+      assert(isa<CallInst>(*V));
+      Worklist.push_back(cast<CallInst>(V));
+      for (Value *V2 : V->users()) {
+        assert(isa<GetElementPtrInst>(*V2) || isa<LoadInst>(*V2));
+        getLoads(cast<Instruction>(V2));
+      }
+    }
+    for (LoadInst *L : LI)
+      L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
 
-  // Add implicit parameters to all direct and indirect users of the offset
-  addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
+    for (auto *I : reverse(PtrUses))
+      I->eraseFromParent();
 
-  // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
-  // it.
+    for (CallInst *CI : Worklist) {
+      Instruction *I = cast<Instruction>(CI);
+      I->eraseFromParent();
+    }
+  } else {
+    // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
+    // happy with ADDRESS_SPACE_GENERIC (0).
+    TargetAS = AT == ArchType::Cuda ? 0 : 5;
+    /// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen
+    /// to be 3, use it for the implicit argument pointer type.
+    KernelImplicitArgumentType =
+        ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
+    ImplicitOffsetPtrType =
+        Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
+    assert(
+        (ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
+        "Implicit offset intrinsic does not return the expected type");
+
+    SmallVector<TargetHelpers::KernelPayload, 4> KernelPayloads;
+    TargetHelpers::populateKernels(M, KernelPayloads, AT);
+
+    // Validate kernels and populate entry map
+    EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
+
+    // Add implicit parameters to all direct and indirect users of the offset
+    addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
+
+    // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
+    // it.
+  }
   assert(ImplicitOffsetIntrinsic->use_empty() &&
          "Not all uses of intrinsic removed");
   ImplicitOffsetIntrinsic->eraseFromParent();
-
   return PreservedAnalyses::none();
 }
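
The deletion order here matters: loads are rewritten to zero first, and the
pointer-producing instructions are only erased afterwards, leaves before
roots. Hand-expanded for a single call -> GEP -> load chain (Call, GEP and
Load are hypothetical pointers to the three instructions, not names from the
patch):

    // Detach the load's users, then erase leaf-to-root so nothing is
    // deleted while it still has users.
    Load->replaceAllUsesWith(ConstantInt::get(Load->getType(), 0));
    Load->eraseFromParent(); // leaf goes first
    GEP->eraseFromParent();  // its only user is gone now
    Call->eraseFromParent(); // root call goes last
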
From c9d9d1feed7e7339ca6be10cbd81c054a4f3cc00 Mon Sep 17 00:00:00 2001
From: Martin
Date: Wed, 11 Oct 2023 15:30:52 +0100
Subject: [PATCH 3/8] Add test cases for implicit offset optimization

The test cases for the AMDGPU and NVPTX targets check that all GEPs,
Loads, and the implicit-offset intrinsic have been removed.
---
 .../CodeGen/AMDGPU/global-offset-removal.ll   | 18 ++++++++++++++++++
 .../CodeGen/NVPTX/global-offset-removal.ll    | 19 +++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/global-offset-removal.ll

diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
new file mode 100644
index 000000000000..cf9570a06317
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
@@ -0,0 +1,18 @@
+; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
+
+; This test checks that the implicit offset intrinsic is correctly removed
+
+declare ptr addrspace(5) @llvm.amdgcn.implicit.offset()
+; CHECK-NOT: llvm.amdgcn.implicit.offset
+
+define weak_odr dso_local i64 @_ZTS14example_kernel() {
+entry:
+; CHECK-NOT: @llvm.amdgcn.implicit.offset()
+; CHECK-NOT: getelementptr
+; CHECK-NOT: load
+  %0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
+  %1 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 1
+  %2 = load i32, ptr addrspace(5) %1, align 4
+  %3 = zext i32 %2 to i64
+  ret i64 %3
+}
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-removal.ll b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
new file mode 100644
index 000000000000..70d4d0150391
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
@@ -0,0 +1,19 @@
+; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
+target triple = "nvptx64-nvidia-cuda"
+
+; This test checks that the implicit offset intrinsic is correctly removed
+
+declare ptr @llvm.nvvm.implicit.offset()
+; CHECK-NOT: llvm.nvvm.implicit.offset
+
+define weak_odr dso_local i64 @_ZTS14example_kernel() {
+entry:
+; CHECK-NOT: @llvm.nvvm.implicit.offset()
+; CHECK-NOT: getelementptr
+; CHECK-NOT: load
+  %0 = tail call ptr @llvm.nvvm.implicit.offset()
+  %1 = getelementptr inbounds i32, ptr %0, i64 1
+  %2 = load i32, ptr %1, align 4
+  %3 = zext i32 %2 to i64
+  ret i64 %3
+}
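
The RUN lines drive the pass through opt with the legacy wrapper; the same
pass can also be exercised from C++ with the new pass manager. A hedged
sketch using standard new-PM boilerplate (GlobalOffsetPass is assumed to
come from llvm/SYCLLowerIR/GlobalOffset.h; the removal path additionally
needs the cl::opt -enable-global-offset=false, e.g. set via
cl::ParseCommandLineOptions, which this snippet does not show):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/SYCLLowerIR/GlobalOffset.h"
    using namespace llvm;

    void runGlobalOffset(Module &M) {
      PassBuilder PB;
      LoopAnalysisManager LAM;
      FunctionAnalysisManager FAM;
      CGSCCAnalysisManager CGAM;
      ModuleAnalysisManager MAM;
      // Wire up the four analysis managers before running any pass.
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      ModulePassManager MPM;
      MPM.addPass(GlobalOffsetPass());
      MPM.run(M, MAM);
    }
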
From e7ade029c1ac0929d67ce88993b59779e14d3837 Mon Sep 17 00:00:00 2001
From: Martin
Date: Mon, 16 Oct 2023 16:29:24 +0100
Subject: [PATCH 4/8] Apply suggestions

---
 llvm/lib/SYCLLowerIR/GlobalOffset.cpp         | 39 +++++++++----------
 .../CodeGen/NVPTX/global-offset-removal.ll    |  2 +
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
index a0bdc591bad3..d54ca34f7c02 100644
--- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
+++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -59,6 +59,19 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
   return new GlobalOffsetLegacy();
 }
 
+void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
+              SmallVectorImpl<LoadInst *> &Loads) {
+  Traversed.push_back(P);
+  auto *L = dyn_cast<LoadInst>(P);
+  if (L)
+    Loads.push_back(L);
+  else {
+    assert(isa<GetElementPtrInst>(*P));
+    for (Value *V : P->users())
+      getLoads(cast<Instruction>(V), Traversed, Loads);
+  }
+}
+
 // New PM implementation.
 PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
   AT = TargetHelpers::getArchType(M);
@@ -75,24 +88,10 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
     SmallVector<LoadInst *, 4> LI;
     SmallVector<Instruction *, 4> PtrUses;
 
-    std::function<void(Instruction *)> getLoads = [&](Instruction *P) {
-      PtrUses.push_back(P);
-      if (isa<LoadInst>(*P))
-        LI.push_back(cast<LoadInst>(P));
-      else {
-        for (Value *V : P->users()) {
-          assert(isa<GetElementPtrInst>(*V) || isa<LoadInst>(*V));
-          getLoads(cast<Instruction>(V));
-        }
-      }
-    };
     for (Value *V : ImplicitOffsetIntrinsic->users()) {
-      assert(isa<CallInst>(*V));
       Worklist.push_back(cast<CallInst>(V));
-      for (Value *V2 : V->users()) {
-        assert(isa<GetElementPtrInst>(*V2) || isa<LoadInst>(*V2));
-        getLoads(cast<Instruction>(V2));
-      }
+      for (Value *V2 : V->users())
+        getLoads(cast<Instruction>(V2), PtrUses, LI);
     }
     for (LoadInst *L : LI)
       L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
@@ -101,7 +100,7 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
       I->eraseFromParent();
 
     for (CallInst *CI : Worklist) {
-      Instruction *I = cast<Instruction>(CI);
+      auto *I = cast<Instruction>(CI);
       I->eraseFromParent();
     }
   } else {
@@ -126,10 +125,10 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
     // Add implicit parameters to all direct and indirect users of the offset
     addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
-
-    // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
-    // it.
   }
+
+  // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
+  // it.
   assert(ImplicitOffsetIntrinsic->use_empty() &&
          "Not all uses of intrinsic removed");
   ImplicitOffsetIntrinsic->eraseFromParent();
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-removal.ll b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
index 70d4d0150391..da116feede47 100644
--- a/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
+++ b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
@@ -11,6 +11,8 @@ entry:
 ; CHECK-NOT: @llvm.nvvm.implicit.offset()
 ; CHECK-NOT: getelementptr
 ; CHECK-NOT: load
+; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
+; CHECK: ret i{{[0-9]+}} [[REG]]
   %0 = tail call ptr @llvm.nvvm.implicit.offset()
   %1 = getelementptr inbounds i32, ptr %0, i64 1
   %2 = load i32, ptr %1, align 4

From 638c4c4a9b64ce3b60193a46321b3dd02635439c Mon Sep 17 00:00:00 2001
From: Martin
Date: Mon, 16 Oct 2023 16:34:46 +0100
Subject: [PATCH 5/8] Add zext check to AMDGPU global offset test

llvm/test/CodeGen/NVPTX/global-offset-removal.ll and
llvm/test/CodeGen/AMDGPU/global-offset-removal.ll are not supposed to
pass when the optimization simply deletes the Load users instead of
replacing their results with constant zeros.
---
 llvm/test/CodeGen/AMDGPU/global-offset-removal.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
index cf9570a06317..ccc06cf248a6 100644
--- a/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
@@ -10,6 +10,8 @@ entry:
 ; CHECK-NOT: @llvm.amdgcn.implicit.offset()
 ; CHECK-NOT: getelementptr
 ; CHECK-NOT: load
+; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
+; CHECK: ret i{{[0-9]+}} [[REG]]
   %0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
   %1 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 1
   %2 = load i32, ptr addrspace(5) %1, align 4
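
The zext CHECK lines exist because the pass deliberately folds each load's
result to zero rather than deleting the load's consumers; a transform that
silently dropped those consumers would still satisfy the CHECK-NOTs alone.
As an aside, the same cleanup could be phrased with LLVM's dead-code helper
instead of the manual reverse walk, assuming the offset intrinsic carries no
side effects (otherwise its call would not count as trivially dead). This is
an alternative sketch, not what the patches do:

    #include "llvm/Transforms/Utils/Local.h"

    // After the RAUW each load is trivially dead; deleting it recursively
    // also deletes the feeding GEP and call once they lose their last user.
    for (LoadInst *L : LI) {
      L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
      RecursivelyDeleteTriviallyDeadInstructions(L);
    }
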
From 96bdba986ed35931e643a3c3c8a06919223677f8 Mon Sep 17 00:00:00 2001
From: Martin
Date: Thu, 26 Oct 2023 15:15:47 +0100
Subject: [PATCH 6/8] Add comment to clarify usage of reverse function

---
 llvm/lib/SYCLLowerIR/GlobalOffset.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
index d54ca34f7c02..771ae595d074 100644
--- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
+++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -96,6 +96,8 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
     for (LoadInst *L : LI)
       L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
 
+    // PtrUses is returned by `getLoads` in topological order.
+    // Walk it backwards so we don't violate users
     for (auto *I : reverse(PtrUses))
       I->eraseFromParent();
 

From be1ccba397139154c671b45e73992a7a0559734d Mon Sep 17 00:00:00 2001
From: Martin
Date: Fri, 27 Oct 2023 10:27:08 +0100
Subject: [PATCH 7/8] Apply suggestions

---
 llvm/lib/SYCLLowerIR/GlobalOffset.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
index 771ae595d074..e9191d64d1ba 100644
--- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
+++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -59,11 +59,11 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
   return new GlobalOffsetLegacy();
 }
 
-void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
+// Recursive helper function to collect Loads from GEPs in a DFS fashion.
+static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
               SmallVectorImpl<LoadInst *> &Loads) {
   Traversed.push_back(P);
-  auto *L = dyn_cast<LoadInst>(P);
-  if (L)
+  if (auto *L = dyn_cast<LoadInst>(P)) // Base case for recursion
     Loads.push_back(L);
   else {
     assert(isa<GetElementPtrInst>(*P));
@@ -88,19 +88,24 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
     SmallVector<LoadInst *, 4> LI;
     SmallVector<Instruction *, 4> PtrUses;
 
+    // Collect all GEPs and Loads from the intrinsic's CallInsts.
     for (Value *V : ImplicitOffsetIntrinsic->users()) {
       Worklist.push_back(cast<CallInst>(V));
       for (Value *V2 : V->users())
         getLoads(cast<Instruction>(V2), PtrUses, LI);
     }
+
+    // Replace each use of a collected Load with a constant 0.
     for (LoadInst *L : LI)
       L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
 
+    // Remove all collected Loads and GEPs from the kernel.
     // PtrUses is returned by `getLoads` in topological order.
-    // Walk it backwards so we don't violate users
+    // Walk it backwards so we don't violate users.
    for (auto *I : reverse(PtrUses))
       I->eraseFromParent();
 
+    // Remove all collected CallInsts from the kernel.
     for (CallInst *CI : Worklist) {
       auto *I = cast<Instruction>(CI);
       I->eraseFromParent();

From 0fef3398af30b75d9d7baf9796c7593da7dc6894 Mon Sep 17 00:00:00 2001
From: Martin
Date: Fri, 27 Oct 2023 10:39:58 +0100
Subject: [PATCH 8/8] Fix formatting

---
 llvm/lib/SYCLLowerIR/GlobalOffset.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
index e9191d64d1ba..02e6ed3c32f2 100644
--- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
+++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -59,9 +59,9 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
   return new GlobalOffsetLegacy();
 }
 
-// Recursive helper function to collect Loads from GEPs in a DFS fashion. 
+// Recursive helper function to collect Loads from GEPs in a DFS fashion.
 static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
-              SmallVectorImpl<LoadInst *> &Loads) {
+                     SmallVectorImpl<LoadInst *> &Loads) {
   Traversed.push_back(P);
   if (auto *L = dyn_cast<LoadInst>(P)) // Base case for recursion
     Loads.push_back(L);
   else {
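
After the final patch, getLoads records every visited instruction in
Traversed in pre-order: a pointer producer always precedes the GEPs and
loads that use it, which is exactly what makes the reverse() erase walk
safe. A standalone illustration of that invariant, with hypothetical names
(Order stands in for Traversed):

    // Pre-order DFS: each instruction is recorded before any of its users,
    // so erasing in reverse order removes users before their producers.
    static void collect(Instruction *I, SmallVectorImpl<Instruction *> &Order) {
      Order.push_back(I);
      if (!isa<LoadInst>(I))
        for (User *U : I->users())
          collect(cast<Instruction>(U), Order);
    }
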