diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
index 05e1a7431975f..02e6ed3c32f2c 100644
--- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
+++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -59,11 +59,21 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
   return new GlobalOffsetLegacy();
 }
 
+// Recursive helper function to collect Loads from GEPs in a DFS fashion.
+static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
+                     SmallVectorImpl<LoadInst *> &Loads) {
+  Traversed.push_back(P);
+  if (auto *L = dyn_cast<LoadInst>(P)) // Base case for the recursion.
+    Loads.push_back(L);
+  else {
+    assert(isa<GetElementPtrInst>(*P));
+    for (Value *V : P->users())
+      getLoads(cast<Instruction>(V), Traversed, Loads);
+  }
+}
+
 // New PM implementation.
 PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
-  if (!EnableGlobalOffset)
-    return PreservedAnalyses::all();
-
   AT = TargetHelpers::getArchType(M);
   Function *ImplicitOffsetIntrinsic = M.getFunction(Intrinsic::getName(
       AT == ArchType::Cuda
@@ -73,33 +83,62 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
   if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty())
     return PreservedAnalyses::all();
 
-  // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
-  // happy with ADDRESS_SPACE_GENERIC (0).
-  TargetAS = AT == ArchType::Cuda ? 0 : 5;
-  /// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen
-  /// to be 3, use it for the implicit argument pointer type.
-  KernelImplicitArgumentType =
-      ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
-  ImplicitOffsetPtrType =
-      Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
-  assert((ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
-         "Implicit offset intrinsic does not return the expected type");
+  if (!EnableGlobalOffset) {
+    SmallVector<CallInst *, 4> Worklist;
+    SmallVector<LoadInst *, 4> LI;
+    SmallVector<Instruction *, 4> PtrUses;
 
-  SmallVector<KernelPayload, 4> KernelPayloads;
-  TargetHelpers::populateKernels(M, KernelPayloads, AT);
+    // Collect all GEPs and Loads from the intrinsic's CallInsts.
+    for (Value *V : ImplicitOffsetIntrinsic->users()) {
+      Worklist.push_back(cast<CallInst>(V));
+      for (Value *V2 : V->users())
+        getLoads(cast<Instruction>(V2), PtrUses, LI);
+    }
+
+    // Replace each use of a collected Load with a constant 0.
+    for (LoadInst *L : LI)
+      L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
 
-  // Validate kernels and populate entry map
+    // Remove all collected Loads and GEPs from the kernel. PtrUses is
+    // filled by `getLoads` in topological order; walk it backwards so
+    // users are erased before the instructions they depend on.
+    for (auto *I : reverse(PtrUses))
+      I->eraseFromParent();
 
-  EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
+    // Remove all collected CallInsts from the kernel.
+    for (CallInst *CI : Worklist) {
+      auto *I = cast<Instruction>(CI);
+      I->eraseFromParent();
+    }
 
-  // Add implicit parameters to all direct and indirect users of the offset
-  addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
+  } else {
+    // For AMD, allocas and pointers have to be in CONSTANT_PRIVATE (5); NVVM
+    // is happy with ADDRESS_SPACE_GENERIC (0).
+    TargetAS = AT == ArchType::Cuda ? 0 : 5;
+    /// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS
+    /// happens to be 3; use it for the implicit argument pointer type.
+    KernelImplicitArgumentType =
+        ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
+    ImplicitOffsetPtrType =
+        Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
+    assert(
+        (ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
+        "Implicit offset intrinsic does not return the expected type");
+
+    SmallVector<KernelPayload, 4> KernelPayloads;
+    TargetHelpers::populateKernels(M, KernelPayloads, AT);
+
+    // Validate kernels and populate the entry map.
+    EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
+
+    // Add implicit parameters to all direct and indirect users of the offset.
+    addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
+  }
 
   // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
   // it.
   assert(ImplicitOffsetIntrinsic->use_empty() &&
          "Not all uses of intrinsic removed");
   ImplicitOffsetIntrinsic->eraseFromParent();
-
   return PreservedAnalyses::none();
 }
diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
new file mode 100644
index 0000000000000..ccc06cf248a68
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
@@ -0,0 +1,20 @@
+; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
+
+; This test checks that the implicit offset intrinsic is correctly removed.
+
+declare ptr addrspace(5) @llvm.amdgcn.implicit.offset()
+; CHECK-NOT: llvm.amdgcn.implicit.offset
+
+define weak_odr dso_local i64 @_ZTS14example_kernel() {
+entry:
+; CHECK-NOT: @llvm.amdgcn.implicit.offset()
+; CHECK-NOT: getelementptr
+; CHECK-NOT: load
+; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
+; CHECK: ret i{{[0-9]+}} [[REG]]
+  %0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
+  %1 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 1
+  %2 = load i32, ptr addrspace(5) %1, align 4
+  %3 = zext i32 %2 to i64
+  ret i64 %3
+}
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-removal.ll b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
new file mode 100644
index 0000000000000..da116feede474
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
@@ -0,0 +1,21 @@
+; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
+target triple = "nvptx64-nvidia-cuda"
+
+; This test checks that the implicit offset intrinsic is correctly removed.
+
+declare ptr @llvm.nvvm.implicit.offset()
+; CHECK-NOT: llvm.nvvm.implicit.offset
+
+define weak_odr dso_local i64 @_ZTS14example_kernel() {
+entry:
+; CHECK-NOT: @llvm.nvvm.implicit.offset()
+; CHECK-NOT: getelementptr
+; CHECK-NOT: load
+; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
+; CHECK: ret i{{[0-9]+}} [[REG]]
+  %0 = tail call ptr @llvm.nvvm.implicit.offset()
+  %1 = getelementptr inbounds i32, ptr %0, i64 1
+  %2 = load i32, ptr %1, align 4
+  %3 = zext i32 %2 to i64
+  ret i64 %3
+}
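Note on the expected output: with -enable-global-offset=false, the CHECK lines in both tests amount to the kernel body collapsing to a zext of the constant 0 that replaced the erased load. A minimal sketch of the resulting IR for the NVPTX kernel above (the register numbering is illustrative, not copied from actual opt output):

; After the pass: the call, GEP and load are gone; only the zext of 0 remains.
define weak_odr dso_local i64 @_ZTS14example_kernel() {
entry:
  %0 = zext i32 0 to i64
  ret i64 %0
}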