diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 6016bd5187d88..f913833a25119 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -149,6 +149,11 @@ struct AMDGPULowerBufferFatPointersPass const TargetMachine &TM; }; +struct AMDGPUCloneModuleLDSPass + : public PassInfoMixin<AMDGPUCloneModuleLDSPass> { + PreservedAnalyses run(Module &, ModuleAnalysisManager &); +}; + void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp new file mode 100644 index 0000000000000..35fd88a592ec6 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp @@ -0,0 +1,156 @@ +//===-- AMDGPUCloneModuleLDSPass.cpp ------------------------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The purpose of this pass is to ensure that the combined module contains +// as many LDS global variables as there are kernels that (indirectly) access +// them. As LDS variables behave like C++ static variables, it is important that +// each partition contains a unique copy of the variable on a per kernel basis. +// This representation also prepares the combined module to eliminate +// cross-module false dependencies of LDS variables. This pass runs prior to the +// AMDGPULowerModuleLDS pass in the fullLTO pipeline and is used to improve +// the functionality of --lto-partitions. +// +// This pass operates as follows: +// 1. Firstly, traverse the call graph from each kernel to determine the number +// of kernels calling each device function. +// 2. For each LDS global variable GV, determine the function F that defines it.
+// Collect its caller functions. Clone F and GV, and finally insert a +// call/invoke instruction in each caller function. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "Utils/AMDGPUMemoryUtils.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; +using GVToFnMapTy = DenseMap<GlobalVariable *, Function *>; + +#define DEBUG_TYPE "amdgpu-clone-module-lds" + +static cl::opt<unsigned> MaxCountForClonedFunctions( "clone-lds-functions-max-count", cl::init(16), cl::Hidden, cl::desc("Specify a limit to the number of clones of a function")); + +/// Return the function that defines \p GV /// \param GV The global variable in question /// \return The function defining \p GV static Function *getFunctionDefiningGV(GlobalVariable &GV) { SmallVector<User *> Worklist(GV.users()); while (!Worklist.empty()) { User *U = Worklist.pop_back_val(); if (auto *Inst = dyn_cast<Instruction>(U)) return Inst->getFunction(); if (auto *Op = dyn_cast<Operator>(U)) append_range(Worklist, Op->users()); } return nullptr; }; + +/// Return a map of LDS globals paired with the function defining them /// \param M Module in question /// \return Map of LDS global variables and their functions static GVToFnMapTy collectModuleGlobals(Module &M) { GVToFnMapTy GVToFnMap; for (auto &GA : M.aliases()) { if (auto *GV = dyn_cast<GlobalVariable>(GA.getAliaseeObject())) { if (AMDGPU::isLDSVariableToLower(*GV) && !GVToFnMap.contains(GV)) GVToFnMap.insert({GV, getFunctionDefiningGV(*GV)}); } } + for (auto &GV : M.globals()) { if (AMDGPU::isLDSVariableToLower(GV) && !GVToFnMap.contains(&GV)) GVToFnMap.insert({&GV, getFunctionDefiningGV(GV)}); } return GVToFnMap; } PreservedAnalyses
AMDGPUCloneModuleLDSPass::run(Module &M, ModuleAnalysisManager &AM) { if (MaxCountForClonedFunctions.getValue() == 1) return PreservedAnalyses::all(); + bool Changed = false; auto &CG = AM.getResult<CallGraphAnalysis>(M); + // For each function in the call graph, determine the number // of ancestor-caller kernels. DenseMap<Function *, unsigned> KernelRefsToFuncs; for (auto &Fn : M) { if (Fn.getCallingConv() != CallingConv::AMDGPU_KERNEL) continue; for (auto I = df_begin(&CG), E = df_end(&CG); I != E; ++I) { if (auto *F = I->getFunction()) KernelRefsToFuncs[F]++; } } + GVToFnMapTy GVToFnMap = collectModuleGlobals(M); for (auto [GV, OldF] : GVToFnMap) { LLVM_DEBUG(dbgs() << "Found LDS " << GV->getName() << " used in function " << OldF->getName() << '\n'); + // Collect all call instructions to OldF SmallVector<CallBase *> InstsCallingOldF; for (auto &I : OldF->uses()) { if (auto *CI = dyn_cast<CallBase>(I.getUser())) InstsCallingOldF.push_back(CI); } + // Create as many clones of the function containing LDS global as // there are kernels calling the function (including the function // already defining the LDS global). Respectively, clone the // LDS global and the call instructions to the function. LLVM_DEBUG(dbgs() << "\tFunction is referenced by " << KernelRefsToFuncs[OldF] << " kernels.\n"); for (unsigned int ID = 0; ID + 1 < std::min(KernelRefsToFuncs[OldF], MaxCountForClonedFunctions.getValue()); ++ID) { // Clone LDS global variable auto *NewGV = new GlobalVariable( M, GV->getValueType(), GV->isConstant(), GlobalValue::InternalLinkage, PoisonValue::get(GV->getValueType()), GV->getName() + ".clone."
+ Twine(ID), GV, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); NewGV->copyAttributesFrom(GV); NewGV->copyMetadata(GV, 0); NewGV->setComdat(GV->getComdat()); LLVM_DEBUG(dbgs() << "Inserting LDS clone with name " << NewGV->getName() << '\n'); + // Clone function ValueToValueMapTy VMap; VMap[GV] = NewGV; auto *NewF = CloneFunction(OldF, VMap); NewF->setName(OldF->getName() + ".clone." + Twine(ID)); LLVM_DEBUG(dbgs() << "Inserting function clone with name " << NewF->getName() << '\n'); + // Create a new CallInst to call the cloned function for (auto *Inst : InstsCallingOldF) { Instruction *I = Inst->clone(); I->setName(Inst->getName() + ".clone." + Twine(ID)); if (auto *CI = dyn_cast<CallBase>(I)) CI->setCalledOperand(NewF); I->insertAfter(Inst); LLVM_DEBUG(dbgs() << "Inserting inst: " << *I << '\n'); } Changed = true; } } return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 90f36fadf3590..eb4bf25fef628 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers", AMDGPULowerBufferFatPointersPass(*this)) MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass()) MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this)) +MODULE_PASS("amdgpu-clone-module-lds", AMDGPUCloneModuleLDSPass()) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) #undef MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 305a6c8c3b926..09beabd3f9c55 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -725,6 +725,7 @@ void
AMDGPUTargetMachine::registerPassBuilderCallbacks( // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. + PM.addPass(AMDGPUCloneModuleLDSPass()); if (EnableLowerModuleLDS) PM.addPass(AMDGPULowerModuleLDSPass(*this)); }); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 48325a0928f93..fbf59e0422cb7 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAtomicOptimizer.cpp AMDGPUAttributor.cpp AMDGPUCallLowering.cpp + AMDGPUCloneModuleLDS.cpp AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function-ancestor-kernels.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function-ancestor-kernels.ll new file mode 100644 index 0000000000000..bc1d90e1c5a58 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function-ancestor-kernels.ll @@ -0,0 +1,120 @@ +; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s + +; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o %t +; RUN: llvm-split -o %t %t -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=MOD0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=MOD1 %s + +target triple = "amdgcn-amd-amdhsa" + +; Before transformation, After transformation, +; K1 K2 K1 K2 +; | / | / +; | / | / +; A ==> A +; | \ | \ +; | \ | \ +; B C B C +; | | \ +; X X1 X2 +; +; where X contains an LDS reference + +; CHECK: [[GV_CLONE:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16 +; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16 +@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16 + +define protected amdgpu_kernel void @kernel1(i32 %n) #3 { +; 
CHECK-LABEL: define protected amdgpu_kernel void @kernel1( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]]) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @A(i32 %n) + ret void +} + +define protected amdgpu_kernel void @kernel2(i32 %n) #3 { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel2( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]]) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @A(i32 %n) + ret void +} + +define void @A() { +; CHECK-LABEL: define void @A() { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @B() +; CHECK-NEXT: call void @C() +; CHECK-NEXT: ret void +; +entry: + call void @B() + call void @C() + ret void +} + +define i32 @B() { +; CHECK-LABEL: define i32 @B() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 5, ptr [[P]], align 4 +; CHECK-NEXT: [[RET:%.*]] = call i32 @X(ptr [[P]]) +; CHECK-NEXT: [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %p = alloca i32 + store i32 5, ptr %p + %ret = call i32 @X(ptr %p) + ret i32 %ret +} + +define void @C() { +; CHECK-LABEL: define void @C() { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void +} + +define i32 @X(ptr %x) { +; CHECK-LABEL: define i32 @X( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0 +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[X]], align 4 +; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4 +; CHECK-NEXT: ret i32 [[V]] +; +entry: + %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0 + %v = load i32, ptr %x + store i32 %v, ptr %p + ret i32 %v +} + +; CHECK-LABEL: define i32 @X.clone.0(ptr %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr 
inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE]] to ptr), i64 0, i64 0 +; CHECK-NEXT: %v = load i32, ptr %x, align 4 +; CHECK-NEXT: store i32 %v, ptr %p, align 4 +; CHECK-NEXT: ret i32 %v + +; MOD0: {{.*}} addrspace(3) global [64 x i32] undef, align 16 +; MOD0: define i32 @X(ptr %x) + +; MOD1: {{.*}} addrspace(3) global [64 x i32] poison, align 16 +; MOD1: define protected amdgpu_kernel void @kernel1(i32 %n) +; MOD1: define protected amdgpu_kernel void @kernel2(i32 %n) +; MOD1: define void @A() +; MOD1: define i32 @B() +; MOD1: define i32 @X.clone.0(ptr %x) diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function-successor.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function-successor.ll new file mode 100644 index 0000000000000..71c02ca50b2b4 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function-successor.ll @@ -0,0 +1,148 @@ +; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o - | FileCheck %s + +; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o %t +; RUN: llvm-split -o %t %t -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=MOD0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=MOD1 %s + +target triple = "amdgcn-amd-amdhsa" + +; Before transformation, After transformation, +; K1 K2 K3 K1 K2 K3 +; | / | | / | +; | / | | / | +; A --------+ ==> A --------+ +; | | +; | | +; B B +; | / | \ +; X X1 X2 X3 +; | \ | / +; D \ | / +; D +; where X contains an LDS reference + +; CHECK: [[GV_CLONE_0:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16 +; CHECK: [[GV_CLONE_1:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16 +; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16 +@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16 + +define protected amdgpu_kernel void @kernel1(i32 %n) { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel1( +; 
CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]]) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @A(i32 %n) + ret void +} + +define protected amdgpu_kernel void @kernel2(i32 %n) { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel2( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]]) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @A(i32 %n) + ret void +} + +define protected amdgpu_kernel void @kernel3(i32 %n) { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel3( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]]) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @A(i32 %n) + ret void +} + +define void @A() { +; CHECK-LABEL: define void @A() { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @B() +; CHECK-NEXT: ret void +; +entry: + call void @B() + ret void +} + +define i32 @B() { +; CHECK-LABEL: define i32 @B() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 5, ptr [[P]], align 4 +; CHECK-NEXT: [[RET:%.*]] = call i32 @X(ptr [[P]]) +; CHECK-NEXT: [[RET_CLONE_1:%.*]] = call i32 @X.clone.1(ptr [[P]]) +; CHECK-NEXT: [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %p = alloca i32 + store i32 5, ptr %p + %ret = call i32 @X(ptr %p) + ret i32 %ret +} + +define i32 @X(ptr %x) { +; CHECK-LABEL: define i32 @X( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0 +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[X]], align 4 +; CHECK-NEXT: call void @D(ptr [[P]]) +; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4 +; CHECK-NEXT: ret i32 [[V]] +; +entry: + %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0 + %v = 
load i32, ptr %x + call void @D(ptr %p) + store i32 %v, ptr %p + ret i32 %v +} + +define void @D(ptr %x) { +; CHECK-LABEL: define void @D(ptr %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 8, ptr %x, align 4 +; CHECK-NEXT: ret void +entry: + store i32 8, ptr %x + ret void +} + +; CHECK-LABEL: define i32 @X.clone.0(ptr %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE_0]] to ptr), i64 0, i64 0 +; CHECK-NEXT: %v = load i32, ptr %x, align 4 +; CHECK-NEXT: call void @D(ptr [[P]]) +; CHECK-NEXT: store i32 %v, ptr %p, align 4 +; CHECK-NEXT: ret i32 %v + +; CHECK-LABEL: define i32 @X.clone.1(ptr %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE_1]] to ptr), i64 0, i64 0 +; CHECK-NEXT: %v = load i32, ptr %x, align 4 +; CHECK-NEXT: call void @D(ptr [[P]]) +; CHECK-NEXT: store i32 %v, ptr %p, align 4 +; CHECK-NEXT: ret i32 %v + +; MOD0: {{.*}} addrspace(3) global [64 x i32] undef, align 16 +; MOD0: define i32 @X(ptr %x) + +; MOD1: {{.*}} addrspace(3) global [64 x i32] poison, align 16 +; MOD1: {{.*}} addrspace(3) global [64 x i32] poison, align 16 +; MOD1: define protected amdgpu_kernel void @kernel1(i32 %n) +; MOD1: define protected amdgpu_kernel void @kernel2(i32 %n) +; MOD1: define protected amdgpu_kernel void @kernel3(i32 %n) +; MOD1: define void @A() +; MOD1: define i32 @B() +; MOD1: define i32 @X.clone.0(ptr %x) diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll new file mode 100644 index 0000000000000..aa06377045644 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll @@ -0,0 +1,73 @@ +; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s + +; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o %t +; RUN: llvm-split -o %t %t -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a +; RUN: llvm-dis -o - %t0 | 
FileCheck --check-prefix=MOD0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=MOD1 %s + +target triple = "amdgcn-amd-amdhsa" + +; In this examples, CloneModuleLDS pass creates two copies of LDS_GV +; as two kernels call the same device function where LDS_GV is used. + +; CHECK: [[LDS_GV_CLONE:@.*\.clone\.0]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16 +; CHECK: [[LDS_GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16 +@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16 +@lds_gv_alias = alias ptr addrspace(3), ptr addrspace(3) @lds_gv + +define protected amdgpu_kernel void @kernel1(i32 %n) #3 { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel1( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @lds_func(i32 [[N]]) +; CHECK-NEXT: [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]]) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @lds_func(i32 %n) + ret void +} + +define protected amdgpu_kernel void @kernel2(i32 %n) #3 { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel2( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @lds_func(i32 [[N]]) +; CHECK-NEXT: [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]]) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @lds_func(i32 %n) + ret void +} + + +define i32 @lds_func(i32 %x) { +; CHECK-LABEL: define i32 @lds_func( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[LDS_GV]] to ptr), i64 0, i64 0 +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 4 +; CHECK-NEXT: ret i32 [[X]] +; +entry: + %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0 + store i32 %x, ptr %p + ret i32 %x +} + +; CHECK-LABEL: define i32 @lds_func.clone.0(i32 %x) { +; 
CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[LDS_GV_CLONE]] to ptr), i64 0, i64 0 +; CHECK-NEXT: store i32 %x, ptr %p, align 4 +; CHECK-NEXT: ret i32 %x + +; MOD0: @lds_gv.clone.0 = {{.*}} addrspace(3) global [64 x i32], align 16 +; MOD0: @lds_gv = {{.*}} addrspace(3) global [64 x i32] poison, align 16 + +; MOD1: @lds_gv.clone.0 = {{.*}} addrspace(3) global [64 x i32] poison, align 16 +; MOD1: @lds_gv = {{.*}} addrspace(3) global [64 x i32], align 16 +; MOD1: define protected amdgpu_kernel void @kernel1(i32 %n) +; MOD1: define protected amdgpu_kernel void @kernel2(i32 %n) +; MOD1: define i32 @lds_func(i32 %x) +; MOD1: define i32 @lds_func.clone.0(i32 %x) diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-global-intrinsics.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-global-intrinsics.ll new file mode 100644 index 0000000000000..8c59e37b749be --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-global-intrinsics.ll @@ -0,0 +1,79 @@ +; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s + +; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o %t +; RUN: llvm-split -o %t %t -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=MOD0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=MOD1 %s + +target triple = "amdgcn-amd-amdhsa" + +%struct.RT = type { i8, [10 x [20 x i32]], i8 } +%struct.GV = type { i32, double, %struct.RT } + +; CHECK: [[LDS_GV_CLONE:@.*\.clone\.0]] = internal addrspace(3) global %struct.GV poison, align 8 +; CHECK: [[LDS_GV:@.*]] = internal addrspace(3) global %struct.GV poison, align 8 +; CHECK: @llvm.used = appending global [1 x ptr] [ +; CHECK-SAME: ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr)], section "llvm.metadata" +@lds_gv = internal addrspace(3) global %struct.GV poison, align 8 +@llvm.used = appending global [1 x ptr] [ + ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr) +], section 
"llvm.metadata" + +define protected amdgpu_kernel void @kernel1(i32 %n) #3 { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel1( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @lds_func() +; CHECK-NEXT: [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0() +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @lds_func() + ret void +} + +define protected amdgpu_kernel void @kernel2(i32 %n) #3 { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel2( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @lds_func() +; CHECK-NEXT: [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0() +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @lds_func() + ret void +} + +define ptr @lds_func() { +; CHECK-LABEL: define ptr @lds_func() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [[STRUCT_GV:%.*]], ptr addrspace(3) [[LDS_GV]], i64 1, i32 2, i32 1, i64 5, i64 13 +; CHECK-NEXT: [[RET_PTR:%.*]] = addrspacecast ptr addrspace(3) [[P]] to ptr +; CHECK-NEXT: ret ptr [[RET_PTR]] +; +entry: + %p = getelementptr inbounds %struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 13 + %ret_ptr = addrspacecast ptr addrspace(3) %p to ptr + ret ptr %ret_ptr +} + +; CHECK-LABEL: define ptr @lds_func.clone.0() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds %struct.GV, ptr addrspace(3) [[LDS_GV_CLONE]], i64 1, i32 2, i32 1, i64 5, i64 13 +; CHECK-NEXT: [[RET_PTR:%.*]] = addrspacecast ptr addrspace(3) [[P]] to ptr +; CHECK-NEXT: ret ptr [[RET_PTR]] +; CHECK-NEXT: } + +; MOD0: @lds_gv.clone.0 = external hidden addrspace(3) global %struct.GV, align 8 +; MOD0: @lds_gv = hidden addrspace(3) global %struct.GV poison, align 8 +; MOD0: @llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr)], section "llvm.metadata" + +; MOD1: @lds_gv.clone.0 = hidden addrspace(3) global %struct.GV poison, align 8 +; 
MOD1: @lds_gv = external hidden addrspace(3) global %struct.GV, align 8 +; MOD1: @llvm.used = external global [1 x ptr], section "llvm.metadata" + +; MOD1: define protected amdgpu_kernel void @kernel1(i32 %n) +; MOD1: define protected amdgpu_kernel void @kernel2(i32 %n) +; MOD1: define ptr @lds_func() +; MOD1: define ptr @lds_func.clone.0() diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll new file mode 100644 index 0000000000000..6d13d928fb132 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll @@ -0,0 +1,114 @@ +; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s + +; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o %t +; RUN: llvm-split -o %t %t -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=MOD0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=MOD1 %s + +target triple = "amdgcn-amd-amdhsa" + +%struct.RT = type { i8, [10 x [20 x i32]], i8 } +%struct.GV = type { i32, double, %struct.RT } + +; CHECK: [[GV_CLONE_0:@.*]] = internal addrspace(3) global %struct.GV poison, align 8 +; CHECK: [[GV:@.*]] = internal addrspace(3) global %struct.GV poison, align 8 +@lds_gv = internal addrspace(3) global %struct.GV poison, align 8 + +define protected amdgpu_kernel void @kernel1(i32 %n) #3 { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel1( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @lds_func(i32 [[N]], i1 false) +; CHECK-NEXT: [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]], i1 false) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @lds_func(i32 %n, i1 false) + ret void +} + +define protected amdgpu_kernel void @kernel2(i32 %n) #3 { +; CHECK-LABEL: define protected amdgpu_kernel void @kernel2( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @lds_func(i32 [[N]], i1 true) +; 
CHECK-NEXT: [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]], i1 true) +; CHECK-NEXT: ret void +; +entry: + %call = call i32 @lds_func(i32 %n, i1 1) + ret void +} + +define i32 @lds_func(i32 %x, i1 %cond) { +; CHECK-LABEL: define i32 @lds_func( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP_0:%.*]] = alloca %struct.GV, align 8, addrspace(3) +; CHECK-NEXT: %p = getelementptr inbounds [[STRUCT_GV:%.*]], ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 13 +; CHECK-NEXT: store i32 %x, ptr addrspace(3) %p, align 4 +; CHECK-NEXT: store i32 %x, ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 12), align 4 +; CHECK-NEXT: store ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 11), ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 1), align 4 +; CHECK-NEXT: %gep.ascast = load i8, ptr getelementptr inbounds (%struct.GV, ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 6), align 1 +; CHECK-NEXT: br i1 %cond, label %bb.1, label %bb.2 +; CHECK: bb.1: ; preds = %entry +; CHECK-NEXT: br label %sink +; CHECK: bb.2: ; preds = %entry +; CHECK-NEXT: br label %sink +; CHECK: sink: ; preds = %bb.2, %bb.1 +; CHECK-NEXT: %val = phi ptr addrspace(3) [ [[TMP_0]], %bb.1 ], [ [[GV]], %bb.2 ] +; CHECK-NEXT: %p.0 = getelementptr inbounds %struct.GV, ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 1 +; CHECK-NEXT: %retval = load i32, ptr addrspace(3) %p.0, align 4 +; CHECK-NEXT: ret i32 %retval +; +entry: + %tmp.GV = alloca %struct.GV, addrspace(3) + %p = getelementptr inbounds %struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 13 + store i32 %x, ptr addrspace(3) %p + store i32 %x, ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 12) + store ptr addrspace(3) getelementptr 
inbounds (%struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 11), ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 1) + %gep.ascast = load i8, ptr getelementptr inbounds (%struct.GV, ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 6), align 1 + br i1 %cond, label %bb.1, label %bb.2 + +bb.1: + br label %sink + +bb.2: + br label %sink + +sink: + %val = phi ptr addrspace(3) [%tmp.GV, %bb.1], [@lds_gv, %bb.2] + %p.0 = getelementptr inbounds %struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 1 + %retval = load i32, ptr addrspace(3) %p.0 + ret i32 %retval +} + +; CHECK-LABEL: define i32 @lds_func.clone.0( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP_0]] = alloca %struct.GV, align 8, addrspace(3) +; CHECK-NEXT: %p = getelementptr inbounds [[STRUCT_GV:%.*]], ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 13 +; CHECK-NEXT: store i32 %x, ptr addrspace(3) %p, align 4 +; CHECK-NEXT: store i32 %x, ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 12), align 4 +; CHECK-NEXT: store ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 11), ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 1), align 4 +; CHECK-NEXT: %gep.ascast = load i8, ptr getelementptr inbounds (%struct.GV, ptr addrspacecast (ptr addrspace(3) [[GV_CLONE_0]] to ptr), i64 6), align 1 +; CHECK-NEXT: br i1 %cond, label %bb.1, label %bb.2 +; CHECK: bb.1: ; preds = %entry +; CHECK-NEXT: br label %sink +; CHECK: bb.2: ; preds = %entry +; CHECK-NEXT: br label %sink +; CHECK: sink: ; preds = %bb.2, %bb.1 +; CHECK-NEXT: %val = phi ptr addrspace(3) [ [[TMP_0]], %bb.1 ], [ [[GV_CLONE_0]], %bb.2 ] +; CHECK-NEXT: %p.0 = getelementptr inbounds %struct.GV, ptr addrspace(3) 
[[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 1 +; CHECK-NEXT: %retval = load i32, ptr addrspace(3) %p.0, align 4 +; CHECK-NEXT: ret i32 %retval + +; MOD0: {{.*}} addrspace(3) global %struct.GV, align 8 +; MOD0: {{.*}} addrspace(3) global %struct.GV poison, align 8 + +; MOD1: {{.*}} addrspace(3) global %struct.GV poison, align 8 +; MOD1: {{.*}} addrspace(3) global %struct.GV, align 8 +; MOD1: define protected amdgpu_kernel void @kernel1(i32 %n) +; MOD1: define protected amdgpu_kernel void @kernel2(i32 %n) +; MOD1: define i32 @lds_func(i32 %x, i1 %cond) +; MOD1: define i32 @lds_func.clone.0(i32 %x, i1 %cond)