Skip to content

Commit 00cf4c2

Browse files
author
MartinWehking
authored
[SYCL][CUDA][HIP] Fix enable-global-offset flag (#11674)
- Modify the globaloffset pass to remove calls to the `llvm.nvvm.implicit.offset` and `llvm.amdgcn.implicit.offset` intrinsics from the IR during the SYCL globaloffset pass when `-enable-global-offset=false`. Remove their respective uses, i.e. GEPs and Loads, and replace further uses of the latter with 0 constants. Ensure that these intrinsics no longer occur during target lowering. Previously, in some cases a compilation error was thrown because the intrinsic could not be selected for the AMDGPU and NVPTX targets. Based on inspection of the IR, any calls of the intrinsic were probably expected to be fully removed after the globaloffset pass.
- Replace Loads from the intrinsic with known constants and enable further optimization of the IR to remove dead code. In our observed cases, several kernels with an implicit global offset failed to remove useless stores to the stack.
1 parent 499dae4 commit 00cf4c2

File tree

3 files changed

+101
-21
lines changed

3 files changed

+101
-21
lines changed

llvm/lib/SYCLLowerIR/GlobalOffset.cpp

Lines changed: 60 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,21 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
5959
return new GlobalOffsetLegacy();
6060
}
6161

62+
// Recursive helper function to collect Loads from GEPs in a BFS fashion.
63+
static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
64+
SmallVectorImpl<LoadInst *> &Loads) {
65+
Traversed.push_back(P);
66+
if (auto *L = dyn_cast<LoadInst>(P)) // Base case for recursion
67+
Loads.push_back(L);
68+
else {
69+
assert(isa<GetElementPtrInst>(*P));
70+
for (Value *V : P->users())
71+
getLoads(cast<Instruction>(V), Traversed, Loads);
72+
}
73+
}
74+
6275
// New PM implementation.
6376
PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
64-
if (!EnableGlobalOffset)
65-
return PreservedAnalyses::all();
66-
6777
AT = TargetHelpers::getArchType(M);
6878
Function *ImplicitOffsetIntrinsic = M.getFunction(Intrinsic::getName(
6979
AT == ArchType::Cuda
@@ -73,33 +83,62 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
7383
if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty())
7484
return PreservedAnalyses::all();
7585

76-
// For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
77-
// happy with ADDRESS_SPACE_GENERIC (0).
78-
TargetAS = AT == ArchType::Cuda ? 0 : 5;
79-
/// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen
80-
/// to be 3, use it for the implicit argument pointer type.
81-
KernelImplicitArgumentType =
82-
ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
83-
ImplicitOffsetPtrType =
84-
Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
85-
assert((ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
86-
"Implicit offset intrinsic does not return the expected type");
86+
if (!EnableGlobalOffset) {
87+
SmallVector<CallInst *, 4> Worklist;
88+
SmallVector<LoadInst *, 4> LI;
89+
SmallVector<Instruction *, 4> PtrUses;
8790

88-
SmallVector<KernelPayload, 4> KernelPayloads;
89-
TargetHelpers::populateKernels(M, KernelPayloads, AT);
91+
// Collect all GEPs and Loads from the intrinsic's CallInsts
92+
for (Value *V : ImplicitOffsetIntrinsic->users()) {
93+
Worklist.push_back(cast<CallInst>(V));
94+
for (Value *V2 : V->users())
95+
getLoads(cast<Instruction>(V2), PtrUses, LI);
96+
}
97+
98+
// Replace each use of a collected Load with a Constant 0
99+
for (LoadInst *L : LI)
100+
L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
90101

91-
// Validate kernels and populate entry map
92-
EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
102+
// Remove all collected Loads and GEPs from the kernel.
103+
// PtrUses is returned by `getLoads` in topological order.
104+
// Walk it backwards so we don't violate users.
105+
for (auto *I : reverse(PtrUses))
106+
I->eraseFromParent();
93107

94-
// Add implicit parameters to all direct and indirect users of the offset
95-
addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
108+
// Remove all collected CallInsts from the kernel.
109+
for (CallInst *CI : Worklist) {
110+
auto *I = cast<Instruction>(CI);
111+
I->eraseFromParent();
112+
}
113+
} else {
114+
// For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
115+
// happy with ADDRESS_SPACE_GENERIC (0).
116+
TargetAS = AT == ArchType::Cuda ? 0 : 5;
117+
/// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen
118+
/// to be 3, use it for the implicit argument pointer type.
119+
KernelImplicitArgumentType =
120+
ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
121+
ImplicitOffsetPtrType =
122+
Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
123+
assert(
124+
(ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
125+
"Implicit offset intrinsic does not return the expected type");
126+
127+
SmallVector<KernelPayload, 4> KernelPayloads;
128+
TargetHelpers::populateKernels(M, KernelPayloads, AT);
129+
130+
// Validate kernels and populate entry map
131+
EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
132+
133+
// Add implicit parameters to all direct and indirect users of the offset
134+
addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
135+
}
96136

97137
// Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
98138
// it.
99139
assert(ImplicitOffsetIntrinsic->use_empty() &&
100140
"Not all uses of intrinsic removed");
101141
ImplicitOffsetIntrinsic->eraseFromParent();
102-
103142
return PreservedAnalyses::none();
104143
}
105144

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
2+
3+
; This test checks that the implicit offset intrinsic is correctly removed
4+
5+
declare ptr addrspace(5) @llvm.amdgcn.implicit.offset()
6+
; CHECK-NOT: llvm.amdgcn.implicit.offset
7+
8+
define weak_odr dso_local i64 @_ZTS14example_kernel() {
9+
entry:
10+
; CHECK-NOT: @llvm.amdgcn.implicit.offset()
11+
; CHECK-NOT: getelementptr
12+
; CHECK-NOT: load
13+
; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
14+
; CHECK: ret i{{[0-9]+}} [[REG]]
15+
%0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
16+
%1 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 1
17+
%2 = load i32, ptr addrspace(5) %1, align 4
18+
%3 = zext i32 %2 to i64
19+
ret i64 %3
20+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
2+
target triple = "nvptx64-nvidia-cuda"
3+
4+
; This test checks that the implicit offset intrinsic is correctly removed
5+
6+
declare ptr @llvm.nvvm.implicit.offset()
7+
; CHECK-NOT: llvm.nvvm.implicit.offset
8+
9+
define weak_odr dso_local i64 @_ZTS14example_kernel() {
10+
entry:
11+
; CHECK-NOT: @llvm.nvvm.implicit.offset()
12+
; CHECK-NOT: getelementptr
13+
; CHECK-NOT: load
14+
; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
15+
; CHECK: ret i{{[0-9]+}} [[REG]]
16+
%0 = tail call ptr @llvm.nvvm.implicit.offset()
17+
%1 = getelementptr inbounds i32, ptr %0, i64 1
18+
%2 = load i32, ptr %1, align 4
19+
%3 = zext i32 %2 to i64
20+
ret i64 %3
21+
}

0 commit comments

Comments
 (0)