Skip to content

Commit 4a9d218

Browse files
authored
[SeparateConstOffsetFromGEP] propagate const offset through GEP chains (#143470)
When separating the constant offset from a GEP, if the pointer operand is itself a ptradd with a constant offset (likely generated when this transform was previously applied to the base GEP), we accumulate that offset into the current GEP's constant offset. This ensures that, in a chain of GEPs, the constant offset propagates to the final memory instruction, where it can likely be folded into the addressing mode.
1 parent 76b8e19 commit 4a9d218

File tree

5 files changed

+465
-26
lines changed

5 files changed

+465
-26
lines changed

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,19 +1039,31 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
10391039
if (GEP->getType()->isVectorTy())
10401040
return false;
10411041

1042+
// If the base of this GEP is a ptradd of a constant, let's pass the constant
1043+
// along. This ensures that when we have a chain of GEPs the constant
1044+
// offset from each is accumulated.
1045+
Value *NewBase;
1046+
const APInt *BaseOffset;
1047+
const bool ExtractBase =
1048+
match(GEP->getPointerOperand(),
1049+
m_PtrAdd(m_Value(NewBase), m_APInt(BaseOffset)));
1050+
1051+
const int64_t BaseByteOffset = ExtractBase ? BaseOffset->getSExtValue() : 0;
1052+
10421053
// The backend can already nicely handle the case where all indices are
10431054
// constant.
1044-
if (GEP->hasAllConstantIndices())
1055+
if (GEP->hasAllConstantIndices() && !ExtractBase)
10451056
return false;
10461057

10471058
bool Changed = canonicalizeArrayIndicesToIndexSize(GEP);
10481059

10491060
bool NeedsExtraction;
1050-
int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
1061+
int64_t AccumulativeByteOffset =
1062+
BaseByteOffset + accumulateByteOffset(GEP, NeedsExtraction);
10511063

10521064
TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
10531065

1054-
if (!NeedsExtraction) {
1066+
if (!NeedsExtraction && !ExtractBase) {
10551067
Changed |= reorderGEP(GEP, TTI);
10561068
return Changed;
10571069
}
@@ -1075,7 +1087,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
10751087

10761088
// Track information for preserving GEP flags.
10771089
bool AllOffsetsNonNegative = AccumulativeByteOffset >= 0;
1078-
bool AllNUWPreserved = true;
1090+
bool AllNUWPreserved = GEP->hasNoUnsignedWrap();
1091+
bool NewGEPInBounds = GEP->isInBounds();
1092+
bool NewGEPNUSW = GEP->hasNoUnsignedSignedWrap();
10791093

10801094
// Remove the constant offset in each sequential index. The resultant GEP
10811095
// computes the variadic base.
@@ -1111,6 +1125,16 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
11111125
}
11121126
}
11131127
}
1128+
if (ExtractBase) {
1129+
GEPOperator *Base = cast<GEPOperator>(GEP->getPointerOperand());
1130+
AllNUWPreserved &= Base->hasNoUnsignedWrap();
1131+
NewGEPInBounds &= Base->isInBounds();
1132+
NewGEPNUSW &= Base->hasNoUnsignedSignedWrap();
1133+
AllOffsetsNonNegative &= BaseByteOffset >= 0;
1134+
1135+
GEP->setOperand(0, NewBase);
1136+
RecursivelyDeleteTriviallyDeadInstructions(Base);
1137+
}
11141138

11151139
// Clear the inbounds attribute because the new index may be off-bound.
11161140
// e.g.,
@@ -1138,21 +1162,21 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
11381162

11391163
// If the initial GEP was NUW and all operations that we reassociate were NUW
11401164
// additions, the resulting GEPs are also NUW.
1141-
if (GEP->hasNoUnsignedWrap() && AllNUWPreserved) {
1165+
if (AllNUWPreserved) {
11421166
NewGEPFlags |= GEPNoWrapFlags::noUnsignedWrap();
11431167
// If the initial GEP additionally had NUSW (or inbounds, which implies
11441168
// NUSW), we know that the indices in the initial GEP must all have their
11451169
// signbit not set. For indices that are the result of NUW adds, the
11461170
// add-operands therefore also don't have their signbit set. Therefore, all
11471171
// indices of the resulting GEPs are non-negative -> we can preserve
11481172
// the inbounds/nusw flag.
1149-
CanPreserveInBoundsNUSW |= GEP->hasNoUnsignedSignedWrap();
1173+
CanPreserveInBoundsNUSW |= NewGEPNUSW;
11501174
}
11511175

11521176
if (CanPreserveInBoundsNUSW) {
1153-
if (GEP->isInBounds())
1177+
if (NewGEPInBounds)
11541178
NewGEPFlags |= GEPNoWrapFlags::inBounds();
1155-
else if (GEP->hasNoUnsignedSignedWrap())
1179+
else if (NewGEPNUSW)
11561180
NewGEPFlags |= GEPNoWrapFlags::noUnsignedSignedWrap();
11571181
}
11581182

@@ -1220,11 +1244,13 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
12201244

12211245
DL = &F.getDataLayout();
12221246
bool Changed = false;
1223-
for (BasicBlock &B : F) {
1224-
if (!DT->isReachableFromEntry(&B))
1247+
1248+
ReversePostOrderTraversal<Function *> RPOT(&F);
1249+
for (BasicBlock *B : RPOT) {
1250+
if (!DT->isReachableFromEntry(B))
12251251
continue;
12261252

1227-
for (Instruction &I : llvm::make_early_inc_range(B))
1253+
for (Instruction &I : llvm::make_early_inc_range(*B))
12281254
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
12291255
Changed |= splitGEP(GEP);
12301256
// No need to split GEP ConstantExprs because all its indices are constant

llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -279,11 +279,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
279279
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
280280
; CHECK-NEXT: ds_write_b32 v0, v58
281281
; CHECK-NEXT: s_branch .LBB0_7
282-
; CHECK-NEXT: .LBB0_16: ; %Flow45
282+
; CHECK-NEXT: .LBB0_16: ; %Flow43
283283
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
284284
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69
285285
; CHECK-NEXT: v_mov_b32_e32 v57, v0
286-
; CHECK-NEXT: .LBB0_17: ; %Flow46
286+
; CHECK-NEXT: .LBB0_17: ; %Flow44
287287
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
288288
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
289289
; CHECK-NEXT: s_mov_b32 s55, exec_lo
@@ -330,11 +330,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
330330
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
331331
; CHECK-NEXT: ds_write_b32 v0, v57
332332
; CHECK-NEXT: s_branch .LBB0_19
333-
; CHECK-NEXT: .LBB0_22: ; %Flow43
333+
; CHECK-NEXT: .LBB0_22: ; %Flow41
334334
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
335335
; CHECK-NEXT: s_inst_prefetch 0x2
336336
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
337-
; CHECK-NEXT: .LBB0_23: ; %Flow44
337+
; CHECK-NEXT: .LBB0_23: ; %Flow42
338338
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
339339
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
340340
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
@@ -347,7 +347,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
347347
; CHECK-NEXT: s_or_b32 s53, s4, s53
348348
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
349349
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
350-
; CHECK-NEXT: .LBB0_25: ; %Flow51
350+
; CHECK-NEXT: .LBB0_25: ; %Flow49
351351
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
352352
; CHECK-NEXT: v_mov_b32_e32 v31, v40
353353
; CHECK-NEXT: v_mov_b32_e32 v0, 1

llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,8 @@ define void @addrspace3(ptr addrspace(3) %in.ptr, i64 %in.idx1) {
279279
; CHECK-NEXT: entry:
280280
; CHECK-NEXT: [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
281281
; CHECK-NEXT: [[IDXPROM:%.*]] = trunc i64 [[IN_IDX1_NNEG]] to i32
282-
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i128, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
283-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i64 1024
282+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i128, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
283+
; CHECK-NEXT: [[IDX11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP0]], i32 1024
284284
; CHECK-NEXT: ret void
285285
;
286286
entry:
@@ -296,8 +296,8 @@ define void @addrspace7(ptr addrspace(7) %in.ptr, i64 %in.idx1) {
296296
; CHECK-NEXT: entry:
297297
; CHECK-NEXT: [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
298298
; CHECK-NEXT: [[IDXPROM:%.*]] = trunc i64 [[IN_IDX1_NNEG]] to i32
299-
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i128, ptr addrspace(7) [[IN_PTR]], i32 [[IDXPROM]]
300-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP0]], i64 1024
299+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i128, ptr addrspace(7) [[IN_PTR]], i32 [[IDXPROM]]
300+
; CHECK-NEXT: [[IDX11:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 1024
301301
; CHECK-NEXT: ret void
302302
;
303303
entry:

llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,13 @@ define void @reorder_i8half(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1
7474
; CHECK-NEXT: [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
7575
; CHECK-NEXT: [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
7676
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM2]]
77-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i64 256
77+
; CHECK-NEXT: [[IDX13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 256
7878
; CHECK-NEXT: [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
7979
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM3]]
80-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 512
80+
; CHECK-NEXT: [[IDX25:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 512
8181
; CHECK-NEXT: [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
8282
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM4]]
83-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i64 768
83+
; CHECK-NEXT: [[IDX37:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 768
8484
; CHECK-NEXT: ret void
8585
;
8686
entry:
@@ -169,13 +169,13 @@ define void @bad_index(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
169169
; CHECK-NEXT: [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
170170
; CHECK-NEXT: [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
171171
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM2]]
172-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i64 1
172+
; CHECK-NEXT: [[IDX13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 1
173173
; CHECK-NEXT: [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
174174
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM3]]
175-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 2
175+
; CHECK-NEXT: [[IDX25:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 2
176176
; CHECK-NEXT: [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
177177
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM4]]
178-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i64 3
178+
; CHECK-NEXT: [[IDX37:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 3
179179
; CHECK-NEXT: ret void
180180
;
181181
entry:

0 commit comments

Comments
 (0)