Skip to content

Commit 92e5608

Browse files
authored
[Transform][LoadStoreVectorizer] allow redundant in Chain (#163019)
This can absorb redundant loads when forming a vector load. It can be used to fix the situation created by VectorCombine. See: https://discourse.llvm.org/t/what-is-the-purpose-of-vectorizeloadinsert-in-the-vectorcombine-pass/88532
1 parent be717af commit 92e5608

14 files changed

+320
-328
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 50 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -626,26 +626,35 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
626626
std::vector<Chain> Ret;
627627
Ret.push_back({C.front()});
628628

629+
unsigned ElemBytes = DL.getTypeStoreSize(getChainElemTy(C));
630+
APInt PrevReadEnd = C[0].OffsetFromLeader +
631+
DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
629632
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
630633
// `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
631634
auto &CurChain = Ret.back();
632-
const ChainElem &Prev = CurChain.back();
633-
unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
634-
assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
635-
"collectEquivalenceClass");
636-
APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
635+
unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
637636

638637
// Add this instruction to the end of the current chain, or start a new one.
639-
bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
640-
LLVM_DEBUG(dbgs() << "LSV: Instructions are "
641-
<< (AreContiguous ? "" : "not ") << "contiguous: "
642-
<< *Prev.Inst << " (ends at offset " << PrevReadEnd
643-
<< ") -> " << *It->Inst << " (starts at offset "
638+
assert(SzBytes % ElemBytes == 0);
639+
APInt ReadEnd = It->OffsetFromLeader + SzBytes;
640+
// Allow redundancy: partial or full overlap counts as contiguous.
641+
bool AreContiguous = false;
642+
if (It->OffsetFromLeader.sle(PrevReadEnd)) {
643+
uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
644+
if (Overlap % ElemBytes == 0)
645+
AreContiguous = true;
646+
}
647+
648+
LLVM_DEBUG(dbgs() << "LSV: Instruction is "
649+
<< (AreContiguous ? "contiguous" : "chain-breaker")
650+
<< *It->Inst << " (starts at offset "
644651
<< It->OffsetFromLeader << ")\n");
652+
645653
if (AreContiguous)
646654
CurChain.push_back(*It);
647655
else
648656
Ret.push_back({*It});
657+
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
649658
}
650659

651660
// Filter out length-1 chains, these are uninteresting.
@@ -874,15 +883,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
874883
Type *VecElemTy = getChainElemTy(C);
875884
bool IsLoadChain = isa<LoadInst>(C[0].Inst);
876885
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
877-
unsigned ChainBytes = std::accumulate(
878-
C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
879-
return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
880-
});
886+
unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
887+
APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
888+
unsigned ChainBytes = BytesAdded;
889+
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
890+
unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
891+
APInt ReadEnd = It->OffsetFromLeader + SzBytes;
892+
// Update ChainBytes considering possible overlap.
893+
BytesAdded =
894+
PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
895+
ChainBytes += BytesAdded;
896+
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
897+
}
898+
881899
assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
882900
// VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
883901
// than 1 byte (e.g. VecTy == <32 x i1>).
884-
Type *VecTy = FixedVectorType::get(
885-
VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
902+
unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
903+
Type *VecTy = FixedVectorType::get(VecElemTy, NumElem);
886904

887905
Align Alignment = getLoadStoreAlignment(C[0].Inst);
888906
// If this is a load/store of an alloca, we might have upgraded the alloca's
@@ -909,27 +927,31 @@ bool Vectorizer::vectorizeChain(Chain &C) {
909927
llvm::min_element(C, [](const auto &A, const auto &B) {
910928
return A.Inst->comesBefore(B.Inst);
911929
})->Inst);
912-
930+
// This can happen due to a chain of redundant loads.
931+
// In this case, just use the element-type, and avoid ExtractElement.
932+
if (NumElem == 1)
933+
VecTy = VecElemTy;
913934
// Chain is in offset order, so C[0] is the instr with the lowest offset,
914935
// i.e. the root of the vector.
915936
VecInst = Builder.CreateAlignedLoad(VecTy,
916937
getLoadStorePointerOperand(C[0].Inst),
917938
Alignment);
918939

919-
unsigned VecIdx = 0;
920940
for (const ChainElem &E : C) {
921941
Instruction *I = E.Inst;
922942
Value *V;
923943
Type *T = getLoadStoreType(I);
944+
int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
945+
int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
924946
if (auto *VT = dyn_cast<FixedVectorType>(T)) {
925947
auto Mask = llvm::to_vector<8>(
926948
llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
927949
V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
928-
VecIdx += VT->getNumElements();
929-
} else {
950+
} else if (VecTy != VecElemTy) {
930951
V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
931952
I->getName());
932-
++VecIdx;
953+
} else {
954+
V = VecInst;
933955
}
934956
if (V->getType() != I->getType())
935957
V = Builder.CreateBitOrPointerCast(V, I->getType());
@@ -964,22 +986,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
964986

965987
// Build the vector to store.
966988
Value *Vec = PoisonValue::get(VecTy);
967-
unsigned VecIdx = 0;
968-
auto InsertElem = [&](Value *V) {
989+
auto InsertElem = [&](Value *V, unsigned VecIdx) {
969990
if (V->getType() != VecElemTy)
970991
V = Builder.CreateBitOrPointerCast(V, VecElemTy);
971-
Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
992+
Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
972993
};
973994
for (const ChainElem &E : C) {
974995
auto *I = cast<StoreInst>(E.Inst);
996+
int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
997+
int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
975998
if (FixedVectorType *VT =
976999
dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
9771000
for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
9781001
InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
979-
Builder.getInt32(J)));
1002+
Builder.getInt32(J)),
1003+
VecIdx++);
9801004
}
9811005
} else {
982-
InsertElem(I->getValueOperand());
1006+
InsertElem(I->getValueOperand(), VecIdx);
9831007
}
9841008
}
9851009

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3850,8 +3850,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
38503850
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
38513851
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) poison`, addrspace 4)
38523852
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
3853-
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `ptr addrspace(1) poison`, addrspace 1)
3854-
; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) poison`, addrspace 1)
3853+
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) poison`, addrspace 1)
3854+
; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[LOAD2]](s32)
3855+
; CHECK-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p5) = G_INTTOPTR [[LOAD2]](s32)
38553856
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
38563857
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5
38573858
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3880,10 +3881,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
38803881
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
38813882
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
38823883
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
3883-
; CHECK-NEXT: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
3884+
; CHECK-NEXT: G_STORE [[INTTOPTR]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
38843885
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
38853886
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
3886-
; CHECK-NEXT: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
3887+
; CHECK-NEXT: G_STORE [[INTTOPTR1]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
38873888
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
38883889
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
38893890
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)

0 commit comments

Comments (0)