[SLPVectorizer][NFC] Save stride in a map. #157706
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Mikhail Gudim (mgudim)

Changes: In order to avoid recalculating the stride of a strided load twice, save it in a map.

Full diff: https://github.com/llvm/llvm-project/pull/157706.diff

1 file affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1cfcd3ffbd664..aef2581aab615 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1916,6 +1916,19 @@ class BoUpSLP {
class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
+ /// If we decide to generate strided load / store, this struct contains all
+ /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
+ /// and analyzeConstantStrideCandidate. Note that Stride can be given either
+ /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
+ /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
+ /// the element size of the FixedVectorType.
+ struct StridedPtrInfo {
+ Value *StrideVal = nullptr;
+ const SCEV *StrideSCEV = nullptr;
+ FixedVectorType *Ty = nullptr;
+ };
+ SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+
public:
/// Tracks the state we can represent the loads in the given sequence.
enum class LoadsState {
@@ -2211,6 +2224,11 @@ class BoUpSLP {
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
+ bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE,
+ const bool IsAnyPointerUsedOutGraph, const int64_t Diff,
+ StridedPtrInfo &SPtrInfo) const;
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
@@ -2225,6 +2243,7 @@ class BoUpSLP {
LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
SmallVectorImpl<unsigned> &Order,
SmallVectorImpl<Value *> &PointerOps,
+ StridedPtrInfo &SPtrInfo,
unsigned *BestVF = nullptr,
bool TryRecursiveCheck = true) const;
@@ -4469,11 +4488,10 @@ class BoUpSLP {
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
- TreeEntry::EntryState
- getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
- bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps);
+ TreeEntry::EntryState getScalarsVectorizationState(
+ const InstructionsState &S, ArrayRef<Value *> VL,
+ bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6446,6 +6464,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
++Cnt;
}
}
+
return Stride;
}
@@ -6789,12 +6808,13 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
/// 4. Any pointer operand is an instruction with the users outside of the
/// current graph (for masked gathers extra extractelement instructions
/// might be required).
-static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
- ArrayRef<unsigned> Order,
- const TargetTransformInfo &TTI, const DataLayout &DL,
- ScalarEvolution &SE,
- const bool IsAnyPointerUsedOutGraph,
- const int64_t Diff) {
+bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE,
+ const bool IsAnyPointerUsedOutGraph,
+ const int64_t Diff,
+ StridedPtrInfo &SPtrInfo) const {
const size_t Sz = VL.size();
const uint64_t AbsoluteDiff = std::abs(Diff);
Type *ScalarTy = VL.front()->getType();
@@ -6836,17 +6856,20 @@ static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
break;
}
- if (Dists.size() == Sz)
+ if (Dists.size() == Sz) {
+ Type *StrideTy = DL.getIndexType(Ptr0->getType());
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
+ SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
return true;
+ }
}
return false;
}
-BoUpSLP::LoadsState
-BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
- SmallVectorImpl<unsigned> &Order,
- SmallVectorImpl<Value *> &PointerOps,
- unsigned *BestVF, bool TryRecursiveCheck) const {
+BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
+ ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
+ unsigned *BestVF, bool TryRecursiveCheck) const {
// Check that a vectorized load would load the same memory as a scalar
// load. For example, we don't want to vectorize loads that are smaller
// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
@@ -6884,9 +6907,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
if (!IsSorted) {
if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
- if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
- calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
+ if (const SCEV *Stride =
+ calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
+ Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+ SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
+ SPtrInfo.StrideSCEV = Stride;
return LoadsState::StridedVectorize;
+ }
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -6930,7 +6957,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
});
if (IsPossibleStrided &&
isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
- IsAnyPointerUsedOutGraph, *Diff))
+ IsAnyPointerUsedOutGraph, *Diff, SPtrInfo))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7014,9 +7041,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
SmallVector<unsigned> Order;
SmallVector<Value *> PointerOps;
- LoadsState LS =
- canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
- /*TryRecursiveCheck=*/false);
+ LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
+ PointerOps, SPtrInfo, BestVF,
+ /*TryRecursiveCheck=*/false);
// Check that the sorted loads are consecutive.
if (LS == LoadsState::Gather) {
if (BestVF) {
@@ -7688,9 +7715,10 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
// extra analysis later, so include such nodes into a special list.
if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
OrdersType CurrentOrder;
LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
- CurrentOrder, PointerOps);
+ CurrentOrder, PointerOps, SPtrInfo);
if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
Res == LoadsState::CompressVectorize)
return std::move(CurrentOrder);
@@ -9193,8 +9221,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Try to build vector load.
ArrayRef<Value *> Values(
reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
+ StridedPtrInfo SPtrInfo;
LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
- PointerOps, &BestVF);
+ PointerOps, SPtrInfo, &BestVF);
if (LS != LoadsState::Gather ||
(BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
if (LS == LoadsState::ScatterVectorize) {
@@ -9388,6 +9417,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
unsigned VF = *CommonVF;
OrdersType Order;
SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
// Segmented load detected - vectorize at maximum vector factor.
if (InterleaveFactor <= Slice.size() &&
TTI.isLegalInterleavedAccessType(
@@ -9396,8 +9426,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
cast<LoadInst>(Slice.front())->getAlign(),
cast<LoadInst>(Slice.front())
->getPointerAddressSpace()) &&
- canVectorizeLoads(Slice, Slice.front(), Order,
- PointerOps) == LoadsState::Vectorize) {
+ canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
+ SPtrInfo) == LoadsState::Vectorize) {
UserMaxVF = InterleaveFactor * VF;
} else {
InterleaveFactor = 0;
@@ -9419,8 +9449,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<Value *> VL = TE.Scalars;
OrdersType Order;
SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
LoadsState State = canVectorizeLoads(
- VL, VL.front(), Order, PointerOps);
+ VL, VL.front(), Order, PointerOps, SPtrInfo);
if (State == LoadsState::ScatterVectorize ||
State == LoadsState::CompressVectorize)
return false;
@@ -9438,11 +9469,11 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
[&, Slice = Slice](unsigned Idx) {
OrdersType Order;
SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
return canVectorizeLoads(
Slice.slice(Idx * UserMaxVF, UserMaxVF),
- Slice[Idx * UserMaxVF], Order,
- PointerOps) ==
- LoadsState::ScatterVectorize;
+ Slice[Idx * UserMaxVF], Order, PointerOps,
+ SPtrInfo) == LoadsState::ScatterVectorize;
}))
UserMaxVF = MaxVF;
if (Slice.size() != ConsecutiveNodesSize)
@@ -9799,7 +9830,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps) {
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
assert(S.getMainOp() &&
"Expected instructions with same/alternate opcodes only.");
@@ -9901,7 +9932,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
});
});
};
- switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
+ switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
case LoadsState::CompressVectorize:
@@ -11374,8 +11405,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
OrdersType CurrentOrder;
SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
TreeEntry::EntryState State = getScalarsVectorizationState(
- S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+ S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
if (State == TreeEntry::NeedToGather) {
newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
return;
@@ -11535,6 +11567,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+ TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
TE->dump());
break;
@@ -12923,8 +12956,9 @@ void BoUpSLP::transformNodes() {
if (S.getOpcode() == Instruction::Load) {
OrdersType Order;
SmallVector<Value *> PointerOps;
- LoadsState Res =
- canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
+ StridedPtrInfo SPtrInfo;
+ LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
+ PointerOps, SPtrInfo);
AllStrided &= Res == LoadsState::StridedVectorize ||
Res == LoadsState::ScatterVectorize ||
Res == LoadsState::Gather;
@@ -13030,10 +13064,18 @@ void BoUpSLP::transformNodes() {
InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
Instruction::Load, VecTy, BaseLI->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
- if (StridedCost < OriginalVecCost)
+ if (StridedCost < OriginalVecCost) {
// Strided load is more profitable than consecutive load + reverse -
// transform the node to strided load.
+ Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
+ ->getPointerOperand()
+ ->getType());
+ StridedPtrInfo SPtrInfo;
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+ SPtrInfo.Ty = VecTy;
+ TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
E.State = TreeEntry::StridedVectorize;
+ }
}
break;
}
@@ -19474,6 +19516,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
LoadInst *LI = cast<LoadInst>(VL0);
Instruction *NewLI;
+ FixedVectorType *StridedLoadTy = nullptr;
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
@@ -19511,43 +19554,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
PO = IsReverseOrder ? PtrN : Ptr0;
- std::optional<int64_t> Diff = getPointersDiff(
- VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
Type *StrideTy = DL->getIndexType(PO->getType());
Value *StrideVal;
- if (Diff) {
- int64_t Stride =
- *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
- StrideVal =
- ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
- DL->getTypeAllocSize(ScalarTy));
- } else {
- SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
- transform(E->Scalars, PointerOps.begin(), [](Value *V) {
- return cast<LoadInst>(V)->getPointerOperand();
- });
- OrdersType Order;
- const SCEV *StrideSCEV =
- calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
- assert(StrideSCEV && "At this point stride should be known");
+ const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+ StridedLoadTy = SPtrInfo.Ty;
+ assert(StridedLoadTy && "Missing StridedPtrInfo for tree entry.");
+ unsigned StridedLoadEC =
+ StridedLoadTy->getElementCount().getKnownMinValue();
+
+ Value *Stride = SPtrInfo.StrideVal;
+ if (!Stride) {
+ const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
+ assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
SCEVExpander Expander(*SE, *DL, "strided-load-vec");
- Value *Stride = Expander.expandCodeFor(
- StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint());
- Value *NewStride =
- Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
- StrideVal = Builder.CreateMul(
- NewStride,
- ConstantInt::get(
- StrideTy,
- (IsReverseOrder ? -1 : 1) *
- static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
- }
+ Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
+ &*Builder.GetInsertPoint());
+ }
+ Value *NewStride =
+ Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
+ StrideVal = Builder.CreateMul(
+ NewStride, ConstantInt::get(
+ StrideTy, (IsReverseOrder ? -1 : 1) *
+ static_cast<int>(
+ DL->getTypeAllocSize(ScalarTy))));
Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
auto *Inst = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_load,
- {VecTy, PO->getType(), StrideTy},
- {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
- Builder.getInt32(E->Scalars.size())});
+ {StridedLoadTy, PO->getType(), StrideTy},
+ {PO, StrideVal,
+ Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
+ Builder.getInt32(StridedLoadEC)});
Inst->addParamAttr(
/*ArgNo=*/0,
Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
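For readers skimming the diff, here is a minimal standalone sketch of the compute-once, look-up-later pattern the patch adopts; the names below are simplified stand-ins, not the real LLVM types, and the stride value is made up for illustration.

#include <cstdint>
#include <map>
#include <optional>

struct TreeEntrySketch {};                 // stands in for BoUpSLP::TreeEntry

struct StridedPtrInfoSketch {
  std::optional<int64_t> ConstStride;      // stands in for StrideVal
  const void *RuntimeStrideSCEV = nullptr; // stands in for StrideSCEV
};

// Stands in for TreeEntryToStridedPtrInfoMap.
std::map<const TreeEntrySketch *, StridedPtrInfoSketch> StrideCache;

void analyze(const TreeEntrySketch &TE) {
  StridedPtrInfoSketch Info;
  Info.ConstStride = 2;    // pretend the analysis proved a constant stride
  StrideCache[&TE] = Info; // stride is saved once here...
}

int64_t codegen(const TreeEntrySketch &TE) {
  return *StrideCache.at(&TE).ConstStride; // ...and reused here, no recompute
}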
@llvm/pr-subscribers-vectorizers

Author: Mikhail Gudim (mgudim). Same summary and full diff as above.
Suggested change:
-      SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+      SPtrInfo.StrideVal = ConstantInt::get(StrideTy, -1);
I think 1 is correct, because when we generate the strided load in vectorizeTree there is this line:

StrideVal = Builder.CreateMul(
    NewStride, ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * ...
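For concreteness, here is a minimal sketch of the byte-stride arithmetic that multiplication performs; the element type and values are made-up examples, not taken from the patch.

#include <cstdint>
#include <cstdio>

// Mirrors (IsReverseOrder ? -1 : 1) * Stride * DL->getTypeAllocSize(ScalarTy):
// the element stride recorded during analysis is scaled to bytes at codegen time.
int64_t byteStride(int64_t ElemStride, int64_t AllocSize, bool IsReverseOrder) {
  return (IsReverseOrder ? -1 : 1) * ElemStride * AllocSize;
}

int main() {
  // Assume i32 elements (alloc size 4) and an element stride of 1.
  std::printf("%lld\n", (long long)byteStride(1, 4, /*IsReverseOrder=*/false)); // prints 4
  std::printf("%lld\n", (long long)byteStride(1, 4, /*IsReverseOrder=*/true));  // prints -4
  return 0;
}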
Let me see if there is a test for this
Better to remove this check and generate correct stride here
alright.
In RISCVTargetTransformInfo I see the line unsigned NumLoads = getEstimatedVLFor(VTy); used to estimate how many regular loads a strided access is equivalent to. But why is that true for all implementations? What if some implementation loads x elements every cycle, where x != 1?
@alexey-bataev Since it is pretty hard to make SLP generate a -1 stride, can I add a CLI option for testing purposes? This way we can test the -1 stride change and move on with the load widening.
I can look at the cost-model issue later.
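If a testing-only option is the route taken, a rough sketch of what it could look like follows; the flag name and description are made up for illustration and are not part of the patch.

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical, testing-only knob.
static cl::opt<bool> ForceStridedLoadsReverseOrder(
    "slp-force-strided-loads-reverse-order", cl::init(false), cl::Hidden,
    cl::desc("Force a reversed (-1 element) stride for strided loads; "
             "intended only for test coverage of the negative-stride path."));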
Ok
yeah, if an implementation can retire x elements per cycle for strided loads (x != 1), or has native segmented/strided hardware with better throughput, the TTI should reflect that on a per-subtarget basis.
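As a rough illustration of that point, and purely hypothetical rather than the existing RISC-V cost model, the estimate could be parameterized per subtarget like this:

#include <cstdint>

// Hypothetical sketch: if a subtarget can retire ElemsPerCycle elements of a
// strided access per cycle, the "equivalent scalar loads" estimate would be
// ceil(VL / ElemsPerCycle) rather than VL (which assumes one element per cycle).
unsigned estimatedStridedLoadOps(unsigned VL, unsigned ElemsPerCycle) {
  return (VL + ElemsPerCycle - 1) / ElemsPerCycle;
}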
> Better to remove this check and generate correct stride here

I don't think we can remove this check. What if some other code (in reorderTopToBottom(), for example) created a reverse order?
✅ With the latest revision this PR passed the C/C++ code formatter.