Skip to content

Commit 28e3bc6

Browse files
committed
[LV][TTI] Calculate cost of extracting last index in a scalable vector
There are a couple of places in the loop vectoriser where we want to calculate the cost of extracting the last lane in a vector. However, we wrongly assume that asking for the cost of extracting lane (VF.getKnownMinValue() - 1) is an accurate representation of the cost of extracting the last lane. For SVE at least, this is non-trivial as it requires the use of whilelo and lastb instructions. To solve this problem I have added a new getVectorInstrCostFromEnd interface where the index is counted in reverse from the end of the vector. Suppose a vector has a given ElementCount EC; the extracted/inserted lane is then EC - 1 - Index. For scalable vectors this lane is unknown at compile time. I've added an AArch64 hook that better represents the cost, and also a RISCV hook that maintains compatibility with the behaviour prior to this PR. I've also taken the liberty of adding support in VPlan for calculating the cost of VPInstruction::ExtractLastElement.
1 parent ff68f71 commit 28e3bc6

File tree

11 files changed

+102
-23
lines changed

11 files changed

+102
-23
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,6 +1512,14 @@ class TargetTransformInfo {
15121512
TTI::TargetCostKind CostKind,
15131513
unsigned Index = -1) const;
15141514

1515+
/// \return The expected cost of inserting or extracting a lane that is \p
1516+
/// Index from the end of a vector, i.e. the mathematical expression for
1517+
/// the lane is (VF - 1 - Index). This is required for scalable vectors where
1518+
/// the exact lane index is unknown at compile time.
1519+
LLVM_ABI InstructionCost
1520+
getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
1521+
TTI::TargetCostKind CostKind, unsigned Index) const;
1522+
15151523
/// \return The expected cost of aggregate inserts and extracts. This is
15161524
/// used when the instruction is not available; a typical use case is to
15171525
/// provision the cost of vectorization/scalarization in vectorizer passes.

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,13 @@ class TargetTransformInfoImplBase {
809809
return 1;
810810
}
811811

812+
/// Default cost of inserting/extracting the lane that sits \p Index lanes
/// before the end of \p Val. This base implementation ignores all of its
/// arguments and reports a unit cost; it is virtual so that other
/// implementations can override it.
virtual InstructionCost
getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                          TTI::TargetCostKind CostKind,
                          unsigned Index) const {
  return 1;
}
818+
812819
virtual InstructionCost
813820
getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
814821
const APInt &DemandedDstElts,

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1444,6 +1444,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
14441444
Op1);
14451445
}
14461446

1447+
/// Cost of inserting/extracting the lane that is \p Index lanes from the end
/// of \p Val, expressed in terms of getVectorInstrCost.
///
/// For a fixed-width vector the reverse index is converted to the concrete
/// lane (NumElements - 1 - Index). For any other type the lane stays at
/// -1 (as unsigned) and is forwarded unchanged.
InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                          TTI::TargetCostKind CostKind,
                                          unsigned Index) const override {
  // NOTE(review): -1 is presumably the "lane unknown" sentinel understood by
  // getVectorInstrCost - confirm against that interface.
  unsigned Lane = -1;
  if (auto *FixedTy = dyn_cast<FixedVectorType>(Val)) {
    unsigned NumElts = FixedTy->getNumElements();
    assert(Index < NumElts && "Unexpected index from end of vector");
    Lane = NumElts - 1 - Index;
  }
  return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Lane,
                                     /*Op0=*/nullptr, /*Op1=*/nullptr);
}
1459+
14471460
InstructionCost
14481461
getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
14491462
const APInt &DemandedDstElts,

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,6 +1130,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
11301130
return Cost;
11311131
}
11321132

1133+
// Forwards the "lane counted from the end of the vector" cost query to the
// underlying TTI implementation and checks that the reported cost is not
// negative.
InstructionCost
TargetTransformInfo::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index) const {
  InstructionCost FromEndCost =
      TTIImpl->getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
  assert(FromEndCost >= 0 && "TTI should not produce negative costs!");
  return FromEndCost;
}
1142+
11331143
InstructionCost TargetTransformInfo::getInsertExtractValueCost(
11341144
unsigned Opcode, TTI::TargetCostKind CostKind) const {
11351145
assert((Opcode == Instruction::InsertValue ||

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3986,6 +3986,23 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
39863986
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
39873987
}
39883988

3989+
// Cost of inserting/extracting the lane that is \p Index lanes from the end
// of \p Val. Fixed-width vectors are delegated to the BaseT implementation;
// every other vector type gets a flat cost that does not depend on \p Opcode
// or \p Index.
InstructionCost
AArch64TTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                          TTI::TargetCostKind CostKind,
                                          unsigned Index) const {
  // Only the type test is needed here, so use isa<> rather than binding an
  // otherwise unused dyn_cast<> result.
  if (isa<FixedVectorType>(Val))
    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);

  // This typically requires both while and lastb instructions in order
  // to extract the last element. If this is in a loop the while
  // instruction can at least be hoisted out, although it will consume a
  // predicate register. The cost should be more expensive than the base
  // extract cost, which is 2 for most CPUs.
  return CostKind == TTI::TCK_CodeSize
             ? 2
             : ST->getVectorInsertExtractBaseCost() + 1;
}
4005+
39894006
InstructionCost AArch64TTIImpl::getScalarizationOverhead(
39904007
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
39914008
TTI::TargetCostKind CostKind, bool ForPoisonSrc,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,10 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
220220
TTI::TargetCostKind CostKind,
221221
unsigned Index) const override;
222222

223+
InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
224+
TTI::TargetCostKind CostKind,
225+
unsigned Index) const override;
226+
223227
InstructionCost
224228
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
225229
TTI::TargetCostKind CostKind) const override;

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2415,6 +2415,23 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
24152415
return BaseCost + SlideCost;
24162416
}
24172417

2418+
// Cost of inserting/extracting the lane that is \p Index lanes from the end
// of \p Val. Fixed-width vectors are delegated to the BaseT implementation;
// for other vector types the lane is approximated from the known minimum
// element count and costed via getVectorInstrCost.
InstructionCost
RISCVTTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                        TTI::TargetCostKind CostKind,
                                        unsigned Index) const {
  // Only the type test is needed here, so use isa<> rather than binding an
  // otherwise unused dyn_cast<> result.
  if (isa<FixedVectorType>(Val))
    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);

  // TODO: This code replicates what LoopVectorize.cpp used to do when asking
  // for the cost of extracting the last lane of a scalable vector. It probably
  // needs a more accurate cost.
  ElementCount EC = cast<VectorType>(Val)->getElementCount();
  assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
  return getVectorInstrCost(Opcode, Val, CostKind,
                            EC.getKnownMinValue() - 1 - Index, nullptr,
                            nullptr);
}
2434+
24182435
InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
24192436
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
24202437
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
243243
unsigned Index, const Value *Op0,
244244
const Value *Op1) const override;
245245

246+
InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
247+
TTI::TargetCostKind CostKind,
248+
unsigned Index) const override;
249+
246250
InstructionCost getArithmeticInstrCost(
247251
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
248252
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5297,13 +5297,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
52975297
// VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
52985298
// the actual generated code, which involves extracting the last element of
52995299
// a scalable vector where the lane to extract is unknown at compile time.
5300-
return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5301-
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5302-
CostKind) +
5303-
(IsLoopInvariantStoreValue
5304-
? 0
5305-
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5306-
CostKind, VF.getKnownMinValue() - 1));
5300+
InstructionCost Cost =
5301+
TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5302+
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
5303+
if (!IsLoopInvariantStoreValue)
5304+
Cost += TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VectorTy,
5305+
CostKind, 0);
5306+
return Cost;
53075307
}
53085308

53095309
InstructionCost

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,6 +1011,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
10111011
I32Ty, {Arg0Ty, I32Ty, I1Ty});
10121012
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
10131013
}
1014+
case VPInstruction::ExtractLastElement: {
1015+
// Add on the cost of extracting the element.
1016+
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1017+
return Ctx.TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VecTy,
1018+
Ctx.CostKind, 0);
1019+
}
10141020
case VPInstruction::ExtractPenultimateElement:
10151021
if (VF == ElementCount::getScalable(1))
10161022
return InstructionCost::getInvalid();

0 commit comments

Comments
 (0)