Skip to content

Commit 222f4b1

Browse files
committed
[LV][TTI] Calculate cost of extracting last index in a scalable vector
There are a couple of places in the loop vectoriser where we want to calculate the cost of extracting the last lane in a vector. However, we wrongly assume that asking for the cost of extracting lane (VF.getKnownMinValue() - 1) is an accurate representation of the cost of extracting the last lane. For scalable vectors this is not the case, because the index of the last lane is unknown at compile time; for SVE at least, the extraction is non-trivial as it requires the use of whilelo and lastb instructions. This patch adds support for querying the cost of extracting the last lane by passing a new negative index value, distinct from -1, to getVectorInstrCost: an index of -1 (UnknownIndex) means completely unknown, whereas -2 (LastIndex) means the last element. I've also taken the liberty of adding support in VPlan for calculating the cost of VPInstruction::ExtractLastElement, as I happened to spot the opcode after a rebase.
1 parent 863c81e commit 222f4b1

File tree

6 files changed

+59
-43
lines changed

6 files changed

+59
-43
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1467,7 +1467,7 @@ class TargetTransformInfo {
14671467

14681468
enum : int {
14691469
UnknownIndex = -1,
1470-
// This will be expanded in a future patch.
1470+
LastIndex = -2,
14711471
};
14721472

14731473
static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3716,6 +3716,18 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
37163716
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
37173717
assert(Val->isVectorTy() && "This must be a vector type");
37183718

3719+
if (Index == TargetTransformInfo::LastIndex) {
3720+
if (isa<ScalableVectorType>(Val)) {
3721+
// This typically requires both while and lastb instructions in order
3722+
// to extract the last element. If this is in a loop the while
3723+
// instruction can at least be hoisted out, although it will consume a
3724+
// predicate register. The cost should be more expensive than the base
3725+
// extract cost, which is 2 for most CPUs.
3726+
return CostKind == TTI::TCK_CodeSize ? 2 : 3;
3727+
}
3728+
Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
3729+
}
3730+
37193731
if (TargetTransformInfo::isKnownVectorIndex(Index)) {
37203732
// Legalize the type.
37213733
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5342,17 +5342,16 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
53425342
StoreInst *SI = cast<StoreInst>(I);
53435343

53445344
bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5345-
// TODO: We have existing tests that request the cost of extracting element
5346-
// VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5347-
// the actual generated code, which involves extracting the last element of
5348-
// a scalable vector where the lane to extract is unknown at compile time.
5349-
return TTI.getAddressComputationCost(ValTy) +
5350-
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5351-
CostKind) +
5352-
(IsLoopInvariantStoreValue
5353-
? 0
5354-
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5355-
CostKind, VF.getKnownMinValue() - 1));
5345+
InstructionCost Cost =
5346+
TTI.getAddressComputationCost(ValTy) +
5347+
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) +
5348+
(IsLoopInvariantStoreValue
5349+
? 0
5350+
: TTI.getVectorInstrCost(
5351+
Instruction::ExtractElement, VectorTy, CostKind,
5352+
VF.isScalable() ? TargetTransformInfo::LastIndex
5353+
: VF.getKnownMinValue() - 1));
5354+
return Cost;
53565355
}
53575356

53585357
InstructionCost

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
791791
}
792792

793793
switch (getOpcode()) {
794+
case VPInstruction::ExtractLastElement: {
795+
// Add on the cost of extracting the element.
796+
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
797+
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
798+
Ctx.CostKind,
799+
TargetTransformInfo::LastIndex);
800+
}
794801
case Instruction::ExtractElement: {
795802
// Add on the cost of extracting the element.
796803
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -917,32 +917,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
917917
; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
918918
; TFNONE-NEXT: [[ENTRY:.*]]:
919919
; TFNONE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
920-
; TFNONE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
921-
; TFNONE-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
922-
; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
920+
; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
923921
; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
924922
; TFNONE: [[VECTOR_PH]]:
925-
; TFNONE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
926-
; TFNONE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
927-
; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
923+
; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
928924
; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
929-
; TFNONE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
930-
; TFNONE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
931925
; TFNONE-NEXT: br label %[[VECTOR_BODY:.*]]
932926
; TFNONE: [[VECTOR_BODY]]:
933927
; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
934928
; TFNONE-NEXT: [[TMP7:%.*]] = load double, ptr [[P2]], align 8
935-
; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
936-
; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
937-
; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
938-
; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
939-
; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
940-
; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
941-
; TFNONE-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 2
942-
; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
943-
; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
929+
; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
930+
; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
931+
; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
932+
; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
933+
; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
934+
; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
944935
; TFNONE-NEXT: store double [[TMP14]], ptr [[P]], align 8
945-
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
936+
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
946937
; TFNONE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
947938
; TFNONE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
948939
; TFNONE: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
88
; CHECK-LABEL: define void @vf_will_not_generate_any_vector_insts(
99
; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
1010
; CHECK-NEXT: [[ENTRY:.*]]:
11-
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
11+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
12+
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP0]])
13+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
14+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
1215
; CHECK: [[VECTOR_MEMCHECK]]:
1316
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
1417
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
@@ -17,23 +20,27 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
1720
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1821
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
1922
; CHECK: [[VECTOR_PH]]:
20-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
21-
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer
23+
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
24+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
25+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
26+
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
27+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[DST]], i64 0
28+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
2229
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
2330
; CHECK: [[VECTOR_BODY]]:
2431
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
25-
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
26-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
27-
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
28-
; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
29-
; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3]], !noalias [[META0]]
30-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
31-
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
32-
; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
32+
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
33+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP4]], i64 0
34+
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
35+
; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> [[BROADCAST_SPLAT3]], <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 1 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
36+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
37+
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
38+
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
3339
; CHECK: [[MIDDLE_BLOCK]]:
34-
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
40+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
41+
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
3542
; CHECK: [[SCALAR_PH]]:
36-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
43+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
3744
; CHECK-NEXT: br label %[[LOOP:.*]]
3845
; CHECK: [[LOOP]]:
3946
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ]

0 commit comments

Comments
 (0)