diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9186419715cc4..1e03209e888bf 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1512,6 +1512,14 @@ class TargetTransformInfo {
                                            TTI::TargetCostKind CostKind,
                                            unsigned Index = -1) const;
 
+  /// \return The expected cost of inserting or extracting a lane that is \p
+  /// Index elements from the end of a vector, i.e. the mathematical expression
+  /// for the lane is (VF - 1 - Index). This is required for scalable vectors
+  /// where the exact lane index is unknown at compile time.
+  LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+      unsigned Index) const;
+
   /// \return The expected cost of aggregate inserts and extracts. This is
   /// used when the instruction is not available; a typical use case is to
   /// provision the cost of vectorization/scalarization in vectorizer passes.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 200cbafbaa6e2..252acf381a8e1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -809,6 +809,13 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  virtual InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const {
+    return 1;
+  }
+
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index aa9d1f0a1ccea..27320b510b950 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1444,6 +1444,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        Op1);
   }
 
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override {
+    unsigned NewIndex = -1;
+    if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
+      assert(Index < FVTy->getNumElements() &&
+             "Unexpected index from end of vector");
+      NewIndex = FVTy->getNumElements() - 1 - Index;
+    }
+    return thisT()->getVectorInstrCost(Opcode, Val, CostKind, NewIndex,
+                                       nullptr, nullptr);
+  }
+
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3141060a710ce..323ab8b1ddad1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1130,6 +1130,15 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getIndexedVectorInstrCostFromEnd(
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+    unsigned Index) const {
+  InstructionCost Cost =
+      TTIImpl->getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getInsertExtractValueCost(
     unsigned Opcode, TTI::TargetCostKind CostKind) const {
   assert((Opcode == Instruction::InsertValue ||
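The base implementation above only folds the end-relative index into an absolute lane for fixed-width vectors (NewIndex = NumElements - 1 - Index); for scalable vectors it forwards -1, i.e. "unknown lane", to getVectorInstrCost. Below is a minimal caller-side sketch of the new hook; the function name queryExtractCosts, the element type, and the TCK_RecipThroughput cost kind are illustrative assumptions, not part of the patch:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    // For <4 x double> with Index = 1, the base class rewrites the query to
    // absolute lane 4 - 1 - 1 = 2 before calling getVectorInstrCost. For
    // <vscale x 2 x double> the lane is unknown at compile time, so the
    // target override (or the unknown-index fallback) decides the cost.
    static void queryExtractCosts(const TargetTransformInfo &TTI,
                                  LLVMContext &Ctx) {
      Type *F64 = Type::getDoubleTy(Ctx);
      auto *FixedTy = VectorType::get(F64, ElementCount::getFixed(4));
      auto *ScalableTy = VectorType::get(F64, ElementCount::getScalable(2));
      InstructionCost FixedCost = TTI.getIndexedVectorInstrCostFromEnd(
          Instruction::ExtractElement, FixedTy,
          TargetTransformInfo::TCK_RecipThroughput, /*Index=*/1);
      InstructionCost ScalableCost = TTI.getIndexedVectorInstrCostFromEnd(
          Instruction::ExtractElement, ScalableTy,
          TargetTransformInfo::TCK_RecipThroughput, /*Index=*/0);
      (void)FixedCost;
      (void)ScalableCost;
    }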
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fc332d5320181..cd3b85dd52173 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3986,6 +3986,24 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
   return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
 }
 
+InstructionCost
+AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                                 TTI::TargetCostKind CostKind,
+                                                 unsigned Index) const {
+  if (isa<FixedVectorType>(Val))
+    return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+                                                   Index);
+
+  // This typically requires both while and lastb instructions in order
+  // to extract the last element. If this is in a loop, the while
+  // instruction can at least be hoisted out, although it will consume a
+  // predicate register. The cost should be higher than the base
+  // extract cost, which is 2 for most CPUs.
+  return CostKind == TTI::TCK_CodeSize
+             ? 2
+             : ST->getVectorInsertExtractBaseCost() + 1;
+}
+
 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
     TTI::TargetCostKind CostKind, bool ForPoisonSrc,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9c96fdd427814..42ae962b3b426 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,6 +220,11 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) const override;
 
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override;
+
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                          FastMathFlags FMF,
                                          TTI::TargetCostKind CostKind) const override;
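The AArch64 override prices a scalable extract-from-end above a plain extract because SVE needs a predicate-generating while instruction plus lastb. A sketch that mirrors the override's arithmetic, under the stated assumption that getVectorInsertExtractBaseCost() returns its common value of 2; the helper name and default argument are illustrative only:

    #include "llvm/Analysis/TargetTransformInfo.h"
    using namespace llvm;

    // Illustration only: reproduces the constants chosen by the override.
    // TCK_CodeSize counts the two instructions (while + lastb); the other
    // cost kinds charge one more than the base insert/extract cost, i.e.
    // 2 + 1 = 3 under the stated assumption.
    InstructionCost expectedSVEExtractFromEndCost(
        TargetTransformInfo::TargetCostKind Kind, unsigned BaseCost = 2) {
      return Kind == TargetTransformInfo::TCK_CodeSize
                 ? 2
                 : InstructionCost(BaseCost + 1);
    }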
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 85b3059d87da7..c707fb110b10c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2415,6 +2415,24 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return BaseCost + SlideCost;
 }
 
+InstructionCost
+RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
+                                               unsigned Index) const {
+  if (isa<FixedVectorType>(Val))
+    return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
+                                                   Index);
+
+  // TODO: This code replicates what LoopVectorize.cpp used to do when asking
+  // for the cost of extracting the last lane of a scalable vector. It
+  // probably needs a more accurate cost.
+  ElementCount EC = cast<VectorType>(Val)->getElementCount();
+  assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
+  return getVectorInstrCost(Opcode, Val, CostKind,
+                            EC.getKnownMinValue() - 1 - Index, nullptr,
+                            nullptr);
+}
+
 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6a1f4b3e3bedf..b632f25b963f7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -243,6 +243,11 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
                                      unsigned Index, const Value *Op0,
                                      const Value *Op1) const override;
 
+  InstructionCost
+  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                   TTI::TargetCostKind CostKind,
+                                   unsigned Index) const override;
+
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 675a230bd2c94..4f795b4d0fb9e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5297,13 +5297,13 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
   // the actual generated code, which involves extracting the last element of
   // a scalable vector where the lane to extract is unknown at compile time.
-  return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
-                             CostKind) +
-         (IsLoopInvariantStoreValue
-              ? 0
-              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       CostKind, VF.getKnownMinValue() - 1));
+  InstructionCost Cost =
+      TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
+      TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
+  if (!IsLoopInvariantStoreValue)
+    Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
+                                                 VectorTy, CostKind, 0);
+  return Cost;
 }
 
 InstructionCost
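The LoopVectorize change replaces a query that pretended the last lane was the compile-time constant VF.getKnownMinValue() - 1 with the end-relative hook, matching the code actually generated for scalable vectors. A hedged before/after sketch; VectorTy, CostKind, and VF stand in for the locals of getUniformMemOpCost, and the wrapper function itself is an assumption:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    InstructionCost lastLaneExtractCost(
        const TargetTransformInfo &TTI, VectorType *VectorTy,
        TargetTransformInfo::TargetCostKind CostKind, ElementCount VF) {
      // Before: a concrete lane index, misleading for scalable VFs where the
      // real last lane is vscale * KnownMin - 1, unknown at compile time.
      InstructionCost Old = TTI.getVectorInstrCost(
          Instruction::ExtractElement, VectorTy, CostKind,
          VF.getKnownMinValue() - 1);
      (void)Old;
      // After: ask for "0 elements from the end" and let the target price it;
      // AArch64 charges for the while/lastb pair, RISC-V maps it back to a
      // concrete-index query for now (see the TODO above).
      return TTI.getIndexedVectorInstrCostFromEnd(
          Instruction::ExtractElement, VectorTy, CostKind, /*Index=*/0);
    }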
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fa62547d374cd..2a5d14cb6fa09 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1011,6 +1011,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::ExtractLastElement: {
+    // Add on the cost of extracting the element.
+    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getIndexedVectorInstrCostFromEnd(
+        Instruction::ExtractElement, VecTy, Ctx.CostKind, 0);
+  }
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 58f2af73bd04c..11bb4d234f3f3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -903,30 +903,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
 ; TFNONE-NEXT:  [[ENTRY:.*]]:
 ; TFNONE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; TFNONE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
-; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
 ; TFNONE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
-; TFNONE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
-; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; TFNONE-NEXT:    [[TMP7:%.*]] = load double, ptr [[P2]], align 8
-; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
-; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; TFNONE-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
-; TFNONE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
-; TFNONE-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT:    [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT:    [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
 ; TFNONE-NEXT:    store double [[TMP14]], ptr [[P]], align 8
-; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; TFNONE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TFNONE:       [[MIDDLE_BLOCK]]:
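The masked-call.ll churn shows the knock-on effect of the new hook: once the last-lane extract of a scalable vector is priced realistically on AArch64, the vectorizer prefers the fixed VF of 2 over vscale x 2 for this loop, so the vscale computations and the variable-lane extractelement disappear from the TFNONE output. Note the VPlan change only wires up ExtractLastElement (end-relative index 0); ExtractPenultimateElement keeps its existing path. A hypothetical sketch (not part of the patch) of how the same hook would generalise to both opcodes:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // Hypothetical mapping: ExtractLastElement        -> IndexFromEnd 0
    //                       ExtractPenultimateElement -> IndexFromEnd 1
    // i.e. lanes VF - 1 and VF - 2 respectively.
    InstructionCost extractFromEndCost(const TargetTransformInfo &TTI,
                                       VectorType *VecTy,
                                       TargetTransformInfo::TargetCostKind Kind,
                                       unsigned IndexFromEnd) {
      return TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
                                                  VecTy, Kind, IndexFromEnd);
    }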