-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LV][TTI] Calculate cost of extracting last index in a scalable vector #144086
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-arm — Author: David Sherwood (david-arm). Changes: There are a couple of places in the loop vectoriser where we […] This patch adds support for querying the cost of extracting […] I've also taken the liberty of adding support in vplan for […] Patch is 53.13 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/144086.diff — 28 files affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..eb0d17bdd44b4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1443,7 +1443,7 @@ class TargetTransformInfo {
/// Index = -1 to indicate that there is no information about the index value.
LLVM_ABI InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const;
+ int Index, TTI::TargetCostKind CostKind) const;
/// \return The expected cost of control-flow related instructions such as
/// Phi, Ret, Br, Switch.
@@ -1465,6 +1465,13 @@ class TargetTransformInfo {
OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
const Instruction *I = nullptr) const;
+ enum : int {
+ UnknownIndex = -1,
+ LastIndex = -2,
+ };
+
+ static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
+
/// \return The expected cost of vector Insert and Extract.
/// Use -1 to indicate that there is no information on the index value.
/// This is used when the instruction is not available; a typical use
@@ -1472,7 +1479,7 @@ class TargetTransformInfo {
/// vectorizer passes.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1,
+ int Index = UnknownIndex,
const Value *Op0 = nullptr,
const Value *Op1 = nullptr) const;
@@ -1486,7 +1493,7 @@ class TargetTransformInfo {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
LLVM_ABI InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
@@ -1498,7 +1505,7 @@ class TargetTransformInfo {
/// exists (e.g., from basic blocks during transformation).
LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1) const;
+ int Index = UnknownIndex) const;
/// \return The expected cost of aggregate inserts and extracts. This is
/// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..e8037a2e208ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -758,7 +758,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const {
+ int Index, TTI::TargetCostKind CostKind) const {
return 1;
}
@@ -781,7 +781,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
return 1;
}
@@ -791,7 +791,7 @@ class TargetTransformInfoImplBase {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
virtual InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return 1;
@@ -799,7 +799,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return 1;
}
@@ -1522,7 +1522,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *IE = dyn_cast<InsertElementInst>(U);
if (!IE)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[2]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
@@ -1641,7 +1641,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[1]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..e9f2698ccbf8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1341,7 +1341,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override {
return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, Index, nullptr, nullptr) +
@@ -1409,8 +1409,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override {
return getRegUsageForType(Val->getScalarType());
}
@@ -1420,8 +1420,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override {
return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
@@ -1430,7 +1430,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override {
+ int Index) const override {
Value *Op0 = nullptr;
Value *Op1 = nullptr;
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa9..86846009fa60a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1058,7 +1058,7 @@ InstructionCost TargetTransformInfo::getCastInstrCost(
}
InstructionCost TargetTransformInfo::getExtractWithExtendCost(
- unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index,
+ unsigned Opcode, Type *Dst, VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
InstructionCost Cost =
TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
@@ -1088,7 +1088,7 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
assert((Opcode == Instruction::InsertElement ||
Opcode == Instruction::ExtractElement) &&
@@ -1100,7 +1100,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert((Opcode == Instruction::InsertElement ||
@@ -1115,7 +1115,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..94711dd63d0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3642,7 +3642,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost
AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
- VectorType *VecTy, unsigned Index,
+ VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
// Make sure we were given a valid extend opcode.
@@ -3711,12 +3711,24 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
}
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
- if (Index != -1U) {
+ if (Index == TargetTransformInfo::LastIndex) {
+ if (isa<ScalableVectorType>(Val)) {
+ // This typically requires both while and lastb instructions in order
+ // to extract the last element. If this is in a loop the while
+ // instruction can at least be hoisted out, although it will consume a
+ // predicate register. The cost should be more expensive than the base
+ // extract cost, which is 2 for most CPUs.
+ return CostKind == TTI::TCK_CodeSize ? 2 : 3;
+ }
+ Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
+ }
+
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3884,8 +3896,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
@@ -3893,7 +3904,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
@@ -3903,7 +3914,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
true /* HasRealUse */, &I);
}
@@ -4052,10 +4063,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// loading the vector from constant pool or in some cases, may also result
// in scalarization. For now, we are approximating this with the
// scalarization cost.
- auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
- CostKind, -1, nullptr, nullptr);
- auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
- CostKind, -1, nullptr, nullptr);
+ auto ExtractCost =
+ 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ auto InsertCost = getVectorInstrCost(
+ Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr, nullptr);
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
return ExtractCost + InsertCost +
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
@@ -4153,9 +4167,11 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// On AArch64, without SVE, vector divisions are expanded
// into scalar divisions of each pair of elements.
Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
- -1, nullptr, nullptr);
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr);
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
}
// TODO: if one of the arguments is scalar, then it's not necessary to
@@ -4186,11 +4202,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
- getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
- nullptr, nullptr) *
+ getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr) *
2 +
- getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr));
+ getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr));
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..96dc151eec783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,7 +73,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
@@ -197,15 +197,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override;
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
/// \param ScalarUserAndIdx encodes the information about extracts from a
@@ -213,14 +213,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override;
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override;
+ int Index) const override;
InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..3eb0b02f47d32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -834,7 +834,7 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI:...
[truncated]
|
|
@llvm/pr-subscribers-backend-hexagon — Author: David Sherwood (david-arm). Changes: There are a couple of places in the loop vectoriser where we […] This patch adds support for querying the cost of extracting […] I've also taken the liberty of adding support in vplan for […] Patch is 53.13 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/144086.diff — 28 files affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..eb0d17bdd44b4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1443,7 +1443,7 @@ class TargetTransformInfo {
/// Index = -1 to indicate that there is no information about the index value.
LLVM_ABI InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const;
+ int Index, TTI::TargetCostKind CostKind) const;
/// \return The expected cost of control-flow related instructions such as
/// Phi, Ret, Br, Switch.
@@ -1465,6 +1465,13 @@ class TargetTransformInfo {
OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
const Instruction *I = nullptr) const;
+ enum : int {
+ UnknownIndex = -1,
+ LastIndex = -2,
+ };
+
+ static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
+
/// \return The expected cost of vector Insert and Extract.
/// Use -1 to indicate that there is no information on the index value.
/// This is used when the instruction is not available; a typical use
@@ -1472,7 +1479,7 @@ class TargetTransformInfo {
/// vectorizer passes.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1,
+ int Index = UnknownIndex,
const Value *Op0 = nullptr,
const Value *Op1 = nullptr) const;
@@ -1486,7 +1493,7 @@ class TargetTransformInfo {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
LLVM_ABI InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
@@ -1498,7 +1505,7 @@ class TargetTransformInfo {
/// exists (e.g., from basic blocks during transformation).
LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1) const;
+ int Index = UnknownIndex) const;
/// \return The expected cost of aggregate inserts and extracts. This is
/// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..e8037a2e208ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -758,7 +758,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const {
+ int Index, TTI::TargetCostKind CostKind) const {
return 1;
}
@@ -781,7 +781,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
return 1;
}
@@ -791,7 +791,7 @@ class TargetTransformInfoImplBase {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
virtual InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return 1;
@@ -799,7 +799,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return 1;
}
@@ -1522,7 +1522,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *IE = dyn_cast<InsertElementInst>(U);
if (!IE)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[2]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
@@ -1641,7 +1641,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[1]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..e9f2698ccbf8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1341,7 +1341,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override {
return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, Index, nullptr, nullptr) +
@@ -1409,8 +1409,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override {
return getRegUsageForType(Val->getScalarType());
}
@@ -1420,8 +1420,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override {
return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
@@ -1430,7 +1430,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override {
+ int Index) const override {
Value *Op0 = nullptr;
Value *Op1 = nullptr;
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa9..86846009fa60a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1058,7 +1058,7 @@ InstructionCost TargetTransformInfo::getCastInstrCost(
}
InstructionCost TargetTransformInfo::getExtractWithExtendCost(
- unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index,
+ unsigned Opcode, Type *Dst, VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
InstructionCost Cost =
TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
@@ -1088,7 +1088,7 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
assert((Opcode == Instruction::InsertElement ||
Opcode == Instruction::ExtractElement) &&
@@ -1100,7 +1100,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert((Opcode == Instruction::InsertElement ||
@@ -1115,7 +1115,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..94711dd63d0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3642,7 +3642,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost
AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
- VectorType *VecTy, unsigned Index,
+ VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
// Make sure we were given a valid extend opcode.
@@ -3711,12 +3711,24 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
}
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
- if (Index != -1U) {
+ if (Index == TargetTransformInfo::LastIndex) {
+ if (isa<ScalableVectorType>(Val)) {
+ // This typically requires both while and lastb instructions in order
+ // to extract the last element. If this is in a loop the while
+ // instruction can at least be hoisted out, although it will consume a
+ // predicate register. The cost should be more expensive than the base
+ // extract cost, which is 2 for most CPUs.
+ return CostKind == TTI::TCK_CodeSize ? 2 : 3;
+ }
+ Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
+ }
+
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3884,8 +3896,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
@@ -3893,7 +3904,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
@@ -3903,7 +3914,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
true /* HasRealUse */, &I);
}
@@ -4052,10 +4063,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// loading the vector from constant pool or in some cases, may also result
// in scalarization. For now, we are approximating this with the
// scalarization cost.
- auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
- CostKind, -1, nullptr, nullptr);
- auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
- CostKind, -1, nullptr, nullptr);
+ auto ExtractCost =
+ 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ auto InsertCost = getVectorInstrCost(
+ Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr, nullptr);
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
return ExtractCost + InsertCost +
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
@@ -4153,9 +4167,11 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// On AArch64, without SVE, vector divisions are expanded
// into scalar divisions of each pair of elements.
Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
- -1, nullptr, nullptr);
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr);
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
}
// TODO: if one of the arguments is scalar, then it's not necessary to
@@ -4186,11 +4202,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
- getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
- nullptr, nullptr) *
+ getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr) *
2 +
- getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr));
+ getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr));
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..96dc151eec783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,7 +73,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
@@ -197,15 +197,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override;
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
/// \param ScalarUserAndIdx encodes the information about extracts from a
@@ -213,14 +213,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override;
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override;
+ int Index) const override;
InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..3eb0b02f47d32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -834,7 +834,7 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI:...
[truncated]
|
|
@llvm/pr-subscribers-backend-amdgpu

Author: David Sherwood (david-arm)

Changes

There are a couple of places in the loop vectoriser where we …

This patch adds support for querying the cost of extracting …

I've also taken the liberty of adding support in vplan for …

Patch is 53.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144086.diff

28 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..eb0d17bdd44b4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1443,7 +1443,7 @@ class TargetTransformInfo {
/// Index = -1 to indicate that there is no information about the index value.
LLVM_ABI InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const;
+ int Index, TTI::TargetCostKind CostKind) const;
/// \return The expected cost of control-flow related instructions such as
/// Phi, Ret, Br, Switch.
@@ -1465,6 +1465,13 @@ class TargetTransformInfo {
OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
const Instruction *I = nullptr) const;
+ enum : int {
+ UnknownIndex = -1,
+ LastIndex = -2,
+ };
+
+ static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
+
/// \return The expected cost of vector Insert and Extract.
/// Use -1 to indicate that there is no information on the index value.
/// This is used when the instruction is not available; a typical use
@@ -1472,7 +1479,7 @@ class TargetTransformInfo {
/// vectorizer passes.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1,
+ int Index = UnknownIndex,
const Value *Op0 = nullptr,
const Value *Op1 = nullptr) const;
@@ -1486,7 +1493,7 @@ class TargetTransformInfo {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
LLVM_ABI InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
@@ -1498,7 +1505,7 @@ class TargetTransformInfo {
/// exists (e.g., from basic blocks during transformation).
LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1) const;
+ int Index = UnknownIndex) const;
/// \return The expected cost of aggregate inserts and extracts. This is
/// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..e8037a2e208ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -758,7 +758,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const {
+ int Index, TTI::TargetCostKind CostKind) const {
return 1;
}
@@ -781,7 +781,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
return 1;
}
@@ -791,7 +791,7 @@ class TargetTransformInfoImplBase {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
virtual InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return 1;
@@ -799,7 +799,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return 1;
}
@@ -1522,7 +1522,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *IE = dyn_cast<InsertElementInst>(U);
if (!IE)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[2]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
@@ -1641,7 +1641,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[1]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..e9f2698ccbf8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1341,7 +1341,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override {
return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, Index, nullptr, nullptr) +
@@ -1409,8 +1409,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override {
return getRegUsageForType(Val->getScalarType());
}
@@ -1420,8 +1420,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override {
return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
@@ -1430,7 +1430,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override {
+ int Index) const override {
Value *Op0 = nullptr;
Value *Op1 = nullptr;
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa9..86846009fa60a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1058,7 +1058,7 @@ InstructionCost TargetTransformInfo::getCastInstrCost(
}
InstructionCost TargetTransformInfo::getExtractWithExtendCost(
- unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index,
+ unsigned Opcode, Type *Dst, VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
InstructionCost Cost =
TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
@@ -1088,7 +1088,7 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
assert((Opcode == Instruction::InsertElement ||
Opcode == Instruction::ExtractElement) &&
@@ -1100,7 +1100,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert((Opcode == Instruction::InsertElement ||
@@ -1115,7 +1115,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..94711dd63d0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3642,7 +3642,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost
AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
- VectorType *VecTy, unsigned Index,
+ VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
// Make sure we were given a valid extend opcode.
@@ -3711,12 +3711,24 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
}
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
- if (Index != -1U) {
+ if (Index == TargetTransformInfo::LastIndex) {
+ if (isa<ScalableVectorType>(Val)) {
+ // This typically requires both while and lastb instructions in order
+ // to extract the last element. If this is in a loop the while
+ // instruction can at least be hoisted out, although it will consume a
+ // predicate register. The cost should be more expensive than the base
+ // extract cost, which is 2 for most CPUs.
+ return CostKind == TTI::TCK_CodeSize ? 2 : 3;
+ }
+ Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
+ }
+
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3884,8 +3896,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
@@ -3893,7 +3904,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
@@ -3903,7 +3914,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
true /* HasRealUse */, &I);
}
@@ -4052,10 +4063,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// loading the vector from constant pool or in some cases, may also result
// in scalarization. For now, we are approximating this with the
// scalarization cost.
- auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
- CostKind, -1, nullptr, nullptr);
- auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
- CostKind, -1, nullptr, nullptr);
+ auto ExtractCost =
+ 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ auto InsertCost = getVectorInstrCost(
+ Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr, nullptr);
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
return ExtractCost + InsertCost +
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
@@ -4153,9 +4167,11 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// On AArch64, without SVE, vector divisions are expanded
// into scalar divisions of each pair of elements.
Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
- -1, nullptr, nullptr);
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr);
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
}
// TODO: if one of the arguments is scalar, then it's not necessary to
@@ -4186,11 +4202,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
- getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
- nullptr, nullptr) *
+ getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr) *
2 +
- getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr));
+ getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr));
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..96dc151eec783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,7 +73,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
@@ -197,15 +197,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override;
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
/// \param ScalarUserAndIdx encodes the information about extracts from a
@@ -213,14 +213,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override;
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override;
+ int Index) const override;
InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..3eb0b02f47d32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -834,7 +834,7 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI:...
[truncated]
|
|
@llvm/pr-subscribers-llvm-transforms

Author: David Sherwood (david-arm)

Changes

There are a couple of places in the loop vectoriser where we …

This patch adds support for querying the cost of extracting …

I've also taken the liberty of adding support in vplan for …

Patch is 53.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144086.diff

28 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..eb0d17bdd44b4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1443,7 +1443,7 @@ class TargetTransformInfo {
/// Index = -1 to indicate that there is no information about the index value.
LLVM_ABI InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const;
+ int Index, TTI::TargetCostKind CostKind) const;
/// \return The expected cost of control-flow related instructions such as
/// Phi, Ret, Br, Switch.
@@ -1465,6 +1465,13 @@ class TargetTransformInfo {
OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
const Instruction *I = nullptr) const;
+ enum : int {
+ UnknownIndex = -1,
+ LastIndex = -2,
+ };
+
+ static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
+
/// \return The expected cost of vector Insert and Extract.
/// Use -1 to indicate that there is no information on the index value.
/// This is used when the instruction is not available; a typical use
@@ -1472,7 +1479,7 @@ class TargetTransformInfo {
/// vectorizer passes.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1,
+ int Index = UnknownIndex,
const Value *Op0 = nullptr,
const Value *Op1 = nullptr) const;
@@ -1486,7 +1493,7 @@ class TargetTransformInfo {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
LLVM_ABI InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
@@ -1498,7 +1505,7 @@ class TargetTransformInfo {
/// exists (e.g., from basic blocks during transformation).
LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1) const;
+ int Index = UnknownIndex) const;
/// \return The expected cost of aggregate inserts and extracts. This is
/// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..e8037a2e208ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -758,7 +758,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const {
+ int Index, TTI::TargetCostKind CostKind) const {
return 1;
}
@@ -781,7 +781,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
return 1;
}
@@ -791,7 +791,7 @@ class TargetTransformInfoImplBase {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
virtual InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return 1;
@@ -799,7 +799,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return 1;
}
@@ -1522,7 +1522,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *IE = dyn_cast<InsertElementInst>(U);
if (!IE)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[2]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
@@ -1641,7 +1641,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[1]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..e9f2698ccbf8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1341,7 +1341,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override {
return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, Index, nullptr, nullptr) +
@@ -1409,8 +1409,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override {
return getRegUsageForType(Val->getScalarType());
}
@@ -1420,8 +1420,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override {
return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
@@ -1430,7 +1430,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override {
+ int Index) const override {
Value *Op0 = nullptr;
Value *Op1 = nullptr;
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa9..86846009fa60a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1058,7 +1058,7 @@ InstructionCost TargetTransformInfo::getCastInstrCost(
}
InstructionCost TargetTransformInfo::getExtractWithExtendCost(
- unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index,
+ unsigned Opcode, Type *Dst, VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
InstructionCost Cost =
TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
@@ -1088,7 +1088,7 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
assert((Opcode == Instruction::InsertElement ||
Opcode == Instruction::ExtractElement) &&
@@ -1100,7 +1100,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert((Opcode == Instruction::InsertElement ||
@@ -1115,7 +1115,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..94711dd63d0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3642,7 +3642,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost
AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
- VectorType *VecTy, unsigned Index,
+ VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
// Make sure we were given a valid extend opcode.
@@ -3711,12 +3711,24 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
}
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
- if (Index != -1U) {
+ if (Index == TargetTransformInfo::LastIndex) {
+ if (isa<ScalableVectorType>(Val)) {
+ // This typically requires both while and lastb instructions in order
+ // to extract the last element. If this is in a loop the while
+ // instruction can at least be hoisted out, although it will consume a
+ // predicate register. The cost should be more expensive than the base
+ // extract cost, which is 2 for most CPUs.
+ return CostKind == TTI::TCK_CodeSize ? 2 : 3;
+ }
+ Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
+ }
+
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3884,8 +3896,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
@@ -3893,7 +3904,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
@@ -3903,7 +3914,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
true /* HasRealUse */, &I);
}
@@ -4052,10 +4063,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// loading the vector from constant pool or in some cases, may also result
// in scalarization. For now, we are approximating this with the
// scalarization cost.
- auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
- CostKind, -1, nullptr, nullptr);
- auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
- CostKind, -1, nullptr, nullptr);
+ auto ExtractCost =
+ 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ auto InsertCost = getVectorInstrCost(
+ Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr, nullptr);
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
return ExtractCost + InsertCost +
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
@@ -4153,9 +4167,11 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// On AArch64, without SVE, vector divisions are expanded
// into scalar divisions of each pair of elements.
Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
- -1, nullptr, nullptr);
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr);
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
}
// TODO: if one of the arguments is scalar, then it's not necessary to
@@ -4186,11 +4202,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
- getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
- nullptr, nullptr) *
+ getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr) *
2 +
- getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr));
+ getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr));
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..96dc151eec783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,7 +73,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
@@ -197,15 +197,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override;
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
/// \param ScalarUserAndIdx encodes the information about extracts from a
@@ -213,14 +213,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override;
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override;
+ int Index) const override;
InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..3eb0b02f47d32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -834,7 +834,7 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI:...
[truncated]
|
|
This PR follows on from #143526, where I discovered an issue with the vectoriser cost model regarding the cost of last lane extraction. |
|
This PR contains both an NFC patch to change the type of the index, and the follow-on patch to add support for calculating the cost of extracting the last index. I'm happy to split these into separate PRs if reviewers think that's better. |
| : TTI.getVectorInstrCost( | ||
| Instruction::ExtractElement, VectorTy, CostKind, | ||
| VF.isScalable() ? TargetTransformInfo::LastIndex | ||
| : VF.getKnownMinValue() - 1)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should now work?
| : VF.getKnownMinValue() - 1)); | |
| : VF.getFixedValue() - 1)); |
| /// 'Idx' being the extract lane. | ||
| LLVM_ABI InstructionCost getVectorInstrCost( | ||
| unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, | ||
| unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to update the TTI hooks to check if LastIndex has been passed, to avoid unexpected results now that we are calling it with a potentially unexpected value -2?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have done that implicitly by changing all the TTI hooks to call isKnownVectorIndex when testing if the index is known or not. Any code that previously compared against -1 now calls isKnownVectorIndex, which returns false whether the index is -1 or -2 so as far as I can tell the change is NFC for all other targets.
There may be places where target hooks were never checking for -1 and always assuming a known index - I figured in those cases my patch doesn't make things any more broken than they were already.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes I see. For those places, could we assert that we either have UnknownIndex or a known index, i.e. no LastIndex, except for RISCV, which also supports scalable vectors and would be queried with LastIndex?
|
Would it not make sense to change the interface to take an ElementCount? I believe that's the correct type for representing an arbitrary vector index.
We can still canonicalise Element::getFixed(-1) to mean unknown as it does today, although even then we should probably use an optional. Update: I now realise the above is somewhat bogus because |
|
Given my faux pas, as a counter-recommendation, what about splitting the index in two? Pass the index as an ElementCount and have another parameter to set the direction. This has the advantages I mention above and for the last index you'd use an index of 0 with a direction parameter of back/tail/reverse etc. |
| // instruction can at least be hoisted out, although it will consume a | ||
| // predicate register. The cost should be more expensive than the base | ||
| // extract cost, which is 2 for most CPUs. | ||
| return CostKind == TTI::TCK_CodeSize ? 2 : 3; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use the base vector instruction cost directly? ST->getVectorInsertExtractBaseCost() + 1
222f4b1 to
28a7da6
Compare
|
I've now chosen what I think is probably the simplest and least invasive route, which is to add a new getReverseVectorInstrCost specifically for calculating inserts/extracts of lanes from the end of a vector. It's only really useful for targets with scalable vectors and it avoids having to update the interfaces of all the other targets that don't care. |
davemgreen
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getReverseVectorInstrCost sounds like it would cost llvm.vector.reverse instructions. What do you think about getVectorInstrCostFromEnd or something like it?
fhahn
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getReverseVectorInstrCost sounds like it would cost llvm.vector.reverse instructions. What do you think about getVectorInstrCostFromEnd or something like it?
This is only for vector extracts/inserts from the end, right? I think it might be good to be explicit about that in the name, although getVectorInstrCost similarly only applies to insert/extracts....
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The reason for the new TTI function is so you don't need to write code like this but can instead write code that is independent of the vector type.
It would be better if for fixed length types the base implementation of the new function simply calls the existing one with a modified index.
|
Hopefully I've addressed everyone's comments with the new version of the hook - getVectorInstrCostFromEnd. I just noticed a conflict so I'll rebase the PR. |
2c0909c to
5c72770
Compare
davemgreen
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The cost and AArch64 parts look OK to me. (But I'm not married to the name if someone has a better suggestion).
paulwalker-arm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A couple of suggestions but otherwise this looks good to me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding "indexed" (i.e. getIndexedVectorInstrCostFromEnd) would add context to the function and make it clearer what "FromEnd" means. Or perhaps getReverseIndexedVectorInstrCost to extend your original naming.
In the same way, the original function could use an "indexed" variant (i.e. getIndexedVectorInstrCost) to improve clarity but that's nothing to do with this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you want either "that is \p Index elements from the end of a vector" or "that is indexed from the end of a vector"?
There are a couple of places in the loop vectoriser where we want to calculate the cost of extracting the last lane in a vector. However, we wrongly assume that asking for the cost of extracting lane (VF.getKnownMinValue() - 1) is an accurate representation of the cost of extracting the last lane. For SVE at least, this is non-trivial as it requires the use of whilelo and lastb instructions. To solve this problem I have added a new getReverseVectorInstrCost interface where the index is used in reverse from the end of the vector. Suppose a vector has a given ElementCount EC, the extracted/inserted lane would be EC - 1 - Index. For scalable vectors this index is unknown at compile time. I've added an AArch64 hook that better represents the cost, and also a RISCV hook that maintains compatibility with the behaviour prior to this PR. I've also taken the liberty of adding support in vplan for calculating the cost of VPInstruction::ExtractLastElement.
5c72770 to
9623220
Compare
|
Rebased to fix a merge conflict in the test. |
There are a couple of places in the loop vectoriser where we
want to calculate the cost of extracting the last lane in a
vector. However, we wrongly assume that asking for the cost
of extracting lane (VF.getKnownMinValue() - 1) is an accurate
representation of the cost of extracting the last lane. For
SVE at least, this is non-trivial as it requires the use of
whilelo and lastb instructions.
To solve this problem I have added a new
getReverseVectorInstrCost interface where the index is used
in reverse from the end of the vector. Suppose a vector has
a given ElementCount EC, the extracted/inserted lane would be
EC - 1 - Index. For scalable vectors this index is unknown at
compile time. I've added an AArch64 hook that better represents
the cost, and also a RISCV hook that maintains compatibility
with the behaviour prior to this PR.
I've also taken the liberty of adding support in vplan for
calculating the cost of VPInstruction::ExtractLastElement.