-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LV] Vectorize conditional scalar assignments #158088
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-analysis Author: Graham Hunter (huntergr-arm) ChangesBased on Michael Maitland's previous work: This PR uses the existing recurrences code instead of introducing a I've enabled it by default to see the impact on tests; if there are I will be doing some performance runs on AArch64 to figure out Patch is 204.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158088.diff 19 Files Affected:
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index f9e6da6d0846a..afa175704a7b1 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -70,6 +70,9 @@ enum class RecurKind {
FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a UMax reduction.
+ FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
+ ///< can be any scalar type, one is the current recurrence
+ ///< value, and the other is an arbitrary value.
// clang-format on
// TODO: Any_of and FindLast reduction need not be restricted to integer type
// only.
@@ -183,6 +186,12 @@ class RecurrenceDescriptor {
PHINode *OrigPhi, Instruction *I,
ScalarEvolution &SE);
+ /// Returns a struct describing whether the instruction is of the form
+ /// Select(Cmp(A, B), X, Y)
+ /// where one of (X, Y) is the Phi value and the other is an arbitrary value.
+ LLVM_ABI static InstDesc isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop);
+
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
LLVM_ABI static InstDesc isConditionalRdxPattern(Instruction *I);
@@ -299,6 +308,12 @@ class RecurrenceDescriptor {
isFindLastIVRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is an arbitrary value.
+ static bool isFindLastRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::FindLast;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b8c540ce4b99d..bd87e9de46bd5 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ // TODO: Make type-agnostic.
+ case RecurKind::FindLast:
return true;
}
return false;
@@ -426,6 +428,8 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
if (isAnyOfRecurrenceKind(Kind) && IsASelect)
++NumCmpSelectPatternInst;
+ if (isFindLastRecurrenceKind(Kind) && IsASelect)
+ ++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
FoundReduxOp |= !IsAPhi && Cur != Start;
@@ -789,6 +793,38 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
return InstDesc(false, I);
}
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop) {
+ // Must be a scalar.
+ Type *Type = Phi->getType();
+ if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+ !Type->isPointerTy())
+ return InstDesc(false, I);
+
+ SelectInst *Select = dyn_cast<SelectInst>(I);
+ if (!Select)
+ return InstDesc(false, I);
+
+ // FIXME: Support more complex patterns, including multiple selects.
+ // Phi or Select must be used only outside the loop,
+ // except for each other.
+ auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
+ return all_of(V->users(), [Ignore, TheLoop](User *U) {
+ if (U == Ignore)
+ return true;
+ if (auto *I = dyn_cast<Instruction>(U))
+ return !TheLoop->contains(I);
+ return false;
+ });
+ };
+ if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+ !IsOnlyUsedOutsideLoop(Select, Phi))
+ return InstDesc(false, I);
+
+ return InstDesc(I, RecurKind::FindLast);
+}
+
RecurrenceDescriptor::InstDesc
RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
const InstDesc &Prev) {
@@ -927,6 +963,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
return isConditionalRdxPattern(I);
if (isFindIVRecurrenceKind(Kind) && SE)
return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
+ if (isFindLastRecurrenceKind(Kind))
+ return isFindLastPattern(I, OrigPhi, L);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1123,7 +1161,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
-
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
+ LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -1245,6 +1287,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
+ case RecurKind::FindLast:
return Instruction::ICmp;
case RecurKind::FMax:
case RecurKind::FMin:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 92321a76dbd80..6595c6e770be0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1004,6 +1004,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_extract_last_active:
+ if (ST->isSVEAvailable()) {
+ auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+ // This should turn into chained clastb instructions.
+ return LegalCost;
+ }
+ break;
default:
break;
}
@@ -5325,6 +5332,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
+ case RecurKind::FindLast:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b4acda80cfb93..ea85685cdf7b8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4047,6 +4047,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntrinsicSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
+ case VPDef::VPWidenSelectVectorSC:
case VPDef::VPBlendSC:
case VPDef::VPFirstOrderRecurrencePHISC:
case VPDef::VPHistogramSC:
@@ -4546,6 +4547,11 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPReductionPHIRecipe>);
+ // FIXME: implement interleaving for FindLast transform correctly.
+ for (auto &[_, RdxDesc] : Legal->getReductionVars())
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind()))
+ return 1;
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
@@ -8687,6 +8693,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences,
+ *Plan, RecipeBuilder, Legal);
+
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
@@ -8779,6 +8789,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind Kind = PhiR->getRecurrenceKind();
assert(
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
"AnyOf and FindIV reductions are not allowed for in-loop reductions");
@@ -8987,6 +8998,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
FinalReductionResult =
Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
{PhiR, Start, NewExitingVPV}, ExitDL);
+ } else if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ FinalReductionResult = Builder.createNaryOp(
+ VPInstruction::ExtractLastActive, {NewExitingVPV}, ExitDL);
} else {
VPIRFlags Flags =
RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind)
@@ -9076,7 +9091,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
@@ -10069,6 +10085,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
+ // FIXME: Enable interleaving for last_active reductions.
+ if (any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind());
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to conditional scalar assignments.\n");
+ IntDiagMsg = {
+ "ConditionalAssignmentPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to conditional "
+ "assignments"};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 75cace77ec534..7b25731af19d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24868,6 +24868,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25009,6 +25010,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25115,6 +25117,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 53291a931530f..2ffe68fedee05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -548,6 +548,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntrinsicSC:
case VPRecipeBase::VPWidenSC:
case VPRecipeBase::VPWidenSelectSC:
+ case VPRecipeBase::VPWidenSelectVectorSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
@@ -1059,6 +1060,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ // Extracts the last active lane based on a predicate vector operand.
+ ExtractLastActive,
};
private:
@@ -1749,6 +1752,47 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
unsigned getOpcode() const { return Instruction::Select; }
+ VPValue *getCond() const { return getOperand(0); }
+
+ bool isInvariantCond() const {
+ return getCond()->isDefinedOutsideLoopRegions();
+ }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getCond() && isInvariantCond();
+ }
+};
+
+/// A recipe for selecting whole vector values.
+struct VPWidenSelectVectorRecipe : public VPRecipeWithIRFlags {
+ VPWidenSelectVectorRecipe(ArrayRef<VPValue *> Operands)
+ : VPRecipeWithIRFlags(VPDef::VPWidenSelectVectorSC, Operands) {}
+
+ ~VPWidenSelectVectorRecipe() override = default;
+
+ VPWidenSelectVectorRecipe *clone() override {
+ SmallVector<VPValue *, 3> Operands(operands());
+ return new VPWidenSelectVectorRecipe(Operands);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenSelectVectorSC)
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPWidenSelectVectorRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
VPValue *getCond() const {
return getOperand(0);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..a299ab8593a2f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,7 +115,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractPenultimateElement: {
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ExtractLastActive: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
@@ -308,7 +309,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPExpressionRecipe>([this](const auto *R) {
return inferScalarType(R->getOperandOfResultType());
- });
+ })
+ .Case<VPWidenSelectVectorRecipe>(
+ [this](const VPWidenSelectVectorRecipe *R) {
+ return inferScalarType(R->getOperand(1));
+ });
assert(ResultTy && "could not infer type for the given VPValue");
CachedTypes[V] = ResultTy;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf51489543098..598fa4888fe8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -86,7 +86,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -134,7 +135,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -177,7 +179,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -522,6 +525,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ExtractLastActive:
return 3;
case VPInstruction::ComputeFindIVResult:
return 4;
@@ -983,6 +987,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::ExtractLastActive: {
+ Value *Data = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
+ Type *VTy = Data->getType();
+
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *ExtractLast = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::experimental_vector_extract_last_active, {VTy});
+ return Builder.CreateCall(ExtractLast, {Data, Mask, Default});
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1119,6 +1134,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastActive: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(this);
+ Type *VecTy = toVectorTy(ScalarTy, VF);
+ Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_extract_last_active,
+ ScalarTy, {VecTy, MaskTy, ScalarTy});
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1174,6 +1197,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
+ getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1243,6 +1267,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -1414,6 +1439,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::ExtractLastActive:
+ O << "extract-last-active";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1927,7 +1955,9 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
Mask->printAsOperand(O, SlotTracker);
}
}
+#endif
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
@@ -2002,6 +2032,42 @@ InstructionCost VPWidenS...
[truncated]
|
|
@llvm/pr-subscribers-vectorizers Author: Graham Hunter (huntergr-arm) ChangesBased on Michael Maitland's previous work: This PR uses the existing recurrences code instead of introducing a I've enabled it by default to see the impact on tests; if there are I will be doing some performance runs on AArch64 to figure out Patch is 204.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158088.diff 19 Files Affected:
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index f9e6da6d0846a..afa175704a7b1 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -70,6 +70,9 @@ enum class RecurKind {
FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a UMax reduction.
+ FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
+ ///< can be any scalar type, one is the current recurrence
+ ///< value, and the other is an arbitrary value.
// clang-format on
// TODO: Any_of and FindLast reduction need not be restricted to integer type
// only.
@@ -183,6 +186,12 @@ class RecurrenceDescriptor {
PHINode *OrigPhi, Instruction *I,
ScalarEvolution &SE);
+ /// Returns a struct describing whether the instruction is of the form
+ /// Select(Cmp(A, B), X, Y)
+ /// where one of (X, Y) is the Phi value and the other is an arbitrary value.
+ LLVM_ABI static InstDesc isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop);
+
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
LLVM_ABI static InstDesc isConditionalRdxPattern(Instruction *I);
@@ -299,6 +308,12 @@ class RecurrenceDescriptor {
isFindLastIVRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is an arbitrary value.
+ static bool isFindLastRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::FindLast;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b8c540ce4b99d..bd87e9de46bd5 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ // TODO: Make type-agnostic.
+ case RecurKind::FindLast:
return true;
}
return false;
@@ -426,6 +428,8 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
if (isAnyOfRecurrenceKind(Kind) && IsASelect)
++NumCmpSelectPatternInst;
+ if (isFindLastRecurrenceKind(Kind) && IsASelect)
+ ++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
FoundReduxOp |= !IsAPhi && Cur != Start;
@@ -789,6 +793,38 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
return InstDesc(false, I);
}
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop) {
+ // Must be a scalar.
+ Type *Type = Phi->getType();
+ if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+ !Type->isPointerTy())
+ return InstDesc(false, I);
+
+ SelectInst *Select = dyn_cast<SelectInst>(I);
+ if (!Select)
+ return InstDesc(false, I);
+
+ // FIXME: Support more complex patterns, including multiple selects.
+ // Phi or Select must be used only outside the loop,
+ // except for each other.
+ auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
+ return all_of(V->users(), [Ignore, TheLoop](User *U) {
+ if (U == Ignore)
+ return true;
+ if (auto *I = dyn_cast<Instruction>(U))
+ return !TheLoop->contains(I);
+ return false;
+ });
+ };
+ if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+ !IsOnlyUsedOutsideLoop(Select, Phi))
+ return InstDesc(false, I);
+
+ return InstDesc(I, RecurKind::FindLast);
+}
+
RecurrenceDescriptor::InstDesc
RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
const InstDesc &Prev) {
@@ -927,6 +963,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
return isConditionalRdxPattern(I);
if (isFindIVRecurrenceKind(Kind) && SE)
return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
+ if (isFindLastRecurrenceKind(Kind))
+ return isFindLastPattern(I, OrigPhi, L);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1123,7 +1161,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
-
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
+ LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -1245,6 +1287,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
+ case RecurKind::FindLast:
return Instruction::ICmp;
case RecurKind::FMax:
case RecurKind::FMin:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 92321a76dbd80..6595c6e770be0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1004,6 +1004,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_extract_last_active:
+ if (ST->isSVEAvailable()) {
+ auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+ // This should turn into chained clastb instructions.
+ return LegalCost;
+ }
+ break;
default:
break;
}
@@ -5325,6 +5332,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
+ case RecurKind::FindLast:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b4acda80cfb93..ea85685cdf7b8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4047,6 +4047,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntrinsicSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
+ case VPDef::VPWidenSelectVectorSC:
case VPDef::VPBlendSC:
case VPDef::VPFirstOrderRecurrencePHISC:
case VPDef::VPHistogramSC:
@@ -4546,6 +4547,11 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPReductionPHIRecipe>);
+ // FIXME: implement interleaving for FindLast transform correctly.
+ for (auto &[_, RdxDesc] : Legal->getReductionVars())
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind()))
+ return 1;
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
@@ -8687,6 +8693,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences,
+ *Plan, RecipeBuilder, Legal);
+
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
@@ -8779,6 +8789,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind Kind = PhiR->getRecurrenceKind();
assert(
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
"AnyOf and FindIV reductions are not allowed for in-loop reductions");
@@ -8987,6 +8998,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
FinalReductionResult =
Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
{PhiR, Start, NewExitingVPV}, ExitDL);
+ } else if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ FinalReductionResult = Builder.createNaryOp(
+ VPInstruction::ExtractLastActive, {NewExitingVPV}, ExitDL);
} else {
VPIRFlags Flags =
RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind)
@@ -9076,7 +9091,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
@@ -10069,6 +10085,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
+ // FIXME: Enable interleaving for last_active reductions.
+ if (any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind());
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to conditional scalar assignments.\n");
+ IntDiagMsg = {
+ "ConditionalAssignmentPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to conditional "
+ "assignments"};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 75cace77ec534..7b25731af19d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24868,6 +24868,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25009,6 +25010,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25115,6 +25117,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 53291a931530f..2ffe68fedee05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -548,6 +548,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntrinsicSC:
case VPRecipeBase::VPWidenSC:
case VPRecipeBase::VPWidenSelectSC:
+ case VPRecipeBase::VPWidenSelectVectorSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
@@ -1059,6 +1060,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ // Extracts the last active lane based on a predicate vector operand.
+ ExtractLastActive,
};
private:
@@ -1749,6 +1752,47 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
unsigned getOpcode() const { return Instruction::Select; }
+ VPValue *getCond() const { return getOperand(0); }
+
+ bool isInvariantCond() const {
+ return getCond()->isDefinedOutsideLoopRegions();
+ }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getCond() && isInvariantCond();
+ }
+};
+
+/// A recipe for selecting whole vector values.
+struct VPWidenSelectVectorRecipe : public VPRecipeWithIRFlags {
+ VPWidenSelectVectorRecipe(ArrayRef<VPValue *> Operands)
+ : VPRecipeWithIRFlags(VPDef::VPWidenSelectVectorSC, Operands) {}
+
+ ~VPWidenSelectVectorRecipe() override = default;
+
+ VPWidenSelectVectorRecipe *clone() override {
+ SmallVector<VPValue *, 3> Operands(operands());
+ return new VPWidenSelectVectorRecipe(Operands);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenSelectVectorSC)
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPWidenSelectVectorRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
VPValue *getCond() const {
return getOperand(0);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..a299ab8593a2f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,7 +115,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractPenultimateElement: {
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ExtractLastActive: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
@@ -308,7 +309,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPExpressionRecipe>([this](const auto *R) {
return inferScalarType(R->getOperandOfResultType());
- });
+ })
+ .Case<VPWidenSelectVectorRecipe>(
+ [this](const VPWidenSelectVectorRecipe *R) {
+ return inferScalarType(R->getOperand(1));
+ });
assert(ResultTy && "could not infer type for the given VPValue");
CachedTypes[V] = ResultTy;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf51489543098..598fa4888fe8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -86,7 +86,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -134,7 +135,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -177,7 +179,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -522,6 +525,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ExtractLastActive:
return 3;
case VPInstruction::ComputeFindIVResult:
return 4;
@@ -983,6 +987,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::ExtractLastActive: {
+ Value *Data = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
+ Type *VTy = Data->getType();
+
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *ExtractLast = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::experimental_vector_extract_last_active, {VTy});
+ return Builder.CreateCall(ExtractLast, {Data, Mask, Default});
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1119,6 +1134,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastActive: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(this);
+ Type *VecTy = toVectorTy(ScalarTy, VF);
+ Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_extract_last_active,
+ ScalarTy, {VecTy, MaskTy, ScalarTy});
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1174,6 +1197,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
+ getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1243,6 +1267,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -1414,6 +1439,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::ExtractLastActive:
+ O << "extract-last-active";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1927,7 +1955,9 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
Mask->printAsOperand(O, SlotTracker);
}
}
+#endif
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
@@ -2002,6 +2032,42 @@ InstructionCost VPWidenS...
[truncated]
|
|
@llvm/pr-subscribers-backend-aarch64 Author: Graham Hunter (huntergr-arm) ChangesBased on Michael Maitland's previous work: This PR uses the existing recurrences code instead of introducing a I've enabled it by default to see the impact on tests; if there are I will be doing some performance runs on AArch64 to figure out Patch is 204.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158088.diff 19 Files Affected:
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index f9e6da6d0846a..afa175704a7b1 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -70,6 +70,9 @@ enum class RecurKind {
FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a UMax reduction.
+ FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
+ ///< can be any scalar type, one is the current recurrence
+ ///< value, and the other is an arbitrary value.
// clang-format on
// TODO: Any_of and FindLast reduction need not be restricted to integer type
// only.
@@ -183,6 +186,12 @@ class RecurrenceDescriptor {
PHINode *OrigPhi, Instruction *I,
ScalarEvolution &SE);
+ /// Returns a struct describing whether the instruction is of the form
+ /// Select(Cmp(A, B), X, Y)
+ /// where one of (X, Y) is the Phi value and the other is an arbitrary value.
+ LLVM_ABI static InstDesc isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop);
+
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
LLVM_ABI static InstDesc isConditionalRdxPattern(Instruction *I);
@@ -299,6 +308,12 @@ class RecurrenceDescriptor {
isFindLastIVRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is an arbitrary value.
+ static bool isFindLastRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::FindLast;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b8c540ce4b99d..bd87e9de46bd5 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ // TODO: Make type-agnostic.
+ case RecurKind::FindLast:
return true;
}
return false;
@@ -426,6 +428,8 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
if (isAnyOfRecurrenceKind(Kind) && IsASelect)
++NumCmpSelectPatternInst;
+ if (isFindLastRecurrenceKind(Kind) && IsASelect)
+ ++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
FoundReduxOp |= !IsAPhi && Cur != Start;
@@ -789,6 +793,38 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
return InstDesc(false, I);
}
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop) {
+ // Must be a scalar.
+ Type *Type = Phi->getType();
+ if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+ !Type->isPointerTy())
+ return InstDesc(false, I);
+
+ SelectInst *Select = dyn_cast<SelectInst>(I);
+ if (!Select)
+ return InstDesc(false, I);
+
+ // FIXME: Support more complex patterns, including multiple selects.
+ // Phi or Select must be used only outside the loop,
+ // except for each other.
+ auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
+ return all_of(V->users(), [Ignore, TheLoop](User *U) {
+ if (U == Ignore)
+ return true;
+ if (auto *I = dyn_cast<Instruction>(U))
+ return !TheLoop->contains(I);
+ return false;
+ });
+ };
+ if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+ !IsOnlyUsedOutsideLoop(Select, Phi))
+ return InstDesc(false, I);
+
+ return InstDesc(I, RecurKind::FindLast);
+}
+
RecurrenceDescriptor::InstDesc
RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
const InstDesc &Prev) {
@@ -927,6 +963,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
return isConditionalRdxPattern(I);
if (isFindIVRecurrenceKind(Kind) && SE)
return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
+ if (isFindLastRecurrenceKind(Kind))
+ return isFindLastPattern(I, OrigPhi, L);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1123,7 +1161,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
-
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
+ LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -1245,6 +1287,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
+ case RecurKind::FindLast:
return Instruction::ICmp;
case RecurKind::FMax:
case RecurKind::FMin:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 92321a76dbd80..6595c6e770be0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1004,6 +1004,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_extract_last_active:
+ if (ST->isSVEAvailable()) {
+ auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+ // This should turn into chained clastb instructions.
+ return LegalCost;
+ }
+ break;
default:
break;
}
@@ -5325,6 +5332,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
+ case RecurKind::FindLast:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b4acda80cfb93..ea85685cdf7b8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4047,6 +4047,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntrinsicSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
+ case VPDef::VPWidenSelectVectorSC:
case VPDef::VPBlendSC:
case VPDef::VPFirstOrderRecurrencePHISC:
case VPDef::VPHistogramSC:
@@ -4546,6 +4547,11 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPReductionPHIRecipe>);
+ // FIXME: implement interleaving for FindLast transform correctly.
+ for (auto &[_, RdxDesc] : Legal->getReductionVars())
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind()))
+ return 1;
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
@@ -8687,6 +8693,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences,
+ *Plan, RecipeBuilder, Legal);
+
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
@@ -8779,6 +8789,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind Kind = PhiR->getRecurrenceKind();
assert(
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
"AnyOf and FindIV reductions are not allowed for in-loop reductions");
@@ -8987,6 +8998,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
FinalReductionResult =
Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
{PhiR, Start, NewExitingVPV}, ExitDL);
+ } else if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ FinalReductionResult = Builder.createNaryOp(
+ VPInstruction::ExtractLastActive, {NewExitingVPV}, ExitDL);
} else {
VPIRFlags Flags =
RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind)
@@ -9076,7 +9091,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
@@ -10069,6 +10085,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
+ // FIXME: Enable interleaving for last_active reductions.
+ if (any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind());
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to conditional scalar assignments.\n");
+ IntDiagMsg = {
+ "ConditionalAssignmentPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to conditional "
+ "assignments"};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 75cace77ec534..7b25731af19d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24868,6 +24868,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25009,6 +25010,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25115,6 +25117,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 53291a931530f..2ffe68fedee05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -548,6 +548,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntrinsicSC:
case VPRecipeBase::VPWidenSC:
case VPRecipeBase::VPWidenSelectSC:
+ case VPRecipeBase::VPWidenSelectVectorSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
@@ -1059,6 +1060,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ // Extracts the last active lane based on a predicate vector operand.
+ ExtractLastActive,
};
private:
@@ -1749,6 +1752,47 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
unsigned getOpcode() const { return Instruction::Select; }
+ VPValue *getCond() const { return getOperand(0); }
+
+ bool isInvariantCond() const {
+ return getCond()->isDefinedOutsideLoopRegions();
+ }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getCond() && isInvariantCond();
+ }
+};
+
+/// A recipe for selecting whole vector values.
+struct VPWidenSelectVectorRecipe : public VPRecipeWithIRFlags {
+ VPWidenSelectVectorRecipe(ArrayRef<VPValue *> Operands)
+ : VPRecipeWithIRFlags(VPDef::VPWidenSelectVectorSC, Operands) {}
+
+ ~VPWidenSelectVectorRecipe() override = default;
+
+ VPWidenSelectVectorRecipe *clone() override {
+ SmallVector<VPValue *, 3> Operands(operands());
+ return new VPWidenSelectVectorRecipe(Operands);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenSelectVectorSC)
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPWidenSelectVectorRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
VPValue *getCond() const {
return getOperand(0);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..a299ab8593a2f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,7 +115,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractPenultimateElement: {
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ExtractLastActive: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
@@ -308,7 +309,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPExpressionRecipe>([this](const auto *R) {
return inferScalarType(R->getOperandOfResultType());
- });
+ })
+ .Case<VPWidenSelectVectorRecipe>(
+ [this](const VPWidenSelectVectorRecipe *R) {
+ return inferScalarType(R->getOperand(1));
+ });
assert(ResultTy && "could not infer type for the given VPValue");
CachedTypes[V] = ResultTy;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf51489543098..598fa4888fe8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -86,7 +86,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -134,7 +135,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -177,7 +179,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -522,6 +525,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ExtractLastActive:
return 3;
case VPInstruction::ComputeFindIVResult:
return 4;
@@ -983,6 +987,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::ExtractLastActive: {
+ Value *Data = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
+ Type *VTy = Data->getType();
+
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *ExtractLast = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::experimental_vector_extract_last_active, {VTy});
+ return Builder.CreateCall(ExtractLast, {Data, Mask, Default});
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1119,6 +1134,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastActive: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(this);
+ Type *VecTy = toVectorTy(ScalarTy, VF);
+ Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_extract_last_active,
+ ScalarTy, {VecTy, MaskTy, ScalarTy});
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1174,6 +1197,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
+ getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1243,6 +1267,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -1414,6 +1439,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::ExtractLastActive:
+ O << "extract-last-active";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1927,7 +1955,9 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
Mask->printAsOperand(O, SlotTracker);
}
}
+#endif
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
@@ -2002,6 +2032,42 @@ InstructionCost VPWidenS...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Mel-Chen
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you. I've reviewed part of the code, and will continue the review in October, as I'll be on vacation until the end of September.
| ///< (x,y) is increasing loop induction, and both x and y | ||
| ///< are integer type, producing a UMax reduction. | ||
| FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y | ||
| ///< can be any scalar type, one is the current recurrence |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since isIntegerRecurrenceKind returns true for FindLast, I suggest
| ///< can be any scalar type, one is the current recurrence | |
| ///< are integer type, one is the current recurrence |
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| if (isFindLastRecurrenceKind(Kind) && IsASelect) | ||
| ++NumCmpSelectPatternInst; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why we need this?
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi, | ||
| Loop *TheLoop) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we reuse RecurrenceDescriptor::isFindIVPattern?
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| case RecurKind::SMin: | ||
| case RecurKind::UMax: | ||
| case RecurKind::UMin: | ||
| case RecurKind::FindLast: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#162252
icmp and fcmp shouldn’t be part of the recurrence chain. I think the opcode should return Instruction::Select instead.
d066b28 to
1ff7778
Compare
|
Rebased, addressed comments. Thanks for reviewing :) |
MacDue
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some initial comments (mostly nitpicks), I've not looked at everything in full detail yet, but the general concept makes sense to me.
| m_Value(NonRdxPhi))))) | ||
| return InstDesc(false, I); | ||
|
|
||
| if (isFindLastRecurrenceKind(Kind)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Above (not this line), this function is called isFindIVPattern, and has a large comment explaining that with an example. Maybe update that documentation to explain the "FindLastRecurance" case (and possibly rename the function to account for that too?).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's also the comment above for We are looking for selects of the form: which should now in include the FindLast case.
1ff7778 to
e6fc0cf
Compare
fhahn
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will be doing some performance runs on AArch64 to figure out
the cost model, as we mostly regard vector selects as per-lane
instead of selecting the whole vector at once.
Did you already get a chance to run preliminary benchmarks? Would be very curious what the impact is, possibly for some microbenchmarks (https://github.com/llvm/llvm-test-suite/tree/main/MicroBenchmarks/LoopVectorization would be a good place)
| } | ||
| break; | ||
| } | ||
| case Intrinsic::experimental_vector_extract_last_active: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could this be split off, with dedicated cost model tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Split out as #165739
| } | ||
| }; | ||
|
|
||
| /// A recipe for selecting whole vector values based on a scalar condition. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we need a dedicated recipe for this? If we have a scalar condition, broadcasting should be fine, although more vebose than needed?
We should be able to handle single-scalar conditions as well, which may be enough? #165506 (comment)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, we don't need a dedicated recipe. I'll rebase on top of that other PR when available.
| m_Value(NonRdxPhi))))) | ||
| return InstDesc(false, I); | ||
|
|
||
| if (isFindLastRecurrenceKind(Kind)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's also the comment above for We are looking for selects of the form: which should now in include the FindLast case.
e6fc0cf to
abea562
Compare
|
Rebased on top of the cost model change, and I removed the extra select recipe in favor of the broadcast approach. We can change that to the scalar condition later. |
| ; CHECK-VF4IC1: [[FOR_BODY]]: | ||
| ; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] | ||
| ; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] | ||
| ; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Again here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the name/comment for not_vectorized_select_icmp_truncated_iv_out_of_bound missed being updated.
| ; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE | ||
|
|
||
| target triple = "aarch64-linux-gnu" | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about testing a pointer type for the data phi, and some negative tests for the cases we bail out for in isFindPattern (e.g. multiple users in the loop)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. Added tests for pointers and floats, then multiple in-loop users of the select, multiple users of the compare, and chained selects.
Based on Michael Maitland's previous work: llvm#121222 This PR uses the existing recurrences code instead of introducing a new pass just for CSA autovec. I've also made recipes that are more generic. I've enabled it by default to see the impact on tests; if there are regressions we can put it behind a cli option.
abea562 to
049428d
Compare
So running with SVE enabled, we vectorize 7 loops in find-last.cpp from the test suite. I don't see any noticeable performance impact from the vectorization, but total execution time is on the order of a couple of milliseconds, so I maybe need to increase the amount of data for a microbenchmark to see anything. That said, this is the more generic form of this reduction. I intend to follow this part up with an in-loop version of the reduction, so that targets with SVE can just use clastb inside the loop instead of maintaining the extra phi for the mask. I'll try some spec runs and see if there's any interesting results there. |
MacDue
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Generally LGTM. Thanks for addressing my comments!
Needing to add the VPLastActiveMaskPHIRecipe is a bit unfortunate, but I don't have another suggestion (maybe other reviewers do).
Co-authored-by: Benjamin Maxwell <[email protected]>
sdesmalen-arm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Left a few nits, but otherwise looks reasonable to me.
| case RecurKind::FindFirstIVUMin: | ||
| case RecurKind::FindLastIVSMax: | ||
| case RecurKind::FindLastIVUMax: | ||
| // TODO: Make type-agnostic. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this is not type-agnostic, should this be reflected in the name of the recurrence kind?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It isn't for the other FindFirst/FindLast (though that might be inferred by U/S Min/Max) or AnyOf. I think it's just the fp-based reduction types that are prefixed with an extra F.
I did experiment with treating FindLast separately in AddReductionVar when it checks the type and everything was fine, but decided to leave that out of the initial patch.
| // Override IC if user provided an interleave count. | ||
| IC = UserIC > 0 ? UserIC : IC; | ||
|
|
||
| // FIXME: Enable interleaving for last_active reductions. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What would be required to enable interleaving?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Completing the final reduction outside of the loop, especially the mask phis.
fhahn
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So running with SVE enabled, we vectorize 7 loops in find-last.cpp from the test suite. I don't see any noticeable performance impact from the vectorization, but total execution time is on the order of a couple of milliseconds, so I maybe need to increase the amount of data for a microbenchmark to see anything.
Yes that is only intended to test correctness; as I mentioned earlier, it would be good to add some MicroBenchmarks, to make this easier to measure
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| for (User *U : I->users()) { | ||
| if (U == OrigPhi) | ||
| continue; | ||
| if (auto *UI = dyn_cast<Instruction>(U); UI && !TheLoop->contains(UI)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we generally avoid this kind of pattern in LV/IVDesc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed. This was basically a (mutated) leftover from Michael's code, since it had entirely separate analysis logic.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the FIXME still relevant then?
// FIXME: Support more complex patterns, including multiple selects.
// The Select must be used only outside the loop and by the PHI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, though some (all?) of the changes might not be in that area of code. The extra unit tests cover some of the patterns we might want.
| continue; | ||
| } | ||
| } | ||
| } else if (isa<VPLastActiveMaskPHIRecipe>(R)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this needed? Epilogue vectorization is disabled, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added that before disabling it entirely. Removed (along with the recipe)
| for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { | ||
| if (RecurrenceDescriptor::isFindLastRecurrenceKind( | ||
| RdxDesc.getRecurrenceKind())) { | ||
| VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please avoid going through Legal/RecipeBuilder here if possible and instead use the information directly available in VPlan.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
| #endif | ||
| }; | ||
|
|
||
| // TODO: Can we unify the PHI recipe hierarchy a bit? VPPredInstPHISC is close |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
THis just lowers to a wide phi, right? Can you just use VPWidenPHIRecipe?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It does, but VPWidenPHIRecipe::clone casts the underlying value to a PHINode. Since there isn't one, that will assert if/when we enable e.g. interleaving. I think that's the same for VPActiveLaneMaskPHIRecipe, which has a comment noting it would be good to drop the specialized recipe in favour of VPWidenPHIRecipe.
I could adjust ::clone() to work (using cast_if_present) if that works for you? I'm not sure if there's any other uses of the underlying value for those recipes; I'll try it and see if any test fails.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that there isn't a computeCost method for VPWidenPHIRecipe yet either.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done; assuming you're happy with VPWidenPHIRecipe no longer requiring an underlying value.
| ; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> | ||
| ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> | ||
| ; CHECK-NEXT: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir<false>, vp<%8> | ||
| ; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> | ||
| ; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> | ||
| ; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr> | ||
| ; CHECK-NEXT: WIDEN ir<%ld> = load vp<%6> | ||
| ; CHECK-NEXT: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld> | ||
| ; CHECK-NEXT: EMIT vp<%7> = any-of ir<%select.cmp> | ||
| ; CHECK-NEXT: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4> | ||
| ; CHECK-NEXT: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please use patterns for the unnamed vpvalues throughout
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
Based on Michael Maitland's previous work:
#121222
This PR uses the existing recurrences code instead of introducing a
new pass just for CSA autovec. I've also made recipes that are more
generic.
I've enabled it by default to see the impact on tests; if there are
regressions we can put it behind a cli option. I haven't corrected
all the comments for the tests, I'll wait until we decide whether
to keep it enabled by default first.
I will be doing some performance runs on AArch64 to figure out
the cost model, as we mostly regard vector selects as per-lane
instead of selecting the whole vector at once.