[VPlan] Simplify Plan's entry in removeBranchOnConst. #154510
Conversation
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-risc-v

Author: Florian Hahn (fhahn)

Changes

After #153643, there may be a BranchOnCond with a constant condition in the entry block. Simplify those in removeBranchOnConst. This removes a number of redundant conditional branches from entry blocks.

In some cases, it may also make the original scalar loop unreachable, because we know it will never execute. In that case, we need to remove the loop from LoopInfo, because all unreachable blocks may dominate each other, making LoopInfo invalid. In those cases, we can also completely remove the loop, for which I'll share a follow-up patch.

A couple of places that assume the scalar loop to still be valid need updating.

Depends on #153643.

Patch is 2.60 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154510.diff

332 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 98554310c74df..64cbf509a3118 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2357,9 +2357,9 @@ EpilogueVectorizerMainLoop::createIterationCountCheck(ElementCount VF,
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
+static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPlan &Plan, VPBasicBlock *VPBB,
BasicBlock *IRBB) {
- VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
+ VPIRBasicBlock *IRVPBB = Plan.createVPIRBasicBlock(IRBB);
auto IP = IRVPBB->begin();
for (auto &R : make_early_inc_range(VPBB->phis()))
R.moveBefore(*IRVPBB, IP);
@@ -2571,6 +2571,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Remove redundant induction instructions.
cse(HeaderBB);
+ if (Plan.getScalarPreheader()->getNumPredecessors() == 0)
+ return;
+
// Set/update profile weights for the vector and remainder loops as original
// loop iterations are now distributed among them. Note that original loop
// becomes the scalar remainder loop after vectorization.
@@ -7226,6 +7229,12 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan);
VPlanTransforms::removeBranchOnConst(BestVPlan);
+ if (BestVPlan.getEntry()->getSingleSuccessor() ==
+ BestVPlan.getScalarPreheader()) {
+ // TODO: Should not even try to vectorize.
+ return DenseMap<const SCEV *, Value *>();
+ }
+
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
@@ -7268,7 +7277,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BasicBlock *EntryBB =
cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
- replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(),
+ replaceVPBBWithIRVPBB(BestVPlan, BestVPlan.getScalarPreheader(),
State.CFG.PrevBB->getSingleSuccessor());
VPlanTransforms::removeDeadRecipes(BestVPlan);
@@ -7351,8 +7360,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
} else {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
+ if (BestVPlan.getScalarPreheader()->getNumPredecessors() > 0)
+ if (MDNode *LID = OrigLoop->getLoopID())
+ L->setLoopID(LID);
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
@@ -7383,6 +7393,16 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
addRuntimeUnrollDisableMetaData(L);
}
+ if (BestVPlan.getScalarPreheader()->getNumPredecessors() == 0) {
+ // If the original loop became unreachable, we need to delete it.
+ auto Blocks = OrigLoop->getBlocksVector();
+ Blocks.push_back(cast<VPIRBasicBlock>(BestVPlan.getScalarPreheader())
+ ->getIRBasicBlock());
+ for (auto *BB : Blocks)
+ LI->removeBlock(BB);
+ LI->erase(OrigLoop);
+ }
+
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop(State);
@@ -7460,7 +7480,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
// generated here dominates the vector epilog iter check.
EPI.TripCount = Count;
} else {
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ VectorPHVPBB =
+ replaceVPBBWithIRVPBB(Plan, VectorPHVPBB, LoopVectorPreHeader);
}
BranchInst &BI =
@@ -7493,7 +7514,7 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
BasicBlock *VecEpilogueIterationCountCheck =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
nullptr, "vec.epilog.iter.check", true);
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ VectorPHVPBB = replaceVPBBWithIRVPBB(Plan, VectorPHVPBB, LoopVectorPreHeader);
emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
@@ -10213,11 +10234,22 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
+ if (ORE->allowExtraAnalysis(LV_NAME))
+ checkMixedPrecision(L, ORE);
+
bool DisableRuntimeUnroll = false;
MDNode *OrigLoopID = L->getLoopID();
+ bool LoopRemoved = false;
{
using namespace ore;
if (!VectorizeLoop) {
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
+
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
@@ -10234,14 +10266,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
VF.MinProfitableTripCount);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
-
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")";
- });
+ LoopRemoved = BestPlan.getScalarPreheader()->getNumPredecessors() == 0;
} else {
+ // Report the vectorization decision.
+ reportVectorization(ORE, L, VF, IC);
+
// If we decided that it is *legal* to vectorize the loop, then do it.
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
@@ -10311,23 +10340,23 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// rarely used is not worth unrolling.
if (!Checks.hasChecks())
DisableRuntimeUnroll = true;
+ LoopRemoved = BestPlan.getScalarPreheader()->getNumPredecessors() == 0;
}
- // Report the vectorization decision.
- reportVectorization(ORE, L, VF, IC);
}
-
- if (ORE->allowExtraAnalysis(LV_NAME))
- checkMixedPrecision(L, ORE);
}
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
"DT not preserved correctly");
+ if (LoopRemoved)
+ return true;
+
std::optional<MDNode *> RemainderLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
if (RemainderLoopID) {
- L->setLoopID(*RemainderLoopID);
+ if (!LoopRemoved)
+ L->setLoopID(*RemainderLoopID);
} else {
if (DisableRuntimeUnroll)
addRuntimeUnrollDisableMetaData(L);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1438dc366b55d..4a7618f40164b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -972,12 +972,14 @@ void VPlan::execute(VPTransformState *State) {
setName("Final VPlan");
LLVM_DEBUG(dump());
- // Disconnect scalar preheader and scalar header, as the dominator tree edge
- // will be updated as part of VPlan execution. This allows keeping the DTU
- // logic generic during VPlan execution.
BasicBlock *ScalarPh = State->CFG.ExitBB;
- State->CFG.DTU.applyUpdates(
- {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
+ if (getScalarPreheader()->getNumPredecessors() > 0) {
+ // Disconnect scalar preheader and scalar header, as the dominator tree edge
+ // will be updated as part of VPlan execution. This allows keeping the DTU
+ // logic generic during VPlan execution.
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
+ }
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
Entry);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d32d2a9ad11f7..8e7fc24080c31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1920,7 +1920,7 @@ void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry()))) {
VPValue *Cond;
- if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
+ if (VPBB->getNumSuccessors() != 2 || VPBB->empty() ||
!match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 9e1d325a4d8d6..2959e9440e753 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -49,6 +49,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
case Instruction::GetElementPtr:
case Instruction::ICmp:
case Instruction::FCmp:
+ case Instruction::Select:
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
return true;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index c18f9f2fae06b..ddfdb257ed49a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -52,8 +52,8 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
+; CHECK-NEXT: br i1 [[TMP18]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: for.end:
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP17]])
; CHECK-NEXT: ret i64 [[TMP19]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index e44ddbce34fd5..58965c19ae1cc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -202,8 +202,8 @@ exit:
define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i1 %c.0) {
; CHECK-LABEL: define void @test_blend_feeding_replicated_store_2(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i1 [[C_0:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[C_0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
@@ -366,12 +366,11 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP71]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 96, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC1]], align 1
; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index f099c22333c3e..387bb4302de60 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -6,8 +6,8 @@ target triple = "arm64-apple-macosx11.0.0"
define void @fshl_operand_first_order_recurrence(ptr %dst, ptr noalias %src) {
; CHECK-LABEL: define void @fshl_operand_first_order_recurrence(
; CHECK-SAME: ptr [[DST:%.*]], ptr noalias [[SRC:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -30,14 +30,12 @@ define void @fshl_operand_first_order_recurrence(ptr %dst, ptr noalias %src) {
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[WIDE_LOAD1]], i32 1
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 100, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RECUR:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L]] = load i64, ptr [[GEP_SRC]], align 8
; CHECK-NEXT: [[OR:%.*]] = tail call i64 @llvm.fshl.i64(i64 1, i64 [[RECUR]], i64 1)
@@ -73,7 +71,7 @@ define void @powi_call(ptr %P) {
; CHECK-LABEL: define void @powi_call(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -83,7 +81,7 @@ define void @powi_call(ptr %P) {
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH]]:
+; CHECK: [[SCALAR_PH:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -93,7 +91,7 @@ define void @powi_call(ptr %P) {
; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -224,5 +222,4 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 626242667e203..944f2699d6e62 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -5,7 +5,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-LABEL: define void @clamped_tc_8(
; CHECK-SAME: ptr captures(none) [[DST:%.*]], i32 [[N:%.*]], i64 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
@@ -36,7 +36,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[DST]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]]
@@ -45,7 +45,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
@@ -79,7 +79,7 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[REM]], 7
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD]], 3
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SHR]] to i64
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
@@ -104,13 +104,13 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = ph...
[truncated]
Move the emission of remarks for the vectorization decision before executing the plan, in preparation for #154510.
fhahn
left a comment
ping :)
…e (NFCI). Move the emission of remarks for the vectorization decision before executing the plan, in preparation for llvm/llvm-project#154510.
+static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPlan &Plan, VPBasicBlock *VPBB,
                                              BasicBlock *IRBB) {
-  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
+  VPIRBasicBlock *IRVPBB = Plan.createVPIRBasicBlock(IRBB);
This change is needed because VPBB may no longer be reachable from Plan's entry. VPBB must still have a single predecessor, as documented above, but that pred might itself be predecessor-free? If only some callers pass an unreachable VPBB, Plan could be an optional parameter, to keep other callers intact.
Yep, updated the comment to say that all predecessors/successors are rewired, if any, and added Plan as an optional parameter, which must be passed if the block may be unreachable, thanks!
+  if (Plan.getScalarPreheader()->getNumPredecessors() == 0)
+    return;
Early exit if remainder/default scalar loop is dead, w/o setting/updating profile weights for the vector loop? Worth a comment.
I had to move the code, because preserving profile info on the vector loop when the remainder loop is dead requires retrieving the profile info before removing the remainder loop, and then conditionally setting it on the vector/remainder loops as available.
+  if (BestVPlan.getEntry()->getSingleSuccessor() ==
+      BestVPlan.getScalarPreheader()) {
removeBranchOnConst() could conceivably bypass the vector loop; this actually happens in a few tests. Worth emitting a missed-vectorization remark.
Yep, added an analysis remark. I am not sure if missed-vectorization would be accurate, because this is for cases where we would create a dead vector loop and should not even try to vectorize.
ok, it appears the loop isn't vectorized because the Trip Count guard is known to always jump to the scalar loop, i.e., where VFxUF is known to exceed TC, so conceptually a smaller VFxUF could work. But tests include unvectorizable non-loop cases where TC<=1, which should better be cleaned up before calling LV, certainly before reaching LVP::executePlan().
Agreed, we already have a TODO where we created the known True condition
We have a TODO here too; wondering if the message should specify that vectorization is dead or never executes - due to insufficient trip-count.
Updated message to mention insufficient trip count, thanks
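For reference, the overall shape of the bail-out plus remark discussed here, pieced together from the snippet quoted near the end of this review; the surrounding control flow is paraphrased, and the truncated tail of the message string is an assumption:

  if (BestVPlan.getEntry()->getSingleSuccessor() ==
      BestVPlan.getScalarPreheader()) {
    // Entry branches straight to the scalar preheader, so the vector loop
    // would be dead: emit an analysis remark explaining why, then bail out.
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
                                        OrigLoop->getStartLoc(),
                                        OrigLoop->getHeader())
             << "Created vector loop never executes due to insufficient trip "
                "count"; // "count" is assumed; the quoted snippet truncates
                         // the message here.
    });
    return DenseMap<const SCEV *, Value *>();
  }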
   // replace the vectorizer-specific hints below).
-  if (MDNode *LID = OrigLoop->getLoopID())
-    L->setLoopID(LID);
+  if (BestVPlan.getScalarPreheader()->getNumPredecessors() > 0)
Suggested change:
-  if (BestVPlan.getScalarPreheader()->getNumPredecessors() > 0)
+  } else if (BestVPlan.getScalarPreheader()->getNumPredecessors() > 0) {
The code below will execute in the general else block, so I left it as-is for now.
   }

+  if (BestVPlan.getScalarPreheader()->getNumPredecessors() == 0) {
+    // If the original loop became unreachable, we need to delete it.
"Native" VPlan constructs can simply be discarded when they become dead or unreached (even then lazily), whereas VPlan constructs that model the scalar loop are kept orphaned after becoming unreachable, to be processed here.
This raises an old thought: VPlan specifies code to be generated, using its blocks and recipes; how/could VPlan be extended to also dismantle existing code, perhaps using "anti-recipes" or "anti-blocks" whose execute() performs the desired clean-up.
Yep, unfortunately we cannot do this when the VPIRBasicBlock for the scalar preheader gets executed, as we only execute reachable VPBBs.
I moved it for now to VPlan::execute. Unfortunately doing it in the destructor of VPIRBasicBlock won't work either, because we need access to LoopInfo.
Having it in VPlan::execute is great!
As a potential follow-up: the VPIRBasicBlock of the scalar preheader and/or a VPIRRegionBlock of the scalar loop could be marked for destruction in VPlan if unreachable (as in CreatedBlocks), indicating that its loop should be removed from LoopInfo etc.
Yep, they are implicitly marked for destruction (all VPIRBasicBlocks that are unreachable are tracked in CreatedBlocks) and can be destroyed in the destructor; but they need LoopInfo/DT passed somehow, to notify those about the removal
@@ -53,76 +53,35 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
; VF8UF2-NEXT: [[ENTRY:.*]]:
; VF8UF2-NEXT: [[AND:%.*]] = and i64 [[N]], 15
; VF8UF2-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
Another case skipping the vector loop.
yep
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], [[G_64]]
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
ditto
yep removed dead vector loop
@@ -12,15 +12,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
+; CHECK-NEXT: ir<16> = original trip-count
Simplification due to branch-on-const of entry?
I had to update the test because otherwise the vector loop would be dead
; CHECK-NEXT: }
;
entry:
  %and = and i64 %N, 15
Ah, test changed?
; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP2:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP15:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[BROADCAST_SPLAT]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA3:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP0]]
; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[WIDE_LOAD_1]], [[TMP4]]
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP7]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 32
; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[WIDE_LOAD_2]], [[TMP2]]
; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[TMP10]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 48
; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP13]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[WIDE_LOAD_3]], [[TMP15]]
; CHECK-NEXT: store <4 x float> [[TMP14]], ptr [[TMP13]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 16
?
Fixed now, thanks
Split off from #154510, add helper to check if a block has any predecessors.
Split off from llvm/llvm-project#154510, add helper to check if a block has any predecessors.
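A plausible shape for that helper, sketched from the getNumPredecessors() accessor the patch already uses throughout (the actual implementation in the split-off commit may differ):

  /// Return true if this block has at least one predecessor.
  bool hasPredecessors() const { return getNumPredecessors() != 0; }

It lets checks like getScalarPreheader()->hasPredecessors() in the snippets below read as intent rather than as a count comparison.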
                           bool VectorizingEpilogue,
                           unsigned EstimatedVFxUF,
                           bool DisableRuntimeUnroll);
/// loop on the vector loop and replaces vectorizer-specific metadata
Suggested change:
-/// loop on the vector loop and replaces vectorizer-specific metadata
+/// loop on the vector loop and replaces vectorizer-specific metadata.
i.e., w/o change.
Done, thanks
                           bool VectorizingEpilogue,
                           unsigned EstimatedVFxUF,
                           bool DisableRuntimeUnroll);
/// loop on the vector loop and replaces vectorizer-specific metadata
Augment documentation to cover additional parameters?
added thanks
  BasicBlock *ScalarPh = State->CFG.ExitBB;
  State->CFG.DTU.applyUpdates(
      {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
  if (getScalarPreheader()->hasPredecessors()) {
Suggested change:
-  if (getScalarPreheader()->hasPredecessors()) {
+  VPBasicBlock *VPScalarPh = getScalarPreheader();
+  if (VPScalarPh->hasPredecessors()) {
Reusable below.
Updated, thanks (named ScalarPhVPBB for consistency with other VPBB variables).
  Loop *OrigLoop = State->LI->getLoopFor(
      cast<VPIRBasicBlock>(getScalarPreheader()->getSingleSuccessor())
          ->getIRBasicBlock());
Suggested change:
-  Loop *OrigLoop = State->LI->getLoopFor(
-      cast<VPIRBasicBlock>(getScalarPreheader()->getSingleSuccessor())
-          ->getIRBasicBlock());
+  Loop *OrigLoop = State->LI->getLoopFor(
+      getScalarHeader()->getIRBasicBlock());
?
done thanks
  // If the original loop is unreachable, we need to delete it.
  auto Blocks = OrigLoop->getBlocksVector();
  Blocks.push_back(
      cast<VPIRBasicBlock>(getScalarPreheader())->getIRBasicBlock());
If the scalar preheader is a VPIRBB, should getScalarPreheader() return it as such?
The VPBB for the scalar preheader only gets replaced with the VPIRBB late, so I don't think that's possible yet
  // the epilogue loop, to ensure it is only updated once, or when the became
  // unreachable.
Suggested change:
-  // the epilogue loop, to ensure it is only updated once, or when the became
-  // unreachable.
+  // the epilogue loop to ensure it is updated only once. Also skip the update
+  // when the scalar loop became unreachable.
updated thanks
           vp_depth_first_shallow(Plan.getEntry()))) {
     VPValue *Cond;
-    if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
+    if (VPBB->getNumSuccessors() != 2 || VPBB->empty() ||
If a branch-on-cond will be added only later, the following match will early-continue; but this match may fail when a block (the entry, with two successors) is currently free of recipes? Probably worth an explanatory comment.
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
May be worth leaving a TODO.
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
Why has the metadata been dropped on the scalar loop?
The scalar loop is unreachable now, which means we have to remove it from LoopInfo as an unreachable block is dominated by any other unreachable block, breaking LoopInfo verification.
From the PR description:
In some cases, it may also make the original scalar loop unreachable,
because we know it will never execute. In that case, we need to remove
the loop from LoopInfo, because all unreachable blocks may dominate each
other, making LoopInfo invalid. In those cases, we can also completely
remove the loop, for which I'll share a follow-up patch.
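Assembling the snippets from this review thread, the cleanup ends up looking roughly like the sketch below in VPlan::execute (reconstructed, not copied verbatim from the final commit):

  if (!getScalarPreheader()->hasPredecessors()) {
    Loop *OrigLoop =
        State->LI->getLoopFor(getScalarHeader()->getIRBasicBlock());
    // The scalar loop became unreachable. Unreachable blocks may dominate
    // each other, so leaving the loop in LoopInfo would break verification:
    // drop all of its blocks, plus the scalar preheader, then erase the loop.
    auto Blocks = OrigLoop->getBlocksVector();
    Blocks.push_back(
        cast<VPIRBasicBlock>(getScalarPreheader())->getIRBasicBlock());
    for (BasicBlock *BB : Blocks)
      State->LI->removeBlock(BB);
    State->LI->erase(OrigLoop);
  }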
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]]
Again, this doesn't look right; not sure why the loop vectoriser is killing off metadata from the original scalar loop?
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s --check-prefix=CHECK
 ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST
+; RUN: opt -passes=loop-vectorize,instcombine,simplifycfg < %s -S -o - | FileCheck %s --check-prefix=CHECK
Why are we adding extra passes here? Can you not just simply check the vectoriser output? It adds lots of extra test changes that aren't related to the patch as well.
Yes, this was added back by accident when updating to latest main
 ; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
-; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
Scalar loop metadata dropped.
same as above, loop now unreachable
 %for.next = sitofp i32 %iv to double
 %iv.next = add nsw i32 %iv, 1
-%ec = icmp eq i32 %iv.next, 1025
+%ec = icmp eq i32 %iv.next, %n
Why has this changed?
Without it there would be no merge phis in scalar.ph, which seems to be what the test needs to check.
 store i8 1, ptr %arrayidx, align 1
 %iv.next = add nuw nsw i64 %iv, 1
-%exitcond = icmp ne i64 %iv.next, %n
+%exitcond = icmp ne i64 %iv.next, 1024
Why has this changed? I thought @hassnaaHamdi explicitly wanted to make this a variable in #157512. I think you can revert the change to this file.
Restored the code, thanks
ayalz
left a comment
This LGTM with minor last suggestions. Pending @david-arm's consent.
/// the average trip count and invocation weight of the original loop (\p
/// OrigAverageTripCount and \p OrigLoopInvocationWeight respectively. They
/// cannot be retrieved after the plan has been executed, as the original loop
/// may have been removed.
Suggested change:
-/// may have been removed.
+/// may have been removed).
Closed the ) above, thanks.
   // iterations of the loop are handled in one vector iteration, so instead
   // use the value of vscale used for tuning.
-  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
+  if (OrigAverageTripCount) {
Early exit, as originally in llvm::setProfileInfoAfterUnrolling()?
Done, thanks
Done thanks!
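The agreed-on shape, as a sketch (names taken from the quoted snippet; the elided body is paraphrased):

  // No profile data for the original loop: nothing to set, mirroring the
  // early exit in llvm::setProfileInfoAfterUnrolling().
  if (!OrigAverageTripCount)
    return;
  // ... otherwise set weights on the vector loop, and on the remainder loop
  // only if it is still reachable.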
           vp_depth_first_shallow(Plan.getEntry()))) {
     VPValue *Cond;
-    if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
+    if (VPBB->getNumSuccessors() != 2 || VPBB->empty() ||
Thanks. Would it suffice to check only if VPBB terminator matches a BranchOnCond? Is the emptiness check needed to prevent failure of this match? Check for two successors could then be an assert.
           vp_depth_first_shallow(Plan.getEntry()))) {
     VPValue *Cond;
-    if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
+    if (VPBB->getNumSuccessors() != 2 || VPBB->empty() ||
Understood; plus, even with a pair of successors, VPBB may be empty. Is it then (necessary and) sufficient to check

  if (VPBB->empty() ||
      !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
    continue;
  assert(VPBB->getNumSuccessors() == 2 &&
         "Two successors expected for BranchOnCond");

but the redundant first check for two successors may early-continue faster?
  return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
                                    OrigLoop->getStartLoc(),
                                    OrigLoop->getHeader())
         << "Created vector loop never executes due insufficient trip "
Suggested change:
-         << "Created vector loop never executes due insufficient trip "
+         << "Created vector loop never executes due to insufficient trip "
Ah yes, updated, thanks
…4510) After llvm/llvm-project#153643, there may be a BranchOnCond with constant condition in the entry block. Simplify those in removeBranchOnConst. This removes a number of redundant conditional branches from entry blocks. In some cases, it may also make the original scalar loop unreachable, because we know it will never execute. In that case, we need to remove the loop from LoopInfo, because all unreachable blocks may dominate each other, making LoopInfo invalid. In those cases, we can also completely remove the loop, for which I'll share a follow-up patch. Depends on llvm/llvm-project#153643. PR: llvm/llvm-project#154510
Build on top of llvm#154510 to completely remove dead scalar loops. Depends on llvm#154510. (Included in the PR)
Build on top of llvm/llvm-project#154510 to completely remove the blocks of dead scalar loops. Depends on llvm/llvm-project#154510. PR: llvm/llvm-project#155497
Build on top of llvm#154510 to completely remove the blocks of dead scalar loops. Depends on llvm#154510. PR: llvm#155497
After #153643, there may be a
BranchOnCond with constant condition in the entry block.
Simplify those in removeBranchOnConst. This removes a number of
redundant conditional branches from entry blocks.
In some cases, it may also make the original scalar loop unreachable,
because we know it will never execute. In that case, we need to remove
the loop from LoopInfo, because all unreachable blocks may dominate each
other, making LoopInfo invalid. In those cases, we can also completely
remove the loop, for which I'll share a follow-up patch.
A couple of places that assume the scalar loop to still be valid need
updating.
Depends on #153643.
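For orientation, a condensed sketch of the traversal this patch touches in removeBranchOnConst, reconstructed from the VPlanTransforms.cpp hunk earlier in the diff rather than copied from upstream (the constant-folding body is elided):

void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getEntry()))) {
    VPValue *Cond;
    // The entry block is no longer skipped; empty blocks are skipped
    // instead, since a block without recipes cannot end in a BranchOnCond
    // and the match below would not be safe on it.
    if (VPBB->getNumSuccessors() != 2 || VPBB->empty() ||
        !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
      continue;
    // ... if Cond is a known constant, fold the branch and disconnect the
    // dead successor, which can leave the scalar preheader unreachable.
  }
}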