From c274eea4e968a1c9ac7aa8288c04f8cd1e8ec2ee Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 18 Mar 2025 22:37:13 +0000 Subject: [PATCH 1/6] [VPlan] Replace VPRegionBlock with explicit CFG before execute (NFCI). !fixup update more tests. --- .../Transforms/Vectorize/LoopVectorize.cpp | 31 ++- llvm/lib/Transforms/Vectorize/VPlan.cpp | 191 ++++++++++-------- llvm/lib/Transforms/Vectorize/VPlan.h | 7 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 16 ++ .../Transforms/Vectorize/VPlanTransforms.h | 5 +- .../AArch64/epilog-iv-select-cmp.ll | 12 +- .../AArch64/reduction-recurrence-costs-sve.ll | 10 +- .../LoopVectorize/AArch64/vplan-printing.ll | 37 ++-- .../RISCV/riscv-vector-reverse.ll | 74 ++++--- .../RISCV/vplan-vp-select-intrinsics.ll | 51 +++-- .../LoopVectorize/vplan-predicate-switch.ll | 113 +++++------ 12 files changed, 295 insertions(+), 265 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0b8b0c7dcdfc9..ba9f01e2a330a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2760,6 +2760,15 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, return TTI.getIntrinsicInstrCost(CostAttrs, CostKind); } +static VPBasicBlock *getHeaderForMainVectorLoop(VPlan &Plan, + VPDominatorTree &VPDT) { + return find_singleton( + vp_depth_first_shallow(Plan.getEntry()), [&VPDT](VPBlockBase *VPB, bool) { + auto *VPBB = dyn_cast(VPB); + return VPBB && VPBB->isHeader(VPDT) ? VPBB : nullptr; + }); +} + void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Fix widened non-induction PHIs by setting up the PHI operands. if (EnableVPlanNativePath) @@ -2778,13 +2787,13 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { PSE.getSE()->forgetLoop(OrigLoop); PSE.getSE()->forgetBlockAndLoopDispositions(); - // Don't apply optimizations below when no vector region remains, as they all - // require a vector loop at the moment. - if (!State.Plan->getVectorLoopRegion()) + // Don't apply optimizations below when no vector loop remains, as they all + // require one at the moment. + VPBasicBlock *HeaderVPBB = + getHeaderForMainVectorLoop(*State.Plan, State.VPDT); + if (!HeaderVPBB) return; - VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); - VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; // Remove redundant induction instructions. @@ -2809,7 +2818,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { } void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { - auto Iter = vp_depth_first_deep(Plan.getEntry()); + auto Iter = vp_depth_first_shallow(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &P : VPBB->phis()) { VPWidenPHIRecipe *VPPhi = dyn_cast(&P); @@ -7799,6 +7808,9 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); + + VPBasicBlock *MiddleVPBB = + BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr; VPlanTransforms::convertToConcreteRecipes(BestVPlan, *Legal->getWidestInductionType()); @@ -7894,14 +7906,14 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.6. 
Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). - if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) { + VPBasicBlock *HeaderVPBB = getHeaderForMainVectorLoop(BestVPlan, State.VPDT); + if (HeaderVPBB) { MDNode *OrigLoopID = OrigLoop->getLoopID(); std::optional VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); - VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); if (VectorizedLoopID) { L->setLoopID(*VectorizedLoopID); @@ -7947,8 +7959,7 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - if (BestVPlan.getVectorLoopRegion()) { - auto *MiddleVPBB = BestVPlan.getMiddleBlock(); + if (HeaderVPBB) { auto *MiddleTerm = cast(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); if (MiddleTerm->isConditional() && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 167aff737d3fd..6d35862ef6ad4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -207,6 +207,11 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } +bool VPBasicBlock::isHeader(const VPDominatorTree &VPDT) const { + return getPredecessors().size() == 2 && + VPDT.dominates(this, getPredecessors()[1]); +} + VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { iterator It = begin(); while (It != end() && It->isPhi()) @@ -424,7 +429,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); - BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; + BasicBlock *PredBB = CFG.VPBB2IRBB.lookup(PredVPBB); + if (!PredBB) + continue; assert(PredBB && "Predecessor basic-block not found building successor."); auto *PredBBTerminator = PredBB->getTerminator(); @@ -432,6 +439,8 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { auto *TermBr = dyn_cast(PredBBTerminator); if (isa(PredBBTerminator)) { + if (PredVPSuccessors.size() == 2) + continue; assert(PredVPSuccessors.size() == 1 && "Predecessor ending w/o branch must have single successor."); DebugLoc DL = PredBBTerminator->getDebugLoc(); @@ -487,11 +496,25 @@ void VPBasicBlock::execute(VPTransformState *State) { bool Replica = bool(State->Lane); BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. + if (isHeader(State->VPDT)) { + // Create and register the new vector loop. + Loop *PrevParentLoop = State->CurrentParentLoop; + State->CurrentParentLoop = State->LI->AllocateLoop(); + + // Insert the new loop into the loop nest and register the new basic blocks + // before calling any utilities such as SCEV that require valid LoopInfo. + if (PrevParentLoop) + PrevParentLoop->addChildLoop(State->CurrentParentLoop); + else + State->LI->addTopLevelLoop(State->CurrentParentLoop); + } + auto IsReplicateRegion = [](VPBlockBase *BB) { auto *R = dyn_cast_or_null(BB); - return R && R->isReplicator(); + assert((!R || R->isReplicator()) && + "only replicate region blocks should remain"); + return R; }; - // 1. Create an IR basic block. 
if ((Replica && this == getParent()->getEntry()) || IsReplicateRegion(getSingleHierarchicalPredecessor())) { @@ -514,6 +537,14 @@ void VPBasicBlock::execute(VPTransformState *State) { // 2. Fill the IR basic block with IR instructions. executeRecipes(State, NewBB); + + // If this block is a latch, update CurrentParentLoop. + if (any_of(getSuccessors(), [State, this](VPBlockBase *Succ) { + auto *VPBB = dyn_cast(Succ); + return VPBB && VPBB->isHeader(State->VPDT) && + State->VPDT.dominates(Succ, this); + })) + State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop(); } VPBasicBlock *VPBasicBlock::clone() { @@ -725,35 +756,13 @@ VPRegionBlock *VPRegionBlock::clone() { } void VPRegionBlock::execute(VPTransformState *State) { - ReversePostOrderTraversal> - RPOT(Entry); - - if (!isReplicator()) { - // Create and register the new vector loop. - Loop *PrevParentLoop = State->CurrentParentLoop; - State->CurrentParentLoop = State->LI->AllocateLoop(); - - // Insert the new loop into the loop nest and register the new basic blocks - // before calling any utilities such as SCEV that require valid LoopInfo. - if (PrevParentLoop) - PrevParentLoop->addChildLoop(State->CurrentParentLoop); - else - State->LI->addTopLevelLoop(State->CurrentParentLoop); - - // Visit the VPBlocks connected to "this", starting from it. - for (VPBlockBase *Block : RPOT) { - LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); - Block->execute(State); - } - - State->CurrentParentLoop = PrevParentLoop; - return; - } - + assert(isReplicator() && + "Loop regions should have been lowered to plain CFG"); assert(!State->Lane && "Replicating a Region with non-null instance."); - - // Enter replicating mode. assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + + ReversePostOrderTraversal> RPOT( + Entry); State->Lane = VPLane(0); for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; ++Lane) { @@ -847,6 +856,22 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif +void VPRegionBlock::removeRegion() { + auto *Header = cast(getEntry()); + VPBlockBase *Preheader = getSinglePredecessor(); + auto *Exiting = cast(getExiting()); + + VPBlockBase *Middle = getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(Preheader, this); + VPBlockUtils::disconnectBlocks(this, Middle); + + for (VPBlockBase *VPB : vp_depth_first_shallow(Entry)) + VPB->setParent(nullptr); + + VPBlockUtils::connectBlocks(Preheader, Header); + VPBlockUtils::connectBlocks(Exiting, Middle); +} + VPlan::VPlan(Loop *L) { setEntry(createVPIRBasicBlock(L->getLoopPreheader())); ScalarHeader = createVPIRBasicBlock(L->getHeader()); @@ -956,57 +981,57 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); - State->CFG.DTU.flush(); - - auto *LoopRegion = getVectorLoopRegion(); - if (!LoopRegion) - return; - - VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); - BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; - // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. - VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - // Skip phi-like recipes that generate their backedege values themselves. 
- if (isa(&R)) + for (VPBasicBlock *Header : + VPBlockUtils::blocksOnly(vp_depth_first_shallow(Entry))) { + if (!Header->isHeader(State->VPDT)) continue; + for (VPRecipeBase &R : Header->phis()) { + if (isa(&R)) + continue; - if (isa(&R)) { - PHINode *Phi = nullptr; - if (isa(&R)) { - Phi = cast(State->get(R.getVPSingleValue())); - } else { - auto *WidenPhi = cast(&R); - assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && - "recipe generating only scalars should have been replaced"); - auto *GEP = cast(State->get(WidenPhi)); - Phi = cast(GEP->getPointerOperand()); + auto *LatchVPBB = cast(Header->getPredecessors()[1]); + BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; + + if (isa(&R)) { + PHINode *Phi = nullptr; + if (isa(&R)) { + Phi = cast(State->get(R.getVPSingleValue())); + } else { + auto *WidenPhi = cast(&R); + assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && + "recipe generating only scalars should have been replaced"); + auto *GEP = cast(State->get(WidenPhi)); + Phi = cast(GEP->getPointerOperand()); + } + + Phi->setIncomingBlock(1, VectorLatchBB); + + // Move the last step to the end of the latch block. This ensures + // consistent placement of all induction updates. + Instruction *Inc = cast(Phi->getIncomingValue(1)); + Inc->moveBefore( + std::prev(VectorLatchBB->getTerminator()->getIterator())); + + // Use the steps for the last part as backedge value for the induction. + if (auto *IV = dyn_cast(&R)) + Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); + continue; } - Phi->setIncomingBlock(1, VectorLatchBB); - - // Move the last step to the end of the latch block. This ensures - // consistent placement of all induction updates. - Instruction *Inc = cast(Phi->getIncomingValue(1)); - Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator())); - - // Use the steps for the last part as backedge value for the induction. - if (auto *IV = dyn_cast(&R)) - Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); - continue; + auto *PhiR = cast(&R); + // VPInstructions currently model scalar Phis only. + bool NeedsScalar = isa(PhiR) || + (isa(PhiR) && + cast(PhiR)->isInLoop()); + + Value *Phi = State->get(PhiR, NeedsScalar); + // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does + // not. + Value *Val = State->get(PhiR->getOperand(1), NeedsScalar); + cast(Phi)->addIncoming(Val, VectorLatchBB); } - - auto *PhiR = cast(&R); - // VPInstructions currently model scalar Phis only. - bool NeedsScalar = isa(PhiR) || - (isa(PhiR) && - cast(PhiR)->isInLoop()); - Value *Phi = State->get(PhiR, NeedsScalar); - // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does not. - Value *Val = State->get(PhiR->getOperand(1), NeedsScalar); - cast(Phi)->addIncoming(Val, VectorLatchBB); } } @@ -1365,16 +1390,16 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { #endif -/// Returns true if there is a vector loop region and \p VPV is defined in a -/// loop region. 
-static bool isDefinedInsideLoopRegions(const VPValue *VPV) { - const VPRecipeBase *DefR = VPV->getDefiningRecipe(); - return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() || - DefR->getParent()->getEnclosingLoopRegion()); -} - bool VPValue::isDefinedOutsideLoopRegions() const { - return !isDefinedInsideLoopRegions(this); + auto *DefR = getDefiningRecipe(); + if (!DefR) + return true; + + const VPBasicBlock *DefVPBB = DefR->getParent(); + auto *Plan = DefVPBB->getPlan(); + if (Plan->getVectorLoopRegion()) + return !DefR->getParent()->getEnclosingLoopRegion(); + return DefVPBB == Plan->getEntry(); } void VPValue::replaceAllUsesWith(VPValue *New) { replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; }); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 2c4cac7655ec9..3931583233bb6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3415,6 +3415,9 @@ class VPBasicBlock : public VPBlockBase { /// second predecessor is the exiting block of the region. const VPBasicBlock *getCFGPredecessor(unsigned Idx) const; + /// Returns true if the block is a loop header in a plain-CFG VPlan. + bool isHeader(const VPDominatorTree &VPDT) const; + protected: /// Execute the recipes in the IR basic block \p BB. void executeRecipes(VPTransformState *State, BasicBlock *BB); @@ -3566,6 +3569,10 @@ class VPRegionBlock : public VPBlockBase { /// Clone all blocks in the single-entry single-exit region of the block and /// their recipes without updating the operands of the cloned recipes. VPRegionBlock *clone() override; + + /// Remove the current region from its VPlan, connecting its predecessor to + /// its entry and exiting block to its successor. + void removeRegion(); }; /// VPlan models a candidate for vectorization, encoding various decisions take diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6a4ffac200b1c..58da8610a354b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -583,11 +583,10 @@ Value *VPInstruction::generate(VPTransformState &State) { CondBr->setSuccessor(0, nullptr); Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); - if (!getParent()->isExiting()) + VPBasicBlock *Header = cast(getParent()->getSuccessors()[1]); + if (!State.CFG.VPBB2IRBB.contains(Header)) return CondBr; - VPRegionBlock *ParentRegion = getParent()->getParent(); - VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); return CondBr; } @@ -598,9 +597,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Cond = Builder.CreateICmpEQ(IV, TC); // Now create the branch. 
- auto *Plan = getParent()->getPlan(); - VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); + VPBasicBlock *Header = cast(getParent()->getSuccessors()[1]); // Replace the temporary unreachable terminator with a new conditional // branch, hooking it up to backward destination (the header) now and to the @@ -1124,10 +1121,6 @@ void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, void VPPhi::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); - assert(getParent() == - getParent()->getPlan()->getVectorLoopRegion()->getEntry() && - "VPInstructions with PHI opcodes must be used for header phis only " - "at the moment"); BasicBlock *VectorPH = State.CFG.VPBB2IRBB.at(getIncomingBlock(0)); Value *Start = State.get(getIncomingValue(0), VPLane(0)); PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, getName()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b10b47cc1282a..e8481d0bf0c27 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2394,10 +2394,26 @@ void VPlanTransforms::createInterleaveGroups( void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { + // Replace loop regions with explicity CFG. + SmallVector LoopRegions; + for (VPRegionBlock *R : VPBlockUtils::blocksOnly( + vp_depth_first_deep(Plan.getEntry()))) { + if (!R->isReplicator()) + LoopRegions.push_back(R); + } + for (VPRegionBlock *R : LoopRegions) { + VPBlockBase *Header = R->getEntry(); + VPBlockBase *Latch = R->getExiting(); + R->removeRegion(); + // Add explicit backedge. + VPBlockUtils::connectBlocks(Latch, Header); + } + using namespace llvm::VPlanPatternMatch; VPTypeAnalysis TypeInfo(&CanonicalIVTy); SmallVector ToRemove; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_deep(Plan.getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index cb127d37661c7..4b80875b79f13 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -184,8 +184,9 @@ struct VPlanTransforms { VPRecipeBuilder &RecipeBuilder, VFRange &Range); - /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p - /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. + /// Lower abstract recipes to concrete ones, that can be codegen'd and replace + /// loop regions with explicit CFG. Use \p CanonicalIVTy as type for all + /// un-typed live-ins in VPTypeAnalysis. static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); /// Perform instcombine-like simplifications on recipes in \p Plan. 
Use \p diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index c0806ea16a5fc..d4494089f7083 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -153,11 +153,10 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) ; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP4]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]] -; CHECK-NEXT: [[TMP5]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]] -; CHECK-NEXT: [[TMP6]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]] +; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -196,8 +195,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] +; CHECK-NEXT: [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] ; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], splat (i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC8]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 969bb413f9c50..c2fe37ad214c6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -74,10 +74,7 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = and [[TMP12]], splat (i32 1) ; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = xor [[TMP13]], splat (i32 1) ; VSCALEFORTUNING2-NEXT: [[TMP15:%.*]] = zext [[TMP14]] to -; VSCALEFORTUNING2-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 -; VSCALEFORTUNING2-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[TMP16]] -; VSCALEFORTUNING2-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement poison, ptr [[TMP17]], i64 0 -; VSCALEFORTUNING2-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; VSCALEFORTUNING2-NEXT: [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], [[TMP15]] ; VSCALEFORTUNING2-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32() ; VSCALEFORTUNING2-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 4 ; VSCALEFORTUNING2-NEXT: [[TMP20:%.*]] = sub i32 [[TMP19]], 1 @@ -210,10 +207,7 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; PRED-NEXT: [[TMP17:%.*]] = and [[TMP16]], splat (i32 1) ; PRED-NEXT: [[TMP18:%.*]] = xor [[TMP17]], splat (i32 1) ; PRED-NEXT: [[TMP19:%.*]] = zext [[TMP18]] to -; PRED-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 -; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[TMP20]] -; PRED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, ptr [[TMP21]], i64 0 -; PRED-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], [[TMP19]] ; PRED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() ; PRED-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 4 ; PRED-NEXT: [[TMP24:%.*]] = sub i32 [[TMP23]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index 567aa63483771..2e9d90f762ccd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -83,27 +83,24 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: Successor(s): vector loop +; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> (VF scaled by 1/4) -; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]> -; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> -; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> -; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 -; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]> -; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> -; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> -; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> -; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul> -; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> -; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> (VF scaled by 1/4) +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]> +; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]> +; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load 
vp<[[PTR_B]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> +; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul> +; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> +; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> +; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 9e77a0ca8bcc9..0d77dfc50dd70 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -193,26 +193,23 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul i64 %17, 4 ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1> -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> +; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> +; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]> @@ -444,26 
+441,23 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul i64 %17, 4 ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1> -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00> +; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> +; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll index b2ec86ea3ec53..86647b1386ec5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll @@ -28,33 +28,30 @@ ; IF-EVL-NEXT: IR %n.vec = sub i64 %n.rnd.up, %n.mod.vf ; IF-EVL-NEXT: IR %7 = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: IR %8 = mul i64 %7, 4 - ; IF-EVL-NEXT: Successor(s): vector loop - - ; IF-EVL: vector loop: { - ; IF-EVL-NEXT: vector.body: - ; IF-EVL-NEXT: EMIT vp<[[IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[IV_NEXT_EXIT:%.+]]>, vector.body ] - ; IF-EVL-NEXT: EMIT vp<[[EVL_PHI:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[IV_NEX:%.+]]>, vector.body ] - ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> 
= sub ir<%N>, vp<[[EVL_PHI]]> - ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> - ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[EVL_PHI]]> - ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> - ; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> - ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[EVL_PHI]]> - ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> - ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> - ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]> - ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub ir<0>, ir<[[LD2]]> - ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<[[LD2]]>, ir<[[SUB]]>, vp<[[EVL]]>) - ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add vp<[[SELECT]]>, ir<[[LD1]]> - ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]> - ; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]> - ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> - ; IF-EVL-NEXT: EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 - ; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> - ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT]]> = add vp<[[IV]]>, ir<[[VFUF]]> - ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, ir<[[VTC]]> - ; IF-EVL-NEXT: No successors - ; IF-EVL-NEXT: } + ; IF-EVL-NEXT: Successor(s): vector.body + ; IF-EVL-EMPTY: + ; IF-EVL-NEXT: vector.body: + ; IF-EVL-NEXT: EMIT vp<[[IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[IV_NEXT_EXIT:%.+]]>, vector.body ] + ; IF-EVL-NEXT: EMIT vp<[[EVL_PHI:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[IV_NEX:%.+]]>, vector.body ] + ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> + ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> + ; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> + ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]> + ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub ir<0>, ir<[[LD2]]> + ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<[[LD2]]>, ir<[[SUB]]>, vp<[[EVL]]>) + ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add vp<[[SELECT]]>, ir<[[LD1]]> + ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]> + ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 + ; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT]]> = add vp<[[IV]]>, ir<[[VFUF]]> + ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, ir<[[VTC]]> entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll index 61a5bd69b7ba3..59e2664cc1402 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll +++ 
b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll @@ -15,75 +15,72 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, 2 ; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf ; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<%start> + ir<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, default.2 ] -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<2> -; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]> -; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]> -; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]> -; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12> -; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13> -; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]> -; CHECK-NEXT: EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]> -; CHECK-NEXT: Successor(s): pred.store -; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[C2]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store ir<0>, vp<[[PTR]]> -; CHECK-NEXT: Successor(s): pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): if.then.2.0 +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, default.2 ] +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<2> +; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]> +; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12> +; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13> +; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]> +; CHECK-NEXT: EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[C2]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE store ir<0>, vp<[[PTR]]> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): if.then.2.0 ; CHECK-EMPTY: -; CHECK-NEXT: if.then.2.0: -; CHECK-NEXT: Successor(s): pred.store +; CHECK-NEXT: if.then.2.0: +; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[C1]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[C1]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store ir<42>, vp<[[PTR]]> -; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-NEXT: REPLICATE store ir<42>, vp<[[PTR]]> +; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: -; 
CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): if.then.1.1 +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): if.then.1.1 ; CHECK-EMPTY: -; CHECK-NEXT: if.then.1.1: -; CHECK-NEXT: Successor(s): pred.store +; CHECK-NEXT: if.then.1.1: +; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[DEFAULT_MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[DEFAULT_MASK]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store ir<2>, vp<[[PTR]]> -; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE store ir<2>, vp<[[PTR]]> +; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: -; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): default.2 -; CHECK-EMPTY: -; CHECK-NEXT: default.2: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VTC]]> +; CHECK-NEXT: pred.store.continue: ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block +; CHECK-NEXT: Successor(s): default.2 +; CHECK-EMPTY: +; CHECK-NEXT: default.2: +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VTC]]> +; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<%0>, ir<[[VTC]]> From e3d37546e2ee2850ff140f5ea523cd44ce5ad078 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 12 May 2025 12:32:25 +0100 Subject: [PATCH 2/6] !fixup address latest comments, thanks --- .../Transforms/Vectorize/LoopVectorize.cpp | 17 +-- llvm/lib/Transforms/Vectorize/VPlan.cpp | 136 +++++++++--------- llvm/lib/Transforms/Vectorize/VPlan.h | 16 +-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 29 ++-- .../Transforms/Vectorize/VPlanTransforms.cpp | 19 +-- .../Transforms/Vectorize/VPlanTransforms.h | 8 +- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 12 +- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 5 + llvm/lib/Transforms/Vectorize/VPlanValue.h | 4 +- 10 files changed, 127 insertions(+), 121 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index be8decbfaf7c9..f6e3f3a36deb1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2760,15 +2760,6 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, return TTI.getIntrinsicInstrCost(CostAttrs, CostKind); } -static VPBasicBlock *getHeaderForMainVectorLoop(VPlan &Plan, - VPDominatorTree &VPDT) { - return find_singleton( - vp_depth_first_shallow(Plan.getEntry()), [&VPDT](VPBlockBase *VPB, bool) { - auto *VPBB = dyn_cast(VPB); - return VPBB && VPBB->isHeader(VPDT) ? VPBB : nullptr; - }); -} - void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Fix widened non-induction PHIs by setting up the PHI operands. 
if (EnableVPlanNativePath) @@ -2787,10 +2778,10 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { PSE.getSE()->forgetLoop(OrigLoop); PSE.getSE()->forgetBlockAndLoopDispositions(); - // Don't apply optimizations below when no vector loop remains, as they all + // Don't apply optimizations below when no (vector) loop remains, as they all // require one at the moment. VPBasicBlock *HeaderVPBB = - getHeaderForMainVectorLoop(*State.Plan, State.VPDT); + vputils::getTopLevelVectorLoopHeader(*State.Plan, State.VPDT); if (!HeaderVPBB) return; @@ -7811,6 +7802,7 @@ DenseMap LoopVectorizationPlanner::executePlan( VPBasicBlock *MiddleVPBB = BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr; + VPlanTransforms::disolveLoopRegions(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan, *Legal->getWidestInductionType()); @@ -7906,7 +7898,8 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). - VPBasicBlock *HeaderVPBB = getHeaderForMainVectorLoop(BestVPlan, State.VPDT); + VPBasicBlock *HeaderVPBB = + vputils::getTopLevelVectorLoopHeader(BestVPlan, State.VPDT); if (HeaderVPBB) { MDNode *OrigLoopID = OrigLoop->getLoopID(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 6d35862ef6ad4..8aa769fdbad63 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -207,9 +207,17 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -bool VPBasicBlock::isHeader(const VPDominatorTree &VPDT) const { - return getPredecessors().size() == 2 && - VPDT.dominates(this, getPredecessors()[1]); +bool VPBlockUtils::isHeader(const VPBlockBase *VPB, + const VPDominatorTree &VPDT) { + auto *VPBB = dyn_cast(VPB); + if (!VPBB) + return false; + if (auto *R = VPBB->getParent()) + return !R->isReplicator() && VPBB->getNumPredecessors() == 0; + + assert(!VPB->getParent() && "checking blocks in regions not implemented yet"); + return VPB->getPredecessors().size() == 2 && + VPDT.dominates(VPB, VPB->getPredecessors()[1]); } VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { @@ -425,22 +433,23 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { if (ParentLoop && !State.LI->getLoopFor(NewBB)) ParentLoop->addBasicBlockToLoop(NewBB, *State.LI); + auto Preds = to_vector(getHierarchicalPredecessors()); + if (VPBlockUtils::isHeader(this, State.VPDT)) { + // There's no block yet for the latch, don't try to connect it yet. + Preds = {Preds[0]}; + } + // Hook up the new basic block to its predecessors. 
- for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { + for (VPBlockBase *PredVPBlock : Preds) { VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); BasicBlock *PredBB = CFG.VPBB2IRBB.lookup(PredVPBB); - if (!PredBB) - continue; - assert(PredBB && "Predecessor basic-block not found building successor."); auto *PredBBTerminator = PredBB->getTerminator(); LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); auto *TermBr = dyn_cast(PredBBTerminator); if (isa(PredBBTerminator)) { - if (PredVPSuccessors.size() == 2) - continue; assert(PredVPSuccessors.size() == 1 && "Predecessor ending w/o branch must have single successor."); DebugLoc DL = PredBBTerminator->getDebugLoc(); @@ -496,7 +505,7 @@ void VPBasicBlock::execute(VPTransformState *State) { bool Replica = bool(State->Lane); BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. - if (isHeader(State->VPDT)) { + if (VPBlockUtils::isHeader(this, State->VPDT)) { // Create and register the new vector loop. Loop *PrevParentLoop = State->CurrentParentLoop; State->CurrentParentLoop = State->LI->AllocateLoop(); @@ -539,11 +548,8 @@ void VPBasicBlock::execute(VPTransformState *State) { executeRecipes(State, NewBB); // If this block is a latch, update CurrentParentLoop. - if (any_of(getSuccessors(), [State, this](VPBlockBase *Succ) { - auto *VPBB = dyn_cast(Succ); - return VPBB && VPBB->isHeader(State->VPDT) && - State->VPDT.dominates(Succ, this); - })) + if (getNumSuccessors() == 2 && + VPBlockUtils::isHeader(getSuccessors()[1], State->VPDT)) State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop(); } @@ -866,10 +872,11 @@ void VPRegionBlock::removeRegion() { VPBlockUtils::disconnectBlocks(this, Middle); for (VPBlockBase *VPB : vp_depth_first_shallow(Entry)) - VPB->setParent(nullptr); + VPB->setParent(getParent()); VPBlockUtils::connectBlocks(Preheader, Header); VPBlockUtils::connectBlocks(Exiting, Middle); + VPBlockUtils::connectBlocks(Exiting, Header); } VPlan::VPlan(Loop *L) { @@ -981,57 +988,57 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); + VPBasicBlock *Header = + vputils::getTopLevelVectorLoopHeader(*this, State->VPDT); + if (!Header) + return; + + auto *LatchVPBB = cast(Header->getPredecessors()[1]); + BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; + // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. - for (VPBasicBlock *Header : - VPBlockUtils::blocksOnly(vp_depth_first_shallow(Entry))) { - if (!Header->isHeader(State->VPDT)) + for (VPRecipeBase &R : Header->phis()) { + // Skip phi-like recipes that generate their backedege values themselves. + if (isa(&R)) continue; - for (VPRecipeBase &R : Header->phis()) { - if (isa(&R)) - continue; - auto *LatchVPBB = cast(Header->getPredecessors()[1]); - BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; - - if (isa(&R)) { - PHINode *Phi = nullptr; - if (isa(&R)) { - Phi = cast(State->get(R.getVPSingleValue())); - } else { - auto *WidenPhi = cast(&R); - assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && - "recipe generating only scalars should have been replaced"); - auto *GEP = cast(State->get(WidenPhi)); - Phi = cast(GEP->getPointerOperand()); - } - - Phi->setIncomingBlock(1, VectorLatchBB); - - // Move the last step to the end of the latch block. This ensures - // consistent placement of all induction updates. 
- Instruction *Inc = cast(Phi->getIncomingValue(1)); - Inc->moveBefore( - std::prev(VectorLatchBB->getTerminator()->getIterator())); - - // Use the steps for the last part as backedge value for the induction. - if (auto *IV = dyn_cast(&R)) - Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); - continue; + if (isa(&R)) { + PHINode *Phi = nullptr; + if (isa(&R)) { + Phi = cast(State->get(R.getVPSingleValue())); + } else { + auto *WidenPhi = cast(&R); + assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && + "recipe generating only scalars should have been replaced"); + auto *GEP = cast(State->get(WidenPhi)); + Phi = cast(GEP->getPointerOperand()); } - auto *PhiR = cast(&R); - // VPInstructions currently model scalar Phis only. - bool NeedsScalar = isa(PhiR) || - (isa(PhiR) && - cast(PhiR)->isInLoop()); - - Value *Phi = State->get(PhiR, NeedsScalar); - // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does - // not. - Value *Val = State->get(PhiR->getOperand(1), NeedsScalar); - cast(Phi)->addIncoming(Val, VectorLatchBB); + Phi->setIncomingBlock(1, VectorLatchBB); + + // Move the last step to the end of the latch block. This ensures + // consistent placement of all induction updates. + Instruction *Inc = cast(Phi->getIncomingValue(1)); + Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator())); + + // Use the steps for the last part as backedge value for the induction. + if (auto *IV = dyn_cast(&R)) + Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); + continue; } + + auto *PhiR = cast(&R); + // VPInstructions currently model scalar Phis only. + bool NeedsScalar = isa(PhiR) || + (isa(PhiR) && + cast(PhiR)->isInLoop()); + + Value *Phi = State->get(PhiR, NeedsScalar); + // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does + // not. + Value *Val = State->get(PhiR->getOperand(1), NeedsScalar); + cast(Phi)->addIncoming(Val, VectorLatchBB); } } @@ -1390,17 +1397,18 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { #endif -bool VPValue::isDefinedOutsideLoopRegions() const { +bool VPValue::isDefinedOutsideLoop() const { auto *DefR = getDefiningRecipe(); if (!DefR) return true; + // For non-live-ins, check if is in a region only if the top-level loop region + // still exits. const VPBasicBlock *DefVPBB = DefR->getParent(); auto *Plan = DefVPBB->getPlan(); - if (Plan->getVectorLoopRegion()) - return !DefR->getParent()->getEnclosingLoopRegion(); - return DefVPBB == Plan->getEntry(); + return Plan->getVectorLoopRegion() && !DefVPBB->getEnclosingLoopRegion(); } + void VPValue::replaceAllUsesWith(VPValue *New) { replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 3931583233bb6..c6608760349e1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1589,9 +1589,7 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { return getOperand(0); } - bool isInvariantCond() const { - return getCond()->isDefinedOutsideLoopRegions(); - } + bool isInvariantCond() const { return getCond()->isDefinedOutsideLoop(); } /// Returns true if the recipe only uses the first lane of operand \p Op. bool onlyFirstLaneUsed(const VPValue *Op) const override { @@ -1604,17 +1602,16 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { /// A recipe for handling GEP instructions. 
class VPWidenGEPRecipe : public VPRecipeWithIRFlags { bool isPointerLoopInvariant() const { - return getOperand(0)->isDefinedOutsideLoopRegions(); + return getOperand(0)->isDefinedOutsideLoop(); } bool isIndexLoopInvariant(unsigned I) const { - return getOperand(I + 1)->isDefinedOutsideLoopRegions(); + return getOperand(I + 1)->isDefinedOutsideLoop(); } bool areAllOperandsInvariant() const { - return all_of(operands(), [](VPValue *Op) { - return Op->isDefinedOutsideLoopRegions(); - }); + return all_of(operands(), + [](VPValue *Op) { return Op->isDefinedOutsideLoop(); }); } public: @@ -3415,9 +3412,6 @@ class VPBasicBlock : public VPBlockBase { /// second predecessor is the exiting block of the region. const VPBasicBlock *getCFGPredecessor(unsigned Idx) const; - /// Returns true if the block is a loop header in a plain-CFG VPlan. - bool isHeader(const VPDominatorTree &VPDT) const; - protected: /// Execute the recipes in the IR basic block \p BB. void executeRecipes(VPTransformState *State, BasicBlock *BB); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d90f5c2f19762..52ce8e72bf88b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -576,8 +576,8 @@ Value *VPInstruction::generate(VPTransformState &State) { case VPInstruction::BranchOnCond: { Value *Cond = State.get(getOperand(0), VPLane(0)); // Replace the temporary unreachable terminator with a new conditional - // branch, hooking it up to backward destination for exiting blocks now and - // to forward destination(s) later when they are created. + // branch, hooking it up to backward destination (header) for latch blocks + // now to forward destination(s) later when they are created. BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); CondBr->setSuccessor(0, nullptr); @@ -600,10 +600,10 @@ Value *VPInstruction::generate(VPTransformState &State) { VPBasicBlock *Header = cast(getParent()->getSuccessors()[1]); // Replace the temporary unreachable terminator with a new conditional - // branch, hooking it up to backward destination (the header) now and to the - // forward destination (the exit/middle block) later when it is created. - // Note that CreateCondBr expects a valid BB as first argument, so we need - // to set it to nullptr later. + // branch, hooking it up to backward destination (the header) for latch + // blocks now forward destination (the exit/middle block) later when it is + // created. Note that CreateCondBr expects a valid BB as first argument, so + // we need to set it to nullptr later. 
BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), State.CFG.VPBB2IRBB[Header]); CondBr->setSuccessor(0, nullptr); @@ -1560,7 +1560,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { SelectInst *SI = cast(getUnderlyingValue()); - bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); + bool ScalarCond = getOperand(0)->isDefinedOutsideLoop(); Type *ScalarTy = Ctx.Types.inferScalarType(this); Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); @@ -1784,7 +1784,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue()); if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && - getOperand(1)->isDefinedOutsideLoopRegions()) + getOperand(1)->isDefinedOutsideLoop()) RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); @@ -2634,13 +2634,12 @@ static void scalarizeInstruction(const Instruction *Instr, if (auto *II = dyn_cast(Cloned)) State.AC->registerAssumption(II); - assert( - (RepRecipe->getParent()->getParent() || - !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || - all_of(RepRecipe->operands(), - [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && - "Expected a recipe is either within a region or all of its operands " - "are defined outside the vectorized region."); + assert((RepRecipe->getParent()->getParent() || + !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || + all_of(RepRecipe->operands(), + [](VPValue *Op) { return Op->isDefinedOutsideLoop(); })) && + "Expected a recipe is either within a region or all of its operands " + "are defined outside the vectorized region."); } void VPReplicateRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e8481d0bf0c27..fb6e2d26e35c2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1623,9 +1623,8 @@ static void licm(VPlan &Plan) { // TODO: Relax checks in the future, e.g. we could also hoist reads, if // their memory location is not modified in the vector loop. if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() || - any_of(R.operands(), [](VPValue *Op) { - return !Op->isDefinedOutsideLoopRegions(); - })) + any_of(R.operands(), + [](VPValue *Op) { return !Op->isDefinedOutsideLoop(); })) continue; R.moveBefore(*Preheader, Preheader->end()); } @@ -2392,8 +2391,7 @@ void VPlanTransforms::createInterleaveGroups( } } -void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, - Type &CanonicalIVTy) { +void VPlanTransforms::disolveLoopRegions(VPlan &Plan) { // Replace loop regions with explicity CFG. SmallVector LoopRegions; for (VPRegionBlock *R : VPBlockUtils::blocksOnly( @@ -2401,19 +2399,16 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, if (!R->isReplicator()) LoopRegions.push_back(R); } - for (VPRegionBlock *R : LoopRegions) { - VPBlockBase *Header = R->getEntry(); - VPBlockBase *Latch = R->getExiting(); + for (VPRegionBlock *R : LoopRegions) R->removeRegion(); - // Add explicit backedge. 
- VPBlockUtils::connectBlocks(Latch, Header); - } +} +void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, + Type &CanonicalIVTy) { using namespace llvm::VPlanPatternMatch; VPTypeAnalysis TypeInfo(&CanonicalIVTy); SmallVector ToRemove; - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_deep(Plan.getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 4b80875b79f13..6f6946e39ce1a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -184,9 +184,11 @@ struct VPlanTransforms { VPRecipeBuilder &RecipeBuilder, VFRange &Range); - /// Lower abstract recipes to concrete ones, that can be codegen'd and replace - /// loop regions with explicit CFG. Use \p CanonicalIVTy as type for all - /// un-typed live-ins in VPTypeAnalysis. + /// Replace loop regions with explicit CFG. + static void disolveLoopRegions(VPlan &Plan); + + /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p + /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index ce83c276297c0..447648018e514 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -246,7 +246,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { } if (auto *RepR = dyn_cast(&R)) { if (isa(RepR->getUnderlyingValue()) && - RepR->getOperand(1)->isDefinedOutsideLoopRegions()) { + RepR->getOperand(1)->isDefinedOutsideLoop()) { // Stores to an invariant address only need to store the last part. remapOperands(&R, UF - 1); return; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 2db4957409c8d..fda0c70aaf5c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "VPlanUtils.h" +#include "VPlanCFG.h" #include "VPlanPatternMatch.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -87,7 +88,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { return true; VPRecipeBase *R = V->getDefiningRecipe(); - if (R && V->isDefinedOutsideLoopRegions()) { + if (R && V->isDefinedOutsideLoop()) { if (match(V->getDefiningRecipe(), m_VPInstruction( m_VPValue()))) @@ -124,3 +125,12 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { return false; }); } + +VPBasicBlock *vputils::getTopLevelVectorLoopHeader(VPlan &Plan, + VPDominatorTree &VPDT) { + auto DepthFirst = vp_depth_first_shallow(Plan.getEntry()); + auto I = find_if(DepthFirst, [&VPDT](VPBlockBase *VPB) { + return VPBlockUtils::isHeader(VPB, VPDT); + }); + return I == DepthFirst.end() ? nullptr : cast(*I); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 67329a6d6953c..f2febb2282b4f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -94,6 +94,8 @@ bool isHeaderMask(const VPValue *V, VPlan &Plan); /// VPDerivedIV or VPCanonicalIVPHI). 
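The new getTopLevelVectorLoopHeader() above walks the shallow depth-first order from the plan's entry and returns the first block satisfying the header predicate, or null when no loop is left. A compilable sketch of that lookup, assuming a toy block type and a precomputed traversal order (both illustrative, not the real vp_depth_first_shallow machinery):

```cpp
#include <algorithm>
#include <vector>

// Toy stand-in for a VPlan block; assume the header predicate has already
// been evaluated and cached.
struct ToyBlock {
  bool isLoopHeader;
};

// Return the first loop header in the given traversal order, or nullptr when
// the plan has no loop remaining.
ToyBlock *findFirstLoopHeader(const std::vector<ToyBlock *> &depthFirstOrder) {
  auto it = std::find_if(depthFirstOrder.begin(), depthFirstOrder.end(),
                         [](const ToyBlock *b) { return b->isLoopHeader; });
  return it == depthFirstOrder.end() ? nullptr : *it;
}

int main() {
  ToyBlock entry{false}, header{true}, latch{false};
  std::vector<ToyBlock *> order{&entry, &header, &latch};
  return findFirstLoopHeader(order) == &header ? 0 : 1;
}
```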
bool isUniformAcrossVFsAndUFs(VPValue *V); +/// Returns the header block of the top-level vector loop, if one exists. +VPBasicBlock *getTopLevelVectorLoopHeader(VPlan &Plan, VPDominatorTree &VPDT); } // namespace vputils //===----------------------------------------------------------------------===// @@ -240,6 +242,9 @@ class VPBlockUtils { VPBlockUtils::connectBlocks(From, BlockPtr, -1, SuccIdx); VPBlockUtils::connectBlocks(BlockPtr, To, PredIx, -1); } + + /// Returns true if \p VPB is a loop header. + static bool isHeader(const VPBlockBase *VPBB, const VPDominatorTree &VPDT); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 638156eab7a84..3e3ce920170e0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -181,8 +181,8 @@ class VPValue { return getUnderlyingValue(); } - /// Returns true if the VPValue is defined outside any loop region. - bool isDefinedOutsideLoopRegions() const; + /// Returns true if the VPValue is defined outside any loop. + bool isDefinedOutsideLoop() const; // Set \p Val as the underlying Value of this VPValue. void setUnderlyingValue(Value *Val) { From 3a064e7c13d0725d77c312fa60d70e86fb32a033 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 14 May 2025 10:17:52 +0100 Subject: [PATCH 3/6] !fixup address latest comments, thanks! --- .../Transforms/Vectorize/LoopVectorize.cpp | 8 ++++--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 23 +++++++++++++------ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +++---- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 3 +-- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 13 +++++++---- 5 files changed, 35 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 31724f0a80106..505f1aad46cff 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2781,7 +2781,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Don't apply optimizations below when no (vector) loop remains, as they all // require one at the moment. VPBasicBlock *HeaderVPBB = - vputils::getTopLevelVectorLoopHeader(*State.Plan, State.VPDT); + vputils::getFirstLoopHeader(*State.Plan, State.VPDT); if (!HeaderVPBB) return; @@ -7801,6 +7801,9 @@ DenseMap LoopVectorizationPlanner::executePlan( TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); + // Retrieve and store the middle block before dissolving regions. Regions are + // dissolved after optimizing for VF and UF, which completely removes unneeded + // loop regions first. VPBasicBlock *MiddleVPBB = BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr; VPlanTransforms::disolveLoopRegions(BestVPlan); @@ -7899,8 +7902,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). 
- VPBasicBlock *HeaderVPBB = - vputils::getTopLevelVectorLoopHeader(BestVPlan, State.VPDT); + VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT); if (HeaderVPBB) { MDNode *OrigLoopID = OrigLoop->getLoopID(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8aa769fdbad63..8ac819a61ee26 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -212,14 +212,24 @@ bool VPBlockUtils::isHeader(const VPBlockBase *VPB, auto *VPBB = dyn_cast(VPB); if (!VPBB) return false; + + // If VPBB is in a region R, VPBB is a loop header if R is a loop region with + // VPBB as its entry, i.e., free of predecessors. if (auto *R = VPBB->getParent()) return !R->isReplicator() && VPBB->getNumPredecessors() == 0; - assert(!VPB->getParent() && "checking blocks in regions not implemented yet"); + // A header dominates its second predecessor (the latch), with the other + // predecessor being the preheader return VPB->getPredecessors().size() == 2 && VPDT.dominates(VPB, VPB->getPredecessors()[1]); } +bool VPBlockUtils::isLatch(const VPBlockBase *VPB, + const VPDominatorTree &VPDT) { + return VPB->getNumSuccessors() == 2 && + VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT); +} + VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { iterator It = begin(); while (It != end() && It->isPhi()) @@ -435,7 +445,7 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { auto Preds = to_vector(getHierarchicalPredecessors()); if (VPBlockUtils::isHeader(this, State.VPDT)) { - // There's no block yet for the latch, don't try to connect it yet. + // There's no block for the latch yet, connect to the preheader only. Preds = {Preds[0]}; } @@ -443,8 +453,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { for (VPBlockBase *PredVPBlock : Preds) { VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); + assert(CFG.VPBB2IRBB.contains(PredVPBB) && + "Predecessor basic-block not found building successor."); BasicBlock *PredBB = CFG.VPBB2IRBB.lookup(PredVPBB); - assert(PredBB && "Predecessor basic-block not found building successor."); auto *PredBBTerminator = PredBB->getTerminator(); LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); @@ -548,8 +559,7 @@ void VPBasicBlock::execute(VPTransformState *State) { executeRecipes(State, NewBB); // If this block is a latch, update CurrentParentLoop. 
- if (getNumSuccessors() == 2 && - VPBlockUtils::isHeader(getSuccessors()[1], State->VPDT)) + if (VPBlockUtils::isLatch(this, State->VPDT)) State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop(); } @@ -988,8 +998,7 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); - VPBasicBlock *Header = - vputils::getTopLevelVectorLoopHeader(*this, State->VPDT); + VPBasicBlock *Header = vputils::getFirstLoopHeader(*this, State->VPDT); if (!Header) return; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 497e3652c7a01..2a22da0365770 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -577,7 +577,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Cond = State.get(getOperand(0), VPLane(0)); // Replace the temporary unreachable terminator with a new conditional // branch, hooking it up to backward destination (header) for latch blocks - // now to forward destination(s) later when they are created. + // now, and to forward destination(s) later when they are created. BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); CondBr->setSuccessor(0, nullptr); @@ -601,9 +601,9 @@ Value *VPInstruction::generate(VPTransformState &State) { // Replace the temporary unreachable terminator with a new conditional // branch, hooking it up to backward destination (the header) for latch - // blocks now forward destination (the exit/middle block) later when it is - // created. Note that CreateCondBr expects a valid BB as first argument, so - // we need to set it to nullptr later. + // blocks now, and to forward destination (the exit/middle block) later when + // it is created. Note that CreateCondBr expects a valid BB as first + // argument, so we need to set it to nullptr later. BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), State.CFG.VPBB2IRBB[Header]); CondBr->setSuccessor(0, nullptr); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index fda0c70aaf5c6..ad49008945ef2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -126,8 +126,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { }); } -VPBasicBlock *vputils::getTopLevelVectorLoopHeader(VPlan &Plan, - VPDominatorTree &VPDT) { +VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) { auto DepthFirst = vp_depth_first_shallow(Plan.getEntry()); auto I = find_if(DepthFirst, [&VPDT](VPBlockBase *VPB) { return VPBlockUtils::isHeader(VPB, VPDT); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index f2febb2282b4f..cb99b64ad6f25 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -94,8 +94,9 @@ bool isHeaderMask(const VPValue *V, VPlan &Plan); /// VPDerivedIV or VPCanonicalIVPHI). bool isUniformAcrossVFsAndUFs(VPValue *V); -/// Returns the header block of the top-level vector loop, if one exists. -VPBasicBlock *getTopLevelVectorLoopHeader(VPlan &Plan, VPDominatorTree &VPDT); +/// Returns the header block of the first, top-level loop, or null if none +/// exist. 
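The isHeader()/isLatch() hunks above rely on a fixed edge ordering (preheader first, latch second as predecessors of a header; exit first, header second as successors of a latch) plus dominance. The following standalone sketch shows the same two predicates on a toy CFG; ToyBlock and the injected dominates callable are assumptions for illustration, not the real VPBlockBase or VPDominatorTree API.

```cpp
#include <functional>
#include <vector>

// Minimal block with explicit predecessor/successor lists.
struct ToyBlock {
  std::vector<ToyBlock *> predecessors;
  std::vector<ToyBlock *> successors;
};

// Dominance would come from a dominator tree built over the same graph; here
// it is injected as a callable for brevity.
using Dominates = std::function<bool(const ToyBlock *, const ToyBlock *)>;

// A header has exactly two predecessors (preheader, latch) and dominates the
// second one, which is reached through the backedge.
bool isHeader(const ToyBlock *b, const Dominates &dominates) {
  return b->predecessors.size() == 2 && dominates(b, b->predecessors[1]);
}

// A latch has two successors and its second successor is a header; the other
// successor leaves the loop.
bool isLatch(const ToyBlock *b, const Dominates &dominates) {
  return b->successors.size() == 2 && isHeader(b->successors[1], dominates);
}

int main() {
  ToyBlock preheader, header, latch, exitBlock;
  preheader.successors = {&header};
  header.predecessors = {&preheader, &latch};
  header.successors = {&latch};
  latch.predecessors = {&header};
  latch.successors = {&exitBlock, &header};
  exitBlock.predecessors = {&latch};

  // In this tiny CFG the header dominates everything inside the loop.
  Dominates dominates = [&](const ToyBlock *a, const ToyBlock *b) {
    return a == &header && (b == &latch || b == &header);
  };
  return isHeader(&header, dominates) && isLatch(&latch, dominates) ? 0 : 1;
}
```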
+VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT); } // namespace vputils //===----------------------------------------------------------------------===// @@ -243,8 +244,12 @@ class VPBlockUtils { VPBlockUtils::connectBlocks(BlockPtr, To, PredIx, -1); } - /// Returns true if \p VPB is a loop header. - static bool isHeader(const VPBlockBase *VPBB, const VPDominatorTree &VPDT); + /// Returns true if \p VPB is a loop header, based on regions or \p VPDT in + /// their absence. + static bool isHeader(const VPBlockBase *VPB, const VPDominatorTree &VPDT); + + /// Returns true if \p VPB is a loop latch, using isHeader(). + static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT); }; } // namespace llvm From 9cae5f728f2fd2e1e8a484ddd136af269db43f94 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 14 May 2025 13:51:48 +0100 Subject: [PATCH 4/6] !fixup address latest comments, thanks --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 24 +++++---- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 50 ++++++++----------- 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8ac819a61ee26..58aff840a03fb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -226,6 +226,9 @@ bool VPBlockUtils::isHeader(const VPBlockBase *VPB, bool VPBlockUtils::isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT) { + // A latch has a header as its second successor, with its other successor + // leaving the loop. A preheader OTOH has a header as its first (and only) + // successor. return VPB->getNumSuccessors() == 2 && VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT); } @@ -455,7 +458,7 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); assert(CFG.VPBB2IRBB.contains(PredVPBB) && "Predecessor basic-block not found building successor."); - BasicBlock *PredBB = CFG.VPBB2IRBB.lookup(PredVPBB); + BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; auto *PredBBTerminator = PredBB->getTerminator(); LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); @@ -1406,18 +1409,17 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { #endif -bool VPValue::isDefinedOutsideLoop() const { - auto *DefR = getDefiningRecipe(); - if (!DefR) - return true; - - // For non-live-ins, check if is in a region only if the top-level loop region - // still exits. - const VPBasicBlock *DefVPBB = DefR->getParent(); - auto *Plan = DefVPBB->getPlan(); - return Plan->getVectorLoopRegion() && !DefVPBB->getEnclosingLoopRegion(); +/// Returns true if there is a vector loop region and \p VPV is defined in a +/// loop region. 
+static bool isDefinedInsideLoopRegions(const VPValue *VPV) { + const VPRecipeBase *DefR = VPV->getDefiningRecipe(); + return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() || + DefR->getParent()->getEnclosingLoopRegion()); } +bool VPValue::isDefinedOutsideLoop() const { + return !isDefinedInsideLoopRegions(this); +} void VPValue::replaceAllUsesWith(VPValue *New) { replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2a22da0365770..106d0ec8c65b1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -456,6 +456,26 @@ Value *VPInstruction::generatePerLane(VPTransformState &State, State.get(getOperand(1), Lane), Name); } +/// Create a conditional branch using \p Cond branching to the successors of \p +/// VPBB. Note that the first successor is always forward (i.e. not created yet) +/// while the second successor may already have been created (if it is a header +/// block and VPBB is a header). +static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, + VPTransformState &State) { + // Replace the temporary unreachable terminator with a new conditional + // branch, hooking it up to backward destination (header) for latch blocks + // now, and to forward destination(s) later when they are created. + // Second successor may be backwards - iff it is already in VPBB2IRBB. + VPBasicBlock *SecondVPSucc = cast(VPBB->getSuccessors()[1]); + BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc); + BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; + BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc); + // First successor is always forward, reset it to nullptr + CondBr->setSuccessor(0, nullptr); + IRBB->getTerminator()->eraseFromParent(); + return CondBr; +} + Value *VPInstruction::generate(VPTransformState &State) { IRBuilderBase &Builder = State.Builder; @@ -575,40 +595,14 @@ Value *VPInstruction::generate(VPTransformState &State) { } case VPInstruction::BranchOnCond: { Value *Cond = State.get(getOperand(0), VPLane(0)); - // Replace the temporary unreachable terminator with a new conditional - // branch, hooking it up to backward destination (header) for latch blocks - // now, and to forward destination(s) later when they are created. - BranchInst *CondBr = - Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); - CondBr->setSuccessor(0, nullptr); - Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); - - VPBasicBlock *Header = cast(getParent()->getSuccessors()[1]); - if (!State.CFG.VPBB2IRBB.contains(Header)) - return CondBr; - - CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); - return CondBr; + return createCondBranch(Cond, getParent(), State); } case VPInstruction::BranchOnCount: { // First create the compare. Value *IV = State.get(getOperand(0), /*IsScalar*/ true); Value *TC = State.get(getOperand(1), /*IsScalar*/ true); Value *Cond = Builder.CreateICmpEQ(IV, TC); - - // Now create the branch. - VPBasicBlock *Header = cast(getParent()->getSuccessors()[1]); - - // Replace the temporary unreachable terminator with a new conditional - // branch, hooking it up to backward destination (the header) for latch - // blocks now, and to forward destination (the exit/middle block) later when - // it is created. Note that CreateCondBr expects a valid BB as first - // argument, so we need to set it to nullptr later. 
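The isDefinedInsideLoopRegions() helper introduced above treats live-ins (no defining recipe) as outside, and is deliberately conservative once the plan no longer has a vector loop region. A minimal sketch of that shape with toy stand-ins (ToyPlan, ToyBlock, ToyValue are illustrative, not the real classes):

```cpp
// Toy stand-ins mirroring the negated helper above.
struct ToyRegion;
struct ToyBlock {
  ToyRegion *enclosingLoopRegion = nullptr;
};
struct ToyPlan {
  ToyRegion *vectorLoopRegion = nullptr;
};
struct ToyValue {
  ToyBlock *definingBlock = nullptr; // null for live-ins (no defining recipe)
  const ToyPlan *plan = nullptr;
};

// A value counts as defined inside a loop region when it has a defining
// recipe and either its block sits in a loop region, or the plan no longer
// has a vector loop region at all (the conservative answer once regions have
// been dissolved).
static bool isDefinedInsideLoopRegions(const ToyValue &v) {
  return v.definingBlock &&
         (!v.plan->vectorLoopRegion || v.definingBlock->enclosingLoopRegion);
}

bool isDefinedOutsideLoopRegions(const ToyValue &v) {
  return !isDefinedInsideLoopRegions(v);
}

int main() {
  ToyPlan plan; // no vector loop region left
  ToyBlock preheaderBlock;
  ToyValue liveIn{nullptr, &plan};
  ToyValue definedInPreheader{&preheaderBlock, &plan};
  // Only live-ins are treated as defined outside loop regions here.
  return isDefinedOutsideLoopRegions(liveIn) &&
                 !isDefinedOutsideLoopRegions(definedInPreheader)
             ? 0
             : 1;
}
```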
- BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), - State.CFG.VPBB2IRBB[Header]); - CondBr->setSuccessor(0, nullptr); - Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); - return CondBr; + return createCondBranch(Cond, getParent(), State); } case VPInstruction::Broadcast: { return Builder.CreateVectorSplat( From 6c169e04f30c564771feee083faa71ce52f8b28d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 17 May 2025 22:48:53 +0100 Subject: [PATCH 5/6] !fixup use ::isHeader in VPlan verifier. --- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 68b35d42e8674..54cf8ac2ed04a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -73,9 +73,7 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) { auto RecipeI = VPBB->begin(); auto End = VPBB->end(); unsigned NumActiveLaneMaskPhiRecipes = 0; - const VPRegionBlock *ParentR = VPBB->getParent(); - bool IsHeaderVPBB = ParentR && !ParentR->isReplicator() && - ParentR->getEntryBasicBlock() == VPBB; + bool IsHeaderVPBB = VPBlockUtils::isHeader(VPBB, VPDT); while (RecipeI != End && RecipeI->isPhi()) { if (isa(RecipeI)) NumActiveLaneMaskPhiRecipes++; From b4670f6e1f2fbca31aae6f4a50e0a796e70d6512 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 24 May 2025 11:41:12 +0100 Subject: [PATCH 6/6] !fixup address latest comments, thanks --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 19 +++++++++++-------- llvm/lib/Transforms/Vectorize/VPlan.h | 17 ++++++++++------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 ++++++++++--------- .../Transforms/Vectorize/VPlanTransforms.cpp | 7 ++++--- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 2 +- 7 files changed, 38 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 336d01377d7bd..165b57c87beb1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -450,10 +450,12 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { if (ParentLoop && !State.LI->getLoopFor(NewBB)) ParentLoop->addBasicBlockToLoop(NewBB, *State.LI); - auto Preds = to_vector(getHierarchicalPredecessors()); + SmallVector Preds; if (VPBlockUtils::isHeader(this, State.VPDT)) { // There's no block for the latch yet, connect to the preheader only. - Preds = {Preds[0]}; + Preds = {getPredecessors()[0]}; + } else { + Preds = to_vector(getPredecessors()); } // Hook up the new basic block to its predecessors. 
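Both BranchOnCond and BranchOnCount now go through the createCondBranch() helper above. Its two-phase wiring is the key point: the forward successor is left null until that block is emitted, while the second successor is connected immediately only if it already has an IR block, which is the backedge to the header when the current block is a latch. The sketch below models that with toy types (ToyVPBlock, ToyIRBlock, emitCondBranch are illustrative names, not the VPlan or IRBuilder API).

```cpp
#include <unordered_map>
#include <vector>

// Toy stand-ins: a VPlan-side block with its successor list, and a generated
// "IR" block whose terminator records up to two successors.
struct ToyIRBlock {
  ToyIRBlock *succ[2] = {nullptr, nullptr};
};
struct ToyVPBlock {
  std::vector<ToyVPBlock *> successors;
};

// Emit the conditional branch for vpbb. The first successor is always a
// forward edge whose IR block does not exist yet, so it stays null and is
// patched when that block is created. The second successor is wired now only
// if it was already emitted, i.e. the map lookup succeeds.
ToyIRBlock *
emitCondBranch(ToyVPBlock *vpbb,
               std::unordered_map<ToyVPBlock *, ToyIRBlock *> &vpbbToIRBB) {
  ToyIRBlock *irbb = vpbbToIRBB.at(vpbb);
  ToyVPBlock *secondVPSucc = vpbb->successors[1];
  auto it = vpbbToIRBB.find(secondVPSucc);
  irbb->succ[0] = nullptr;                                        // forward edge
  irbb->succ[1] = it == vpbbToIRBB.end() ? nullptr : it->second;  // maybe backedge
  return irbb;
}

int main() {
  ToyVPBlock headerVP, latchVP, middleVP;
  latchVP.successors = {&middleVP, &headerVP};
  ToyIRBlock headerIR, latchIR;
  std::unordered_map<ToyVPBlock *, ToyIRBlock *> vpbbToIRBB{
      {&headerVP, &headerIR}, {&latchVP, &latchIR}};
  // The middle block has not been emitted yet, so only the backedge to the
  // header gets wired now.
  ToyIRBlock *br = emitCondBranch(&latchVP, vpbbToIRBB);
  return br->succ[0] == nullptr && br->succ[1] == &headerIR ? 0 : 1;
}
```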
@@ -879,11 +881,10 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif -void VPRegionBlock::removeRegion() { +void VPRegionBlock::dissolveToCFGLoop() { auto *Header = cast(getEntry()); VPBlockBase *Preheader = getSinglePredecessor(); - auto *Exiting = cast(getExiting()); - + auto *ExitingLatch = cast(getExiting()); VPBlockBase *Middle = getSingleSuccessor(); VPBlockUtils::disconnectBlocks(Preheader, this); VPBlockUtils::disconnectBlocks(this, Middle); @@ -892,8 +893,8 @@ void VPRegionBlock::removeRegion() { VPB->setParent(getParent()); VPBlockUtils::connectBlocks(Preheader, Header); - VPBlockUtils::connectBlocks(Exiting, Middle); - VPBlockUtils::connectBlocks(Exiting, Header); + VPBlockUtils::connectBlocks(ExitingLatch, Middle); + VPBlockUtils::connectBlocks(ExitingLatch, Header); } VPlan::VPlan(Loop *L) { @@ -1005,6 +1006,8 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); + State->CFG.DTU.flush(); + VPBasicBlock *Header = vputils::getFirstLoopHeader(*this, State->VPDT); if (!Header) return; @@ -1427,7 +1430,7 @@ static bool isDefinedInsideLoopRegions(const VPValue *VPV) { DefR->getParent()->getEnclosingLoopRegion()); } -bool VPValue::isDefinedOutsideLoop() const { +bool VPValue::isDefinedOutsideLoopRegions() const { return !isDefinedInsideLoopRegions(this); } void VPValue::replaceAllUsesWith(VPValue *New) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 88ab9753802e7..d85e8d2dbe4f5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1631,7 +1631,9 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { return getOperand(0); } - bool isInvariantCond() const { return getCond()->isDefinedOutsideLoop(); } + bool isInvariantCond() const { + return getCond()->isDefinedOutsideLoopRegions(); + } /// Returns true if the recipe only uses the first lane of operand \p Op. bool onlyFirstLaneUsed(const VPValue *Op) const override { @@ -1644,16 +1646,17 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { /// A recipe for handling GEP instructions. class VPWidenGEPRecipe : public VPRecipeWithIRFlags { bool isPointerLoopInvariant() const { - return getOperand(0)->isDefinedOutsideLoop(); + return getOperand(0)->isDefinedOutsideLoopRegions(); } bool isIndexLoopInvariant(unsigned I) const { - return getOperand(I + 1)->isDefinedOutsideLoop(); + return getOperand(I + 1)->isDefinedOutsideLoopRegions(); } bool areAllOperandsInvariant() const { - return all_of(operands(), - [](VPValue *Op) { return Op->isDefinedOutsideLoop(); }); + return all_of(operands(), [](VPValue *Op) { + return Op->isDefinedOutsideLoopRegions(); + }); } public: @@ -3868,8 +3871,8 @@ class VPRegionBlock : public VPBlockBase { VPRegionBlock *clone() override; /// Remove the current region from its VPlan, connecting its predecessor to - /// its entry and exiting block to its successor. - void removeRegion(); + /// its entry, and its exiting block to its successor. 
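The dissolveToCFGLoop() hunk above replaces a single-entry/single-exit loop region with explicit edges: preheader to header, exiting latch to the middle block, and exiting latch back to the header. A standalone sketch of that rewiring on a toy graph (Node, connect, disconnect, dissolveRegion are illustrative stand-ins, not the VPBlockUtils API):

```cpp
#include <algorithm>
#include <vector>

// Toy CFG node with explicit successor and predecessor lists.
struct Node {
  std::vector<Node *> succs;
  std::vector<Node *> preds;
};

void connect(Node *from, Node *to) {
  from->succs.push_back(to);
  to->preds.push_back(from);
}

void disconnect(Node *from, Node *to) {
  from->succs.erase(std::remove(from->succs.begin(), from->succs.end(), to),
                    from->succs.end());
  to->preds.erase(std::remove(to->preds.begin(), to->preds.end(), from),
                  to->preds.end());
}

// Replace the region node with explicit edges: preheader -> header, exiting
// latch -> middle block, and exiting latch -> header (the backedge).
void dissolveRegion(Node *region, Node *header, Node *exitingLatch) {
  Node *preheader = region->preds.front();
  Node *middle = region->succs.front();
  disconnect(preheader, region);
  disconnect(region, middle);
  connect(preheader, header);
  connect(exitingLatch, middle);
  connect(exitingLatch, header); // explicit backedge
}

int main() {
  Node preheader, region, header, exitingLatch, middle;
  connect(&preheader, &region);
  connect(&region, &middle);
  dissolveRegion(&region, &header, &exitingLatch);
  return preheader.succs.front() == &header &&
                 exitingLatch.succs.back() == &header
             ? 0
             : 1;
}
```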
+ void dissolveToCFGLoop(); }; /// VPlan models a candidate for vectorization, encoding various decisions take diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e2da6eb1157c9..5c2ddb62c7155 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -465,7 +465,7 @@ Value *VPInstruction::generatePerLane(VPTransformState &State, /// Create a conditional branch using \p Cond branching to the successors of \p /// VPBB. Note that the first successor is always forward (i.e. not created yet) /// while the second successor may already have been created (if it is a header -/// block and VPBB is a header). +/// block and VPBB is a latch). static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, VPTransformState &State) { // Replace the temporary unreachable terminator with a new conditional @@ -1557,7 +1557,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { SelectInst *SI = cast(getUnderlyingValue()); - bool ScalarCond = getOperand(0)->isDefinedOutsideLoop(); + bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); Type *ScalarTy = Ctx.Types.inferScalarType(this); Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); @@ -1778,7 +1778,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS); if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && - getOperand(1)->isDefinedOutsideLoop()) + getOperand(1)->isDefinedOutsideLoopRegions()) RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); @@ -2677,12 +2677,13 @@ static void scalarizeInstruction(const Instruction *Instr, if (auto *II = dyn_cast(Cloned)) State.AC->registerAssumption(II); - assert((RepRecipe->getParent()->getParent() || - !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || - all_of(RepRecipe->operands(), - [](VPValue *Op) { return Op->isDefinedOutsideLoop(); })) && - "Expected a recipe is either within a region or all of its operands " - "are defined outside the vectorized region."); + assert( + (RepRecipe->getParent()->getParent() || + !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || + all_of(RepRecipe->operands(), + [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && + "Expected a recipe is either within a region or all of its operands " + "are defined outside the vectorized region."); } void VPReplicateRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 37b78530ef3a0..52d61d96c8083 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1736,8 +1736,9 @@ static void licm(VPlan &Plan) { // TODO: Relax checks in the future, e.g. we could also hoist reads, if // their memory location is not modified in the vector loop. 
if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() || - any_of(R.operands(), - [](VPValue *Op) { return !Op->isDefinedOutsideLoop(); })) + any_of(R.operands(), [](VPValue *Op) { + return !Op->isDefinedOutsideLoopRegions(); + })) continue; R.moveBefore(*Preheader, Preheader->end()); } @@ -2514,7 +2515,7 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { LoopRegions.push_back(R); } for (VPRegionBlock *R : LoopRegions) - R->removeRegion(); + R->dissolveToCFGLoop(); } // Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe. diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 447648018e514..ce83c276297c0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -246,7 +246,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { } if (auto *RepR = dyn_cast(&R)) { if (isa(RepR->getUnderlyingValue()) && - RepR->getOperand(1)->isDefinedOutsideLoop()) { + RepR->getOperand(1)->isDefinedOutsideLoopRegions()) { // Stores to an invariant address only need to store the last part. remapOperands(&R, UF - 1); return; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 09cda7099b1ef..81bd21bb904c0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -90,7 +90,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { return true; VPRecipeBase *R = V->getDefiningRecipe(); - if (R && V->isDefinedOutsideLoop()) { + if (R && V->isDefinedOutsideLoopRegions()) { if (match(V->getDefiningRecipe(), m_VPInstruction( m_VPValue()))) diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 4cda711dfcf26..6cc792627f60d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -182,7 +182,7 @@ class VPValue { } /// Returns true if the VPValue is defined outside any loop. - bool isDefinedOutsideLoop() const; + bool isDefinedOutsideLoopRegions() const; // Set \p Val as the underlying Value of this VPValue. void setUnderlyingValue(Value *Val) {
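One detail worth spelling out in dissolveLoopRegions() above is the collect-then-mutate pattern: all non-replicator regions are gathered first, and only then dissolved, so the rewiring cannot invalidate the traversal that found them. A minimal sketch of that pattern with toy types (ToyRegion and the dissolved flag are illustrative, standing in for the real dissolveToCFGLoop() call):

```cpp
#include <vector>

// Toy stand-in for a region; the real code calls dissolveToCFGLoop() instead
// of flipping a flag.
struct ToyRegion {
  bool isReplicator;
  bool dissolved = false;
};

// Collect all loop (non-replicator) regions before mutating anything, then
// dissolve each one.
void dissolveLoopRegions(const std::vector<ToyRegion *> &regionsInTraversalOrder) {
  std::vector<ToyRegion *> loopRegions;
  for (ToyRegion *r : regionsInTraversalOrder)
    if (!r->isReplicator)
      loopRegions.push_back(r);
  for (ToyRegion *r : loopRegions)
    r->dissolved = true; // stand-in for r->dissolveToCFGLoop()
}

int main() {
  ToyRegion loop{false}, replicate{true};
  std::vector<ToyRegion *> regions{&loop, &replicate};
  dissolveLoopRegions(regions);
  return loop.dissolved && !replicate.dissolved ? 0 : 1;
}
```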